diff --git a/.DS_Store b/.DS_Store new file mode 100644 index 000000000..1c6f7ad5f Binary files /dev/null and b/.DS_Store differ diff --git a/.gitignore b/.gitignore new file mode 100644 index 000000000..5b6a06525 --- /dev/null +++ b/.gitignore @@ -0,0 +1,4 @@ +.Rproj.user +.Rhistory +.RData +.Ruserdata diff --git a/BPRSL.csv b/BPRSL.csv new file mode 100644 index 000000000..da407b548 --- /dev/null +++ b/BPRSL.csv @@ -0,0 +1,361 @@ +"","treatment","subject","weeks","bprs","week" +"1","1","1","week0",42,0 +"2","1","2","week0",58,0 +"3","1","3","week0",54,0 +"4","1","4","week0",55,0 +"5","1","5","week0",72,0 +"6","1","6","week0",48,0 +"7","1","7","week0",71,0 +"8","1","8","week0",30,0 +"9","1","9","week0",41,0 +"10","1","10","week0",57,0 +"11","1","11","week0",30,0 +"12","1","12","week0",55,0 +"13","1","13","week0",36,0 +"14","1","14","week0",38,0 +"15","1","15","week0",66,0 +"16","1","16","week0",41,0 +"17","1","17","week0",45,0 +"18","1","18","week0",39,0 +"19","1","19","week0",24,0 +"20","1","20","week0",38,0 +"21","2","1","week0",52,0 +"22","2","2","week0",30,0 +"23","2","3","week0",65,0 +"24","2","4","week0",37,0 +"25","2","5","week0",59,0 +"26","2","6","week0",30,0 +"27","2","7","week0",69,0 +"28","2","8","week0",62,0 +"29","2","9","week0",38,0 +"30","2","10","week0",65,0 +"31","2","11","week0",78,0 +"32","2","12","week0",38,0 +"33","2","13","week0",63,0 +"34","2","14","week0",40,0 +"35","2","15","week0",40,0 +"36","2","16","week0",54,0 +"37","2","17","week0",33,0 +"38","2","18","week0",28,0 +"39","2","19","week0",52,0 +"40","2","20","week0",47,0 +"41","1","1","week1",36,1 +"42","1","2","week1",68,1 +"43","1","3","week1",55,1 +"44","1","4","week1",77,1 +"45","1","5","week1",75,1 +"46","1","6","week1",43,1 +"47","1","7","week1",61,1 +"48","1","8","week1",36,1 +"49","1","9","week1",43,1 +"50","1","10","week1",51,1 +"51","1","11","week1",34,1 +"52","1","12","week1",52,1 +"53","1","13","week1",32,1 +"54","1","14","week1",35,1 +"55","1","15","week1",68,1 
+"56","1","16","week1",35,1 +"57","1","17","week1",38,1 +"58","1","18","week1",35,1 +"59","1","19","week1",28,1 +"60","1","20","week1",34,1 +"61","2","1","week1",73,1 +"62","2","2","week1",23,1 +"63","2","3","week1",31,1 +"64","2","4","week1",31,1 +"65","2","5","week1",67,1 +"66","2","6","week1",33,1 +"67","2","7","week1",52,1 +"68","2","8","week1",54,1 +"69","2","9","week1",40,1 +"70","2","10","week1",44,1 +"71","2","11","week1",95,1 +"72","2","12","week1",41,1 +"73","2","13","week1",65,1 +"74","2","14","week1",37,1 +"75","2","15","week1",36,1 +"76","2","16","week1",45,1 +"77","2","17","week1",41,1 +"78","2","18","week1",30,1 +"79","2","19","week1",43,1 +"80","2","20","week1",36,1 +"81","1","1","week2",36,2 +"82","1","2","week2",61,2 +"83","1","3","week2",41,2 +"84","1","4","week2",49,2 +"85","1","5","week2",72,2 +"86","1","6","week2",41,2 +"87","1","7","week2",47,2 +"88","1","8","week2",38,2 +"89","1","9","week2",39,2 +"90","1","10","week2",51,2 +"91","1","11","week2",34,2 +"92","1","12","week2",49,2 +"93","1","13","week2",36,2 +"94","1","14","week2",36,2 +"95","1","15","week2",65,2 +"96","1","16","week2",45,2 +"97","1","17","week2",46,2 +"98","1","18","week2",27,2 +"99","1","19","week2",31,2 +"100","1","20","week2",27,2 +"101","2","1","week2",42,2 +"102","2","2","week2",32,2 +"103","2","3","week2",33,2 +"104","2","4","week2",27,2 +"105","2","5","week2",58,2 +"106","2","6","week2",37,2 +"107","2","7","week2",41,2 +"108","2","8","week2",49,2 +"109","2","9","week2",38,2 +"110","2","10","week2",31,2 +"111","2","11","week2",75,2 +"112","2","12","week2",36,2 +"113","2","13","week2",60,2 +"114","2","14","week2",31,2 +"115","2","15","week2",55,2 +"116","2","16","week2",35,2 +"117","2","17","week2",30,2 +"118","2","18","week2",29,2 +"119","2","19","week2",26,2 +"120","2","20","week2",32,2 +"121","1","1","week3",43,3 +"122","1","2","week3",55,3 +"123","1","3","week3",38,3 +"124","1","4","week3",54,3 +"125","1","5","week3",65,3 +"126","1","6","week3",38,3 
+"127","1","7","week3",30,3 +"128","1","8","week3",38,3 +"129","1","9","week3",35,3 +"130","1","10","week3",55,3 +"131","1","11","week3",41,3 +"132","1","12","week3",54,3 +"133","1","13","week3",31,3 +"134","1","14","week3",34,3 +"135","1","15","week3",49,3 +"136","1","16","week3",42,3 +"137","1","17","week3",38,3 +"138","1","18","week3",25,3 +"139","1","19","week3",28,3 +"140","1","20","week3",25,3 +"141","2","1","week3",41,3 +"142","2","2","week3",24,3 +"143","2","3","week3",28,3 +"144","2","4","week3",31,3 +"145","2","5","week3",61,3 +"146","2","6","week3",33,3 +"147","2","7","week3",33,3 +"148","2","8","week3",39,3 +"149","2","9","week3",27,3 +"150","2","10","week3",34,3 +"151","2","11","week3",76,3 +"152","2","12","week3",27,3 +"153","2","13","week3",53,3 +"154","2","14","week3",38,3 +"155","2","15","week3",55,3 +"156","2","16","week3",27,3 +"157","2","17","week3",32,3 +"158","2","18","week3",33,3 +"159","2","19","week3",27,3 +"160","2","20","week3",29,3 +"161","1","1","week4",41,4 +"162","1","2","week4",43,4 +"163","1","3","week4",43,4 +"164","1","4","week4",56,4 +"165","1","5","week4",50,4 +"166","1","6","week4",36,4 +"167","1","7","week4",27,4 +"168","1","8","week4",31,4 +"169","1","9","week4",28,4 +"170","1","10","week4",53,4 +"171","1","11","week4",36,4 +"172","1","12","week4",48,4 +"173","1","13","week4",25,4 +"174","1","14","week4",25,4 +"175","1","15","week4",36,4 +"176","1","16","week4",31,4 +"177","1","17","week4",40,4 +"178","1","18","week4",29,4 +"179","1","19","week4",29,4 +"180","1","20","week4",25,4 +"181","2","1","week4",39,4 +"182","2","2","week4",20,4 +"183","2","3","week4",22,4 +"184","2","4","week4",31,4 +"185","2","5","week4",49,4 +"186","2","6","week4",28,4 +"187","2","7","week4",34,4 +"188","2","8","week4",55,4 +"189","2","9","week4",31,4 +"190","2","10","week4",39,4 +"191","2","11","week4",66,4 +"192","2","12","week4",29,4 +"193","2","13","week4",52,4 +"194","2","14","week4",35,4 +"195","2","15","week4",42,4 +"196","2","16","week4",25,4 
+"197","2","17","week4",46,4 +"198","2","18","week4",30,4 +"199","2","19","week4",24,4 +"200","2","20","week4",25,4 +"201","1","1","week5",40,5 +"202","1","2","week5",34,5 +"203","1","3","week5",28,5 +"204","1","4","week5",50,5 +"205","1","5","week5",39,5 +"206","1","6","week5",29,5 +"207","1","7","week5",40,5 +"208","1","8","week5",26,5 +"209","1","9","week5",22,5 +"210","1","10","week5",43,5 +"211","1","11","week5",36,5 +"212","1","12","week5",43,5 +"213","1","13","week5",25,5 +"214","1","14","week5",27,5 +"215","1","15","week5",32,5 +"216","1","16","week5",31,5 +"217","1","17","week5",33,5 +"218","1","18","week5",28,5 +"219","1","19","week5",21,5 +"220","1","20","week5",27,5 +"221","2","1","week5",38,5 +"222","2","2","week5",20,5 +"223","2","3","week5",25,5 +"224","2","4","week5",26,5 +"225","2","5","week5",38,5 +"226","2","6","week5",26,5 +"227","2","7","week5",37,5 +"228","2","8","week5",51,5 +"229","2","9","week5",24,5 +"230","2","10","week5",34,5 +"231","2","11","week5",64,5 +"232","2","12","week5",27,5 +"233","2","13","week5",32,5 +"234","2","14","week5",30,5 +"235","2","15","week5",30,5 +"236","2","16","week5",22,5 +"237","2","17","week5",43,5 +"238","2","18","week5",26,5 +"239","2","19","week5",32,5 +"240","2","20","week5",23,5 +"241","1","1","week6",38,6 +"242","1","2","week6",28,6 +"243","1","3","week6",29,6 +"244","1","4","week6",47,6 +"245","1","5","week6",32,6 +"246","1","6","week6",33,6 +"247","1","7","week6",30,6 +"248","1","8","week6",26,6 +"249","1","9","week6",20,6 +"250","1","10","week6",43,6 +"251","1","11","week6",38,6 +"252","1","12","week6",37,6 +"253","1","13","week6",21,6 +"254","1","14","week6",25,6 +"255","1","15","week6",27,6 +"256","1","16","week6",29,6 +"257","1","17","week6",27,6 +"258","1","18","week6",21,6 +"259","1","19","week6",22,6 +"260","1","20","week6",21,6 +"261","2","1","week6",43,6 +"262","2","2","week6",19,6 +"263","2","3","week6",24,6 +"264","2","4","week6",24,6 +"265","2","5","week6",37,6 +"266","2","6","week6",27,6 
+"267","2","7","week6",37,6 +"268","2","8","week6",55,6 +"269","2","9","week6",22,6 +"270","2","10","week6",41,6 +"271","2","11","week6",64,6 +"272","2","12","week6",21,6 +"273","2","13","week6",37,6 +"274","2","14","week6",33,6 +"275","2","15","week6",26,6 +"276","2","16","week6",22,6 +"277","2","17","week6",43,6 +"278","2","18","week6",36,6 +"279","2","19","week6",21,6 +"280","2","20","week6",23,6 +"281","1","1","week7",47,7 +"282","1","2","week7",28,7 +"283","1","3","week7",25,7 +"284","1","4","week7",42,7 +"285","1","5","week7",38,7 +"286","1","6","week7",27,7 +"287","1","7","week7",31,7 +"288","1","8","week7",25,7 +"289","1","9","week7",23,7 +"290","1","10","week7",39,7 +"291","1","11","week7",36,7 +"292","1","12","week7",36,7 +"293","1","13","week7",19,7 +"294","1","14","week7",26,7 +"295","1","15","week7",30,7 +"296","1","16","week7",26,7 +"297","1","17","week7",31,7 +"298","1","18","week7",25,7 +"299","1","19","week7",23,7 +"300","1","20","week7",19,7 +"301","2","1","week7",62,7 +"302","2","2","week7",18,7 +"303","2","3","week7",31,7 +"304","2","4","week7",26,7 +"305","2","5","week7",36,7 +"306","2","6","week7",23,7 +"307","2","7","week7",38,7 +"308","2","8","week7",59,7 +"309","2","9","week7",21,7 +"310","2","10","week7",42,7 +"311","2","11","week7",60,7 +"312","2","12","week7",22,7 +"313","2","13","week7",52,7 +"314","2","14","week7",30,7 +"315","2","15","week7",30,7 +"316","2","16","week7",22,7 +"317","2","17","week7",43,7 +"318","2","18","week7",33,7 +"319","2","19","week7",21,7 +"320","2","20","week7",23,7 +"321","1","1","week8",51,8 +"322","1","2","week8",28,8 +"323","1","3","week8",24,8 +"324","1","4","week8",46,8 +"325","1","5","week8",32,8 +"326","1","6","week8",25,8 +"327","1","7","week8",31,8 +"328","1","8","week8",24,8 +"329","1","9","week8",21,8 +"330","1","10","week8",32,8 +"331","1","11","week8",36,8 +"332","1","12","week8",31,8 +"333","1","13","week8",22,8 +"334","1","14","week8",26,8 +"335","1","15","week8",37,8 +"336","1","16","week8",30,8 
+"337","1","17","week8",27,8 +"338","1","18","week8",20,8 +"339","1","19","week8",22,8 +"340","1","20","week8",21,8 +"341","2","1","week8",50,8 +"342","2","2","week8",20,8 +"343","2","3","week8",32,8 +"344","2","4","week8",23,8 +"345","2","5","week8",35,8 +"346","2","6","week8",21,8 +"347","2","7","week8",35,8 +"348","2","8","week8",66,8 +"349","2","9","week8",21,8 +"350","2","10","week8",39,8 +"351","2","11","week8",75,8 +"352","2","12","week8",23,8 +"353","2","13","week8",28,8 +"354","2","14","week8",27,8 +"355","2","15","week8",37,8 +"356","2","16","week8",22,8 +"357","2","17","week8",43,8 +"358","2","18","week8",30,8 +"359","2","19","week8",21,8 +"360","2","20","week8",23,8 diff --git a/Exercises/.DS_Store b/Exercises/.DS_Store new file mode 100644 index 000000000..5c0572c2d Binary files /dev/null and b/Exercises/.DS_Store differ diff --git a/Exercises/Exercise1.Rmd b/Exercises/Exercise1.Rmd new file mode 100644 index 000000000..7464570f1 --- /dev/null +++ b/Exercises/Exercise1.Rmd @@ -0,0 +1,2062 @@ +--- +title: "**Introduction to Open Data Science, Exercise set 1**" + +subtitle: "**R basics using the RHDS book**" + +output: + html_document: + theme: flatly + highlight: haddock + toc: true + toc_depth: 2 + number_section: false +--- + +This R Markdown sheet includes a quite large set of R code chunks taken from the RHDS book **R for Health Data Science** by Ewen Harrison and Riinu Pius (CRC Press, 2021), available online at . The structure follows the structure of the book (chapters 2, 3, 4, 5, 6, and 8 and their sections and subsections). + +The goal is to get familiar with R and R Markdown (in a quite quick pace) using typical R functions for data wrangling, visualization, and analysis. You do not need to do very much (yet), mostly just read or browse the book while following and exploring the work flow in RStudio, activating the R functions and studying the output (results, statistics, graphics etc.) 
that will be created (including some tiny errors that are included on purpose). There are also some small exercises, where you are encouraged to write a bit of R code yourself (based on the hints given in the book). You may well skip those exercises. + +The rest of the Exercise sets on this IODS (Introduction to Open Data Science) course have a different, more interactive structure. They have their own logic and detailed instructions, as you will see from the next week onwards. + +------------------------------------------------------------------------ + +# Chapter 1: Why we love R + +You should begin by reading (or browsing through) this chapter: + + + +**Note1:** On IODS course, we focus on writing the steps of the analysis in R Markdown, but we also use R scripts. Both ways of working are included in the RHDS book. Here, our focus is on the R Markdown (R scripts are a bit simpler). + +**Note2:** We have already copy-pasted the R codes for you in this R Markdown sheet, beginning from Chapter 2. We have also included the example datasets of RHDS book through our GitHub repository, so all the datasets of the book are easily accessible on the fly, without a need to download them separately. + +The following is the first example of an R code chunk, where you can activate R functions. Its contents have been copy-pasted from Section 1.7. Try now to activate the R "command" ("1001:1004") by putting the cursor on that line below, and pressing Ctrl+Enter (on Windows) or Cmd+Enter (on Mac)! + +```{r} +# Now we're printing bigger numbers: +1001:1004 + +``` + +As you see above, the results (in this case, the numbers 1001, 1002, 1003, and 1004) appear immediately below the R code chunk (preceded by the usual [1], which you will also get used to, very soon.) + +OK, as soon as you have read Chapter 1, you are ready to move on, to Chapter 2! 
+ +------------------------------------------------------------------------ + +# Chapter 2: R basics + +Again, the idea is that you read (or at least browse) the contents of the book while following the work flow (the R code chunks that have been created here for you). + +So, begin reading at . + +As soon as you see R code in the book, look at your RStudio, in this R Markdown sheet. Activate the R functions one at a time and see what happens. + +Have fun! This will get you ready for the rest of the IODS course! + +## 2.1 Reading data into R + +**NOTE:** The function library(...) is used to call another R package to work. When there is a library(...) function involved (as there is in the code chunk below), you must be prepared to *install* the package (if you have not installed that before). You can do that in the RStudio "Packages" pane on the right, but also by using an R function install.packages("...") in this editor. In some cases (like just below), the install.packages("...") function is given as a comment to the library call, which is a *useful trick*: you can select (with mouse or arrow keys) the install.packages("...") and run it (by Ctrl+Enter / Cmd+Enter). It is NOT a good idea to leave the install.packages("...") function in the code, outside of the comment (similarly as the library, for example), as then the package would be installed unnecessarily often (which takes time and other resources). Usually, installing is done once, and then you can use the package (with library(...)) as many times as you need to. + +**So try this below:** select (with mouse or arrow keys) the install.packages("...") and run it (by Ctrl+Enter / Cmd+Enter). It will install the tidyverse package collection that we will need all the time. + +After an successful installation (that might take some time), continue by activating the actual functions, beginning from library(...) 
and then finally reading the data in: + +```{r} +library(tidyverse) # install.packages("tidyverse") +example_data <- read_csv("https://raw.githubusercontent.com/KimmoVehkalahti/RHDS/master/example_data.csv") +View(example_data) # look at the data (and then close that view to return to the editor) +``` + +### 2.1.2 Reading in the Global Burden of Disease example dataset + +```{r} +library(tidyverse) +gbd_short <- read_csv("https://raw.githubusercontent.com/KimmoVehkalahti/RHDS/master/data/global_burden_disease_cause-year.csv") +View(gbd_short) +``` + +## 2.2 Variable types and why we care + +**Note:** Some outputs may differ a bit from the book, because R is constantly developed further. One example is the following. (We have modified the code chunk a bit to reveal the outputs shown in the book.) + +```{r} +library(tidyverse) +typesdata <- read_csv("https://raw.githubusercontent.com/KimmoVehkalahti/RHDS/master/data/typesdata.csv") +spec(typesdata) +typesdata +``` + +```{r} +typesdata_faulty <- read_csv("https://raw.githubusercontent.com/KimmoVehkalahti/RHDS/master/data/typesdata_faulty.csv") +spec(typesdata_faulty) +typesdata_faulty +``` + +### 2.2.1 Numeric variables (continuous) + +```{r} +typesdata$measurement %>% mean() +measurement_mean <- typesdata$measurement %>% mean() + +measurement_mean == 3.333333 + +(0.10 + 0.05) == 0.15 + +library(tidyverse) +near(0.10+0.05, 0.15) + +near(measurement_mean, 3.333333, 0.000001) +``` + +### 2.2.2 Character variables + +```{r} +library(tidyverse) +typesdata %>% + count(group) + +typesdata %>% + count(group, sort = TRUE) + +# all ids are unique: +typesdata %>% + count(id, sort = TRUE) + +# we add in a duplicate row where id = ID3, +# then count again: +typesdata %>% + add_row(id = "ID3") %>% + count(id, sort = TRUE) +``` + +### 2.2.3 Factor variables (categorical) + +(just read - more about this topic will follow later...) + +### 2.2.4 Date/time variables + +Remember to install the lubridate package first... 
+ +```{r} +library(lubridate) # install.packages("lubridate") # makes working with dates easier +current_datetime <- Sys.time() +current_datetime + +my_datetime <- "2020-12-01 12:00" +my_datetime + +# error that is explained in the book... (you may put '#' in front of it) +my_datetime - current_datetime + +current_datetime %>% class() +my_datetime %>% class() +``` + +2.2.4 continued... + +```{r} +my_datetime_converted <- ymd_hm(my_datetime) +my_datetime_converted + +# now it will work: +my_datetime_converted - current_datetime + +my_datesdiff <- my_datetime_converted - current_datetime +my_datesdiff %>% class() + +ymd_hm("2021-01-02 12:00") + my_datesdiff + +# another error challenge here... (again you may put '#' in front of that afterwards) +560/my_datesdiff +560/as.numeric(my_datesdiff) +``` + +2.2.4 continued... + +```{r} +parse_date_time("12:34 07/Jan'20", "%H:%M %d/%b'%y") + +Sys.time() +Sys.time() %>% format("%H:%M on %B-%d (%Y)") + +Sys.time() %>% format("Happy days, the current time is %H:%M %B-%d (%Y)!") +``` + +## 2.3 Objects and functions + +```{r} +library(tidyverse) +mydata <- tibble( + id = 1:4, + sex = c("Male", "Female", "Female", "Male"), + var1 = c(4, 1, 2, 3), + var2 = c(NA, 4, 5, NA), + var3 = c(2, 1, NA, NA) +) +``` + +### 2.3.1 data frame/tibble + +### 2.3.2 Naming objects + +```{r} +mydata +``` + +### 2.3.3 Function and its arguments + +```{r} +mydata$var1 + +mean(mydata$var1) + +mean(mydata$var2) + +mean(mydata$sex) + +mean(mydata$var2, na.rm = TRUE) + +Sys.time() +``` + +### 2.3.4 Working with objects + +```{r} +a <- 103 + +a + +seq(15, 30) + +example_sequence <- seq(15, 30) + +example_sequence <- example_sequence/2 + +example_sequence +``` + +### 2.3.5 \<- and = + +```{r} +mean_result <- mean(mydata$var2, na.rm = TRUE) +``` + +### 2.3.6 Recap: object, function, input, argument + +## 2.4 Pipe - %\>% + +```{r} +library(tidyverse) +mydata$var1 %>% mean() + +mean_result <- mydata$var1 %>% mean() +``` + +### 2.4.1 Using . 
to direct the pipe + +```{r} +mydata %>% + lm(var1~var2, data = .) +``` + +## 2.5 Operators for filtering data + +```{r} +gbd_short %>% + filter(year < 1995) + +gbd_short %>% + filter(year <= 1995) + +gbd_short %>% + filter(year == 1995) +``` + +2.5 continued... + +```{r} +# do you see what is wrong here? (you may fix it or hide it with '#' afterwards) +gbd_short %>% + filter(year = 1995) + +gbd_short %>% + filter(year == 1995 | year == 2017) + +gbd_short %>% + filter(year == max(year) | year == min(year)) +``` + +### 2.5.1 Worked examples + +```{r} +mydata_year2000 <- gbd_short %>% + filter(year == 2000) + +mydata_year2000 +``` + +2.5.1 continued... + +```{r} +new_data_selection <- gbd_short %>% + filter((year == 1990 | year == 2013) & cause == "Communicable diseases") + +new_data_selection +``` + +2.5.1 continued... + +```{r} +# Or we can get rid of the extra brackets around the years +# by moving cause into a new filter on a new line: + +new_data_selection <- gbd_short %>% + filter(year == 1990 | year == 2013) %>% + filter(cause == "Communicable diseases") + +new_data_selection +``` + +2.5.1 continued... 
+ +```{r} + +# Or even better, we can include both in one filter() call, as all +# separate conditions are by default joined with "&": + +new_data_selection <- gbd_short %>% + filter(year == 1990 | year == 2013, + cause == "Communicable diseases") + +new_data_selection +``` + +## 2.6 The combine function: c() + +```{r} +gbd_short$cause %>% unique() + +gbd_short %>% + # also filtering for a single year to keep the result concise + filter(year == 1990) %>% + filter(cause == "Communicable diseases" | cause == "Non-communicable diseases") + +gbd_short %>% + filter(year == 1990) %>% + filter(cause %in% c("Communicable diseases", "Non-communicable diseases")) +``` + +## 2.7 Missing values (NAs) and filters + +```{r} +mydata + +mydata %>% + filter(is.na(var2)) + +mydata %>% + filter(!is.na(var2)) + +mydata %>% + filter(var2 != 5) + +mydata %>% + filter(var2 != 5 | is.na(var2)) +``` + +2.7 continued... + +```{r} +subset1 <- mydata %>% + filter(var2 == 5) + +subset2 <- mydata %>% + filter(! var2 == 5) + +subset1; subset2 + +nrow(mydata) +nrow(subset1) +nrow(subset2) + +nrow(subset1) + nrow(subset2) == nrow(mydata) + +subset3 <- mydata %>% + filter(is.na(var2)) + +nrow(subset1) + nrow(subset2) + nrow(subset3) == nrow(mydata) +``` + +## 2.8 Creating new columns - mutate() + +```{r} +typesdata + +typesdata$measurement + +typesdata$measurement/2 + +typesdata %>% + mutate(measurement/2) + +typesdata %>% + mutate(measurement_half = measurement/2) +``` + +2.8 continued... + +```{r} +mydata$`Nasty column name` + +# or + +mydata %>% + select(`Nasty column name`) +``` + +2.8 continued... + +```{r} +typesdata_modified <- typesdata %>% + mutate(measurement_half = measurement/2) + +typesdata_modified +``` + +2.8 continued... 
+ +```{r} +library(lubridate) +typesdata %>% + mutate(reference_date = ymd_hm("2020-01-01 12:00"), + dates_difference = reference_date - date) %>% + select(date, reference_date, dates_difference) + +typesdata %>% + mutate(mean_measurement = mean(measurement)) + +typesdata %>% + mutate(mean_measurement = mean(measurement)) %>% + mutate(measurement_relative = measurement/mean_measurement) %>% + select(matches("measurement")) + +``` + +### 2.8.1 Worked example/exercise + +```{r} +typesdata %>% + mutate(reference_date = ymd_hm("2020-01-01 12:00"), + dates_difference = reference_date - date) %>% + mutate(dates_difference = round(dates_difference)) %>% + select(matches("date")) +``` + +## 2.9 Conditional calculations - if_else() + +```{r} +typesdata %>% + mutate(above_threshold = if_else(measurement > 3, + "Above three", + "Below three")) +``` + +## 2.10 Create labels - paste() + +```{r} +typesdata %>% + mutate(plot_label = paste(id, + "was last measured at", date, + ", and the value was", measurement)) %>% + select(plot_label) + +pastedata <- tibble(year = c(2007, 2008, 2009), + month = c("Jan", "Feb", "March"), + day = c(1, 2, 3)) +pastedata +``` + +2.10 continued... 
+ +```{r} +pastedata %>% + mutate(date = paste(day, month, year, sep = "-")) + +library(lubridate) + +pastedata %>% + mutate(date = paste(day, month, year, sep = "-")) %>% + mutate(date = dmy(date)) +``` + +## 2.11 Joining multiple datasets + +```{r} +library(tidyverse) +patientdata <- read_csv("https://raw.githubusercontent.com/KimmoVehkalahti/RHDS/master/data/patient_data.csv") +patientdata + +labsdata <- read_csv("https://raw.githubusercontent.com/KimmoVehkalahti/RHDS/master/data/labs_data.csv") +labsdata + + full_join(patientdata, labsdata) + +inner_join(patientdata, labsdata) + + left_join(patientdata, labsdata) + +right_join(patientdata, labsdata) +``` + +### 2.11.1 Further notes about joins + +```{r} +patientdata_new <- read_csv("https://raw.githubusercontent.com/KimmoVehkalahti/RHDS/master/data/patient_data_updated.csv") +patientdata_new + +bind_rows(patientdata, patientdata_new) + +labsdata_updated <- labsdata %>% + add_row(id = 5, measurement = 2.49) +labsdata_updated + +left_join(patientdata, labsdata_updated) +``` + +Well done! That was active reading of Chapter 2. + +Working will continue below with Chapter 3... + +------------------------------------------------------------------------ + +# Chapter 3: Summarising data + +Continue reading at and working with the R code chunks. + +## 3.1 Get the data + +```{r} +library(tidyverse) +gbd_full <- read_csv("https://raw.githubusercontent.com/KimmoVehkalahti/RHDS/master/data/global_burden_disease_cause-year-sex-income.csv") + +# Creating a single-year tibble for printing and simple examples: +gbd2017 <- gbd_full %>% + filter(year == 2017) + +View(gbd2017) +``` + +## 3.2 Plot the data + +```{r} +gbd2017 %>% + # without the mutate(... 
= fct_relevel()) + # the panels get ordered alphabetically + mutate(income = fct_relevel(income, + "Low", + "Lower-Middle", + "Upper-Middle", + "High")) %>% + # defining the variables using ggplot(aes(...)): + ggplot(aes(x = sex, y = deaths_millions, fill = cause)) + + # type of geom to be used: column (that's a type of barplot): + geom_col(position = "dodge") + + # facets for the income groups: + facet_wrap(~income, ncol = 4) + + # move the legend to the top of the plot (default is "right"): + theme(legend.position = "top") + +``` + +## 3.3 Aggregating: group_by(), summarise() + +```{r} +gbd2017$deaths_millions %>% sum() + +gbd2017 %>% + summarise(sum(deaths_millions)) + +gbd2017 %>% + group_by(cause) %>% + summarise(sum(deaths_millions)) + +gbd2017 %>% + group_by(cause, sex) %>% + summarise(sum(deaths_millions)) +``` + +## 3.4 Add new columns: mutate() + +```{r} +gbd2017 %>% + group_by(cause, sex) %>% + summarise(deaths_per_group = sum(deaths_millions)) %>% + ungroup() %>% + mutate(deaths_total = sum(deaths_per_group)) +``` + +### 3.4.1 Percentages formatting: percent() + +```{r} +# percent() function for formatting percentages come from library(scales) +library(scales) +gbd2017_summarised <- gbd2017 %>% + group_by(cause, sex) %>% + summarise(deaths_per_group = sum(deaths_millions)) %>% + ungroup() %>% + mutate(deaths_total = sum(deaths_per_group), + deaths_relative = percent(deaths_per_group/deaths_total)) +gbd2017_summarised + +# using values from the first row as an example: +round(100*4.91/55.74, 1) %>% paste0("%") + +gbd2017_summarised %>% + mutate(deaths_relative = deaths_per_group/deaths_total) +``` + +## 3.5 summarise() vs mutate() + +```{r} +gbd_summarised <- gbd2017 %>% + group_by(cause, sex) %>% + summarise(deaths_per_group = sum(deaths_millions)) %>% + arrange(sex) + +gbd_summarised + +gbd_summarised_sex <- gbd_summarised %>% + group_by(sex) %>% + summarise(deaths_per_sex = sum(deaths_per_group)) + +gbd_summarised_sex +``` + +3.5 continued... 
+ +```{r} +full_join(gbd_summarised, gbd_summarised_sex) + +gbd_summarised %>% + group_by(sex) %>% + mutate(deaths_per_sex = sum(deaths_per_group)) + +gbd2017 %>% + group_by(cause, sex) %>% + summarise(deaths_per_group = sum(deaths_millions)) %>% + group_by(sex) %>% + mutate(deaths_per_sex = sum(deaths_per_group), + sex_cause_perc = percent(deaths_per_group/deaths_per_sex)) %>% + arrange(sex, deaths_per_group) +``` + +## 3.6 Common arithmetic functions - sum(), mean(), median(), etc. + +```{r} +mynumbers <- c(1, 2, NA) +sum(mynumbers) + +sum(mynumbers, na.rm = TRUE) +``` + +## 3.7 select() columns + +```{r} +gbd_2rows <- gbd_full %>% + slice(1:2) + +gbd_2rows + +gbd_2rows %>% + select(cause, deaths_millions) + +gbd_2rows %>% + select(cause, deaths = deaths_millions) + +gbd_2rows %>% + rename(deaths = deaths_millions) + +gbd_2rows %>% + select(year, sex, income, cause, deaths_millions) + +gbd_2rows %>% + select(year, sex, everything()) + +gbd_2rows %>% + select(starts_with("deaths")) +``` + +## 3.8 Reshaping data - long vs wide format + +```{r} +gbd_wide <- read_csv("https://raw.githubusercontent.com/KimmoVehkalahti/RHDS/master/data/global_burden_disease_wide-format.csv") +gbd_long <- read_csv("https://raw.githubusercontent.com/KimmoVehkalahti/RHDS/master/data/global_burden_disease_cause-year-sex.csv") + +gbd_wide +gbd_long +``` + +### 3.8.1 Pivot values from rows into columns (wider) + +```{r} +gbd_long %>% + pivot_wider(names_from = year, values_from = deaths_millions) + +gbd_long %>% + pivot_wider(names_from = sex, values_from = deaths_millions) %>% + mutate(Male - Female) + +gbd_long %>% + pivot_wider(names_from = c(sex, year), values_from = deaths_millions) +``` + +### 3.8.2 Pivot values from columns to rows (longer) + +```{r} +gbd_wide %>% + pivot_longer(matches("Female|Male"), + names_to = "sex_year", + values_to = "deaths_millions") %>% + slice(1:6) + +gbd_wide %>% + select(matches("Female|Male")) +``` + +### 3.8.3 separate() a column into multiple columns + 
+```{r} +gbd_wide %>% + # same pivot_longer as before + pivot_longer(matches("Female|Male"), + names_to = "sex_year", + values_to = "deaths_millions") %>% + separate(sex_year, into = c("sex", "year"), sep = "_", convert = TRUE) +``` + +## 3.9 arrange() rows + +```{r} +gbd_long %>% + arrange(deaths_millions) %>% + # first 3 rows just for printing: + slice(1:3) + +gbd_long %>% + arrange(-deaths_millions) %>% + slice(1:3) + +gbd_long %>% + arrange(desc(sex)) %>% + # printing rows 1, 2, 11, and 12 + slice(1,2, 11, 12) +``` + +### 3.9.1 Factor levels + +```{r} +gbd_factored <- gbd_long %>% + mutate(cause = factor(cause)) + +gbd_factored$cause %>% levels() + +gbd_factored <- gbd_factored %>% + mutate(cause = cause %>% + fct_relevel("Injuries")) + +gbd_factored$cause %>% levels() +``` + +### 3.10.1 Exercise - pivot_wider() + +Look at Table 3.4 on page + +```{r} +gbd_long <- read_csv("https://raw.githubusercontent.com/KimmoVehkalahti/RHDS/master/data/global_burden_disease_cause-year-sex.csv") + +# Solution: + +gbd_long %>% + pivot_wider(names_from = cause, values_from = deaths_millions) +``` + +### 3.10.2 Exercise - group_by(), summarise() + +```{r} +gbd_full <- read_csv("https://raw.githubusercontent.com/KimmoVehkalahti/RHDS/master/data/global_burden_disease_cause-year-sex-income.csv") + +glimpse(gbd_full) + +summary_data1 <- + gbd_full %>% + group_by(year) %>% + summarise(total_per_year = sum(deaths_millions)) + +summary_data1 + +summary_data2 <- + gbd_full %>% + group_by(year, cause) %>% + summarise(total_per_cause = sum(deaths_millions)) + +summary_data2 +``` + +### 3.10.3 Exercise - full_join(), percent() + +```{r} +# Solution: + +library(scales) +full_join(summary_data1, summary_data2) %>% + mutate(percentage = percent(total_per_cause/total_per_year)) +``` + +### 3.10.4 Exercise - mutate(), summarise() + +```{r} +# Solution: + +gbd_full %>% + # aggregate to deaths per cause per year using summarise() + group_by(year, cause) %>% + summarise(total_per_cause = 
sum(deaths_millions)) %>% + # then add a column of yearly totals using mutate() + group_by(year) %>% + mutate(total_per_year = sum(total_per_cause)) %>% + # add the percentage column + mutate(percentage = percent(total_per_cause/total_per_year)) %>% + # select the final variables for better viewing + select(year, cause, percentage) %>% + pivot_wider(names_from = cause, values_from = percentage) +``` + +### 3.10.5 Exercise - filter(), summarise(), pivot_wider() + +```{r} +# Solution: + +gbd_full %>% + filter(year == 1990) %>% + group_by(income, sex) %>% + summarise(total_deaths = sum(deaths_millions)) %>% + pivot_wider(names_from = income, values_from = total_deaths) +``` + +Wow! That was all for Chapter 3. GOOD JOB. + +The next chapter will take us in plotting awesome graphs. Let's proceed! + +------------------------------------------------------------------------ + +# Chapter 4: Different types of plots + +Look at Figure 4.1 at . + +You will now re-create the figure step by step! + +## 4.1 Get the data + +```{r} +library(tidyverse) +library(gapminder) # install.packages("gapminder") + +glimpse(gapminder) + +gapminder$year %>% unique() +gapminder$country %>% n_distinct() +gapminder$continent %>% unique() + +gapdata2007 <- gapminder %>% + filter(year == 2007) + +gapdata2007 + +# loads the gapminder dataset from the package environment +# into your Global Environment +gapdata <- gapminder +``` + +## 4.2 Anatomy of ggplot explained + +```{r} +# recommended form: +gapdata2007 %>% + ggplot(aes(x = gdpPercap, y = lifeExp)) + +# NOT recommended form: +ggplot(gapdata2007, aes(x = gdpPercap, y = lifeExp)) + +# just a schematic example of using the pipe: +# +# data %>% +# filter(...) %>% +# mutate(...) %>% +# ggplot(aes(...)) + +# ... +``` + +4.2 continued... 
+ +```{r} +gapdata2007 %>% + ggplot(aes(x = gdpPercap, y = lifeExp)) + + geom_point() + +gapdata2007 %>% + ggplot(aes(x = continent, y = lifeExp)) + + geom_point() + +gapdata2007 %>% + ggplot(aes(x = gdpPercap, y = lifeExp, colour = continent)) + + geom_point() + +gapdata2007 %>% + ggplot(aes(x = gdpPercap, y = lifeExp, colour = continent)) + + geom_point(shape = 1) + +``` + +4.2 continued... + +```{r} +gapdata2007 %>% + ggplot(aes(x = gdpPercap, y = lifeExp, colour = continent)) + + geom_point(shape = 1) + + facet_wrap(~continent) + +gapdata2007 %>% + ggplot(aes(x = gdpPercap, y = lifeExp, colour = continent)) + + geom_point(shape = 1) + + facet_wrap(~pop > 50000000) + +gapdata2007 %>% + ggplot(aes(x = gdpPercap/1000, y = lifeExp, colour = continent)) + + geom_point(shape = 1) + + facet_wrap(~continent) + + theme_bw() +``` + +## 4.3 Set your theme - grey vs white + +```{r} +theme_set(theme_bw()) + +library(tidyverse) +theme_set(theme_bw()) +``` + +## 4.4 Scatter plots/bubble plots + +```{r} +gapdata2007 %>% + ggplot(aes(x = gdpPercap/1000, y = lifeExp, size = pop)) + + geom_point() + +gapdata2007 %>% + ggplot(aes(x = gdpPercap/1000, y = lifeExp, size = pop)) + + geom_point(shape = 1, alpha = 0.5) +``` + +## 4.5 Line plots/time series plots + +```{r} +gapdata %>% + filter(country == "United Kingdom") %>% + ggplot(aes(x = year, y = lifeExp)) + + geom_line() + +gapdata %>% + ggplot(aes(x = year, y = lifeExp)) + + geom_line() + +gapdata %>% + ggplot(aes(x = year, y = lifeExp, group = country)) + + geom_line() + +``` + +### 4.5.1 Exercise + +Look at Figure 4.9 on page + +```{r} +gapdata %>% + ggplot(aes(x = year, y = lifeExp, group = country)) + + geom_line() +``` + +Try to transform the above graph following these instructions: + +- Colour lines by continents: add colour = continent inside aes(); +- Continents on separate facets: + facet_wrap(\~continent); +- Use a nicer colour scheme: + scale_colour_brewer(palette = "Paired") + +## 4.6 Bar plots + +### 4.6.1 
Summarised data + +```{r} +gapdata2007 %>% + filter(country %in% c("United Kingdom", "France", "Germany")) %>% + ggplot(aes(x = country, y = lifeExp)) + + geom_col() +``` + +### 4.6.2 Countable data + +```{r} +gapdata2007 %>% + count(continent) + +gapdata2007 %>% + ggplot(aes(x = continent)) + + geom_bar() + +gapdata2007 %>% + ggplot(aes(x = continent, colour = country)) + + geom_bar(fill = NA) + + theme(legend.position = "none") +``` + +### 4.6.3 colour vs fill + +### 4.6.4 Proportions + +```{r} +gapdata2007 %>% + ggplot(aes(x = "Global", fill = continent)) + + geom_bar() +``` + +### 4.6.5 Exercise + +Look at Figure 4.13 on the page and try to recreate it! + +```{r} + +``` + +## 4.7 Histograms + +```{r} +gapdata2007 %>% + ggplot(aes(x = lifeExp)) + + geom_histogram(binwidth = 10) +``` + +## 4.8 Box plots + +```{r} +gapdata2007 %>% + ggplot(aes(x = continent, y = lifeExp)) + + geom_boxplot() +``` + +## 4.9 Multiple geoms, multiple aes() + +```{r} +# (1) +gapdata2007 %>% + ggplot(aes(x = continent, y = lifeExp)) + + geom_boxplot() + + geom_point() + +# (2) +gapdata2007 %>% + ggplot(aes(x = continent, y = lifeExp)) + + geom_boxplot() + + geom_jitter() +``` + +4.9 continued... 
+ +```{r} +# (3) +gapdata2007 %>% + ggplot(aes(x = continent, y = lifeExp, colour = continent)) + + geom_boxplot() + + geom_jitter() + +# (4) +gapdata2007 %>% + ggplot(aes(x = continent, y = lifeExp)) + + geom_boxplot() + + geom_jitter(aes(colour = continent)) +``` + +### 4.9.1 Worked example - three geoms together + +```{r} +label_data <- gapdata2007 %>% + group_by(continent) %>% + filter(lifeExp == max(lifeExp)) %>% + select(country, continent, lifeExp) + +# since we filtered for lifeExp == max(lifeExp) +# these are the maximum life expectancy countries at each continent: +label_data + +gapdata2007 %>% + ggplot(aes(x = continent, y = lifeExp)) + + # First geom - boxplot + geom_boxplot() + + # Second geom - jitter with its own aes(colour = ) + geom_jitter(aes(colour = continent)) + + # Third geom - label, with its own dataset (label_data) and aes(label = ) + geom_label(data = label_data, aes(label = country)) +``` + +Try the suggested experiments given on page with the R code above. (Copy-paste the above code chunk below to experiment with it!) + +## 4.10 All other types of plots + +## 4.11 Solutions + +You will find solutions to Exercises 4.5.1 and 4.6.5 on the page + +## 4.12 Extra: Advanced examples + +(this is all extra material, if you are curious and have some time to check it!) + +```{r} +gapdata %>% + filter(continent == "Europe") %>% + ggplot(aes(y = fct_reorder(country, lifeExp, .fun=max), + x = lifeExp, + colour = year)) + + geom_point(shape = 15, size = 2) + + scale_colour_distiller(palette = "Greens", direction = 1) + + theme_bw() +``` + +another example: + +```{r} +gapdata2007 %>% + group_by(continent) %>% + mutate(country_number = seq_along(country)) %>% + ggplot(aes(x = continent)) + + geom_bar(aes(colour = continent), fill = NA, show.legend = FALSE) + + geom_text(aes(y = country_number, label = country), vjust = 1)+ + geom_label(aes(label = continent), y = -1) + + theme_void() +``` + +Great! That was it for Chapter 4. 
+ +The next chapter will take us further in fine tuning plots. It can be well considered EXTRA MATERIAL, so if you are curious, just go on and try it, too! + +**You may also skip Chapter 5 and proceed straight to Chapter 6.** + +You may come back in Chapter 5 anytime later, to learn the fine-tuning tricks! + +------------------------------------------------------------------------ + +# Chapter 5: Fine tuning plots + + + +## 5.1 Get the data + +```{r} +library(gapminder) +library(tidyverse) + +p0 <- gapminder %>% + filter(year == 2007) %>% + ggplot(aes(y = lifeExp, x = gdpPercap, colour = continent)) + + geom_point(alpha = 0.3) + + theme_bw() + + geom_smooth(method = "lm", se = FALSE) + + scale_colour_brewer(palette = "Set1") + +p0 +``` + +## 5.2 Scales + +### 5.2.1 Logarithmic + +```{r} +p1 <- p0 + scale_x_log10() + +p1 +``` + +### 5.2.2 Expand limits + +```{r} +p2 <- p0 + expand_limits(y = 0) + +p2 + +p3 <- p0 + expand_limits(y = c(0, 100)) + +p3 + +p4 <- p0 + + expand_limits(y = c(0, 100)) + + coord_cartesian(expand = FALSE) + +p4 + +# you may install the package also by selecting the command after '#' and activating it: +library(patchwork) # install.packages("patchwork") +p1 + p2 + p3 + p4 + plot_annotation(tag_levels = "1", tag_prefix = "p") +``` + +### 5.2.3 Zoom in + +```{r} +p5 <- p0 + + coord_cartesian(ylim = c(70, 85), xlim = c(20000, 40000)) + +p5 +``` + +### 5.2.4 Exercise + +```{r} +p6 <- p0 + + scale_y_continuous(limits = c(70, 85)) + + scale_x_continuous(limits = c(20000, 40000)) + +# Compare: + +p5 + labs(tag = "p5") + p6 + labs(tag = "p6") +``` + +### 5.2.5 Axis ticks + +```{r} +# calculating the maximum value to be included in the axis breaks: +max_value = gapminder %>% + filter(year == 2007) %>% + summarise(max_lifeExp = max(lifeExp)) %>% + pull(max_lifeExp) %>% + round(1) + +# using scale_y_continuous(breaks = ...): +p7 <- p0 + + coord_cartesian(ylim = c(0, 100), expand = 0) + + scale_y_continuous(breaks = c(18, 50, max_value)) + +# we may also 
include custom labels for our breaks: +p8 <- p0 + + coord_cartesian(ylim = c(0, 100), expand = 0) + + scale_y_continuous(breaks = c(18, 50, max_value), labels = c("Adults", "50", "MAX")) + +p7 + labs(tag = "p7") + p8 + labs(tag = "p8") +``` + +## 5.3 Colours + +### 5.3.1 Using the Brewer palettes: + +```{r} +p9 <- p0 + + scale_color_brewer(palette = "Paired") +``` + +### 5.3.2 Legend title + +```{r} +p10 <- p0 + + scale_color_brewer("Continent - \n one of 5", palette = "Paired") + +p9 + labs(tag = "p9") + p10 + labs(tag = "p10") +``` + +### 5.3.3 Choosing colours manually + +```{r} +p11 <- p0 + + scale_color_manual(values = c("red", "green", "blue", "purple", "pink")) + +p12 <- p0 + + scale_color_manual(values = c("#8dd3c7", "#ffffb3", "#bebada", + "#fb8072", "#80b1d3")) + +p11 + labs(tag = "p11") + p12 + labs(tag = "p12") +``` + +## 5.4 Titles and labels + +```{r} +p13 <- p0 + + labs(x = "Gross domestic product per capita", + y = "Life expectancy", + title = "Health and economics", + subtitle = "Gapminder dataset, 2007", + caption = Sys.Date(), + tag = "p13") + +p13 +``` + +### 5.4.1 Annotation + +```{r} +p14 <- p0 + + annotate("text", + x = 25000, + y = 50, + label = "No points here!") + +p15 <- p0 + + annotate("label", + x = 25000, + y = 50, + label = "No points here!") + +p16 <- p0 + + annotate("label", + x = 25000, + y = 50, + label = "No points here!", + hjust = 0) + +p14 + labs(tag = "p14") + (p15 + labs(tag = "p15"))/ (p16 + labs(tag = "p16")) +``` + +### 5.4.2 Annotation with a superscript and a variable + +```{r} +# a value we made up for this example +# a real analysis would get it from the linear model object +fit_glance <- tibble(r.squared = 0.7693465) + + +plot_rsquared <- paste0( + "R^2 == ", + fit_glance$r.squared %>% round(2)) + + +p17 <- p0 + + annotate("text", + x = 25000, + y = 50, + label = plot_rsquared, parse = TRUE, + hjust = 0) + +p17 + labs(tag = "p17") +``` + +## 5.5 Overall look - theme() + +### 5.5.1 Text size + +```{r} +p18 <- p0 + + 
theme(axis.text.y = element_text(colour = "green", size = 14), + axis.text.x = element_text(colour = "red", angle = 45, vjust = 0.5), + axis.title = element_text(colour = "blue", size = 16) + ) + +p18 + labs(tag = "p18") +``` + +### 5.5.2 Legend position + +```{r} +p19 <- p0 + + theme(legend.position = "none") + +p20 <- p0 + + theme(legend.position = c(1,0), #bottom-right corner + legend.justification = c(1,0)) + +p19 + labs(tag = "p19") + p20 + labs(tag = "p20") +``` + +5.5.2 continued... + +```{r} +p21 <- p0 + + guides(colour = guide_legend(ncol = 2)) + + theme(legend.position = "top") # moving to the top optional + +p21 + labs(tag = "p21") +``` + +## 5.6 Saving your plot + +```{r} +ggsave(p0, file = "my_saved_plot.pdf", width = 5, height = 4) + +ggsave(p0, file = "my_saved_plot_larger.pdf", width = 10, height = 8) +``` + +Congrats - that was all for Chapter 5! + +Chapters 6 and 8 prepare for the corresponding analysis chapters (7 and 9) in the RHDS book. + +Both chapters (6 and 8) include useful things, for example, for plotting and data wrangling with continuous (Ch.6) and categorical (Ch.8) variables. Hence you should practice them through, too. 
+ +------------------------------------------------------------------------ + +# Chapter 6: Working with continuous outcome variables + + + +## 6.1 Continuous data + +## 6.2 The Question + +## 6.3 Get and check the data + +```{r} +# Load packages (and install the finalfit package if not yet installed) +library(tidyverse) +library(finalfit) # install.packages("finalfit") +library(gapminder) + +# Create object gapdata from object gapminder +gapdata <- gapminder + +glimpse(gapdata) # each variable as line, variable type, first values + +missing_glimpse(gapdata) # missing data for each variable + +ff_glimpse(gapdata) # summary statistics for each variable +``` + +## 6.4 Plot the data + +### 6.4.1 Histogram + +```{r} +gapdata %>% + filter(year %in% c(2002, 2007)) %>% + ggplot(aes(x = lifeExp)) + # remember aes() + geom_histogram(bins = 20) + # histogram with 20 bars + facet_grid(year ~ continent) # optional: add scales="free" +``` + +### 6.4.2 Quantile-quantile (Q-Q) plot + +```{r} +gapdata %>% + filter(year %in% c(2002, 2007)) %>% + ggplot(aes(sample = lifeExp)) + # Q-Q plot requires 'sample' + geom_qq() + # defaults to normal distribution + geom_qq_line(colour = "blue") + # add the theoretical line + facet_grid(year ~ continent) +``` + +### 6.4.3 Boxplot + +```{r} +gapdata %>% + filter(year %in% c(2002, 2007)) %>% + ggplot(aes(x = continent, y = lifeExp)) + + geom_boxplot() + + facet_wrap(~ year) +``` + +6.4.3 continued... 
+ +```{r} +gapdata %>% + filter(year %in% c(2002, 2007)) %>% + ggplot(aes(x = factor(year), y = lifeExp)) + + geom_boxplot(aes(fill = continent)) + # add colour to boxplots + geom_jitter(alpha = 0.4) + # alpha = transparency + facet_wrap(~ continent, ncol = 5) + # spread by continent + theme(legend.position = "none") + # remove legend + xlab("Year") + # label x-axis + ylab("Life expectancy (years)") + # label y-axis + ggtitle( + "Life expectancy by continent in 2002 v 2007") # add title +``` + +## 6.5 Compare the means of two groups + +### 6.5.1 t-test + +### 6.5.2 Two-sample t-tests + +```{r} +ttest_data <- gapdata %>% # save as object ttest_data + filter(year == 2007) %>% # 2007 only + filter(continent %in% c("Asia", "Europe")) # Asia/Europe only + +ttest_result <- ttest_data %>% # example using pipe + t.test(lifeExp ~ continent, data = .) # note data = ., see below +ttest_result + +ttest_result$p.value # Extracted element of result object + +ttest_result$conf.int # Extracted element of result object +``` + +### 6.5.3 Paired t-tests + +```{r} +paired_data <- gapdata %>% # save as object paired_data + filter(year %in% c(2002, 2007)) %>% # 2002 and 2007 only + filter(continent == "Asia") # Asia only + +paired_data %>% + ggplot(aes(x = year, y = lifeExp, + group = country)) + # for individual country lines + geom_line() +``` + +6.5.3 continued... + +```{r} +paired_table <- paired_data %>% # save object paired_data + select(country, year, lifeExp) %>% # select vars interest + pivot_wider(names_from = year, # put years in columns + values_from = lifeExp) %>% + mutate( + dlifeExp = `2007` - `2002` # difference in means + ) +paired_table + +# Mean of difference in years +paired_table %>% summarise( mean(dlifeExp) ) + +paired_data %>% + t.test(lifeExp ~ year, data = ., paired = TRUE) +``` + +### 6.5.4 What if I run the wrong test? + +## 6.6 Compare the mean of one group: one sample t-tests + +```{r} +# the tidy() function comes from the package broom - install it first! 
+library(broom) # install.packages("broom") +gapdata %>% + filter(year == 2007) %>% # 2007 only + group_by(continent) %>% # split by continent + do( # dplyr function + t.test(.$lifeExp, mu = 77) %>% # compare mean to 77 years + tidy() # tidy into tibble + ) + +``` + +### 6.6.1 Interchangeability of t-tests + +```{r} +# note that we're using dlifeExp +# so the differences we calculated above +t.test(paired_table$dlifeExp, mu = 0) +``` + +## 6.7 Compare the means of more than two groups + +### 6.7.1 Plot the data + +```{r} +gapdata %>% + filter(year == 2007) %>% + filter(continent %in% + c("Americas", "Europe", "Asia")) %>% + ggplot(aes(x = continent, y=lifeExp)) + + geom_boxplot() +``` + +### 6.7.2 ANOVA + +```{r} +aov_data <- gapdata %>% + filter(year == 2007) %>% + filter(continent %in% c("Americas", "Europe", "Asia")) + +fit = aov(lifeExp ~ continent, data = aov_data) +summary(fit) +``` + +6.7.2 continued... + +```{r} +library(broom) # install.packages("broom") +gapdata %>% + filter(year == 2007) %>% + filter(continent %in% c("Americas", "Europe", "Asia")) %>% + aov(lifeExp~continent, data = .) 
%>% + tidy() +``` + +### 6.7.3 Assumptions + +```{r} +library(ggfortify) # install.packages("ggfortify") +autoplot(fit) +``` + +## 6.8 Multiple testing + +### 6.8.1 Pairwise testing and multiple comparisons + +```{r} +pairwise.t.test(aov_data$lifeExp, aov_data$continent, + p.adjust.method = "bonferroni") + +pairwise.t.test(aov_data$lifeExp, aov_data$continent, + p.adjust.method = "fdr") +``` + +## 6.9 Non-parametric tests + +### 6.9.1 Transforming data + +```{r} +africa2002 <- gapdata %>% # save as africa2002 + filter(year == 2002) %>% # only 2002 + filter(continent == "Africa") %>% # only Africa + select(country, lifeExp) %>% # only these variables + mutate( + lifeExp_log = log(lifeExp) # log life expectancy + ) +head(africa2002) # inspect + +africa2002 %>% + # pivot lifeExp and lifeExp_log values to same column (for easy plotting): + pivot_longer(contains("lifeExp")) %>% + ggplot(aes(x = value)) + + geom_histogram(bins = 15) + # make histogram + facet_wrap(~name, scales = "free") # facet with axes free to vary +``` + +### 6.9.2 Non-parametric test for comparing two groups + +```{r} +africa_data <- gapdata %>% + filter(year %in% c(1982, 2007)) %>% # only 1982 and 2007 + filter(continent %in% c("Africa")) # only Africa + +p1 <- africa_data %>% # save plot as p1 + ggplot(aes(x = lifeExp)) + + geom_histogram(bins = 15) + + facet_wrap(~year) + +p2 <- africa_data %>% # save plot as p2 + ggplot(aes(sample = lifeExp)) + # `sample` for Q-Q plot + geom_qq() + + geom_qq_line(colour = "blue") + + facet_wrap(~year) + +p3 <- africa_data %>% # save plot as p3 + ggplot(aes(x = factor(year), # try without factor(year) to + y = lifeExp)) + # see the difference + geom_boxplot(aes(fill = factor(year))) + # colour boxplot + geom_jitter(alpha = 0.4) + # add data points + theme(legend.position = "none") # remove legend + +library(patchwork) # install.packages("patchwork") # great for combining plots +p1 / p2 | p3 + +africa_data %>% + wilcox.test(lifeExp ~ year, data = .) 
+``` + +### 6.9.3 Non-parametric test for comparing more than two groups + +```{r} +library(broom) +gapdata %>% + filter(year == 2007) %>% + filter(continent %in% c("Americas", "Europe", "Asia")) %>% + kruskal.test(lifeExp~continent, data = .) %>% + tidy() +``` + +## 6.10 Finalfit approach + +```{r} +dependent <- "year" +explanatory <- c("lifeExp", "pop", "gdpPercap") +africa_data %>% + mutate( + year = factor(year) + ) %>% + summary_factorlist(dependent, explanatory, + cont = "median", p = TRUE) +``` + +6.10 continued... + +```{r} +dependent <- "year" +explanatory <- c("lifeExp", "pop", "gdpPercap") +africa_data %>% + mutate( + year = factor(year) + ) %>% + summary_factorlist(dependent, explanatory, + cont_nonpara = c(1, 3), # variable 1&3 are non-parametric + cont_range = TRUE, # lower and upper quartile + p = TRUE, # include hypothesis test + p_cont_para = "t.test", # use t.test/aov for parametric + add_row_totals = TRUE, # row totals + include_row_missing_col = FALSE, # missing values row totals + add_dependent_label = TRUE) # dependent label +``` + +## 6.11 Conclusions + +## 6.12 Exercises + +See page + +You may try yourself, and then check the solutions (link below)! + +### 6.12.1 Exercise + +```{r} + +``` + +### 6.12.2 Exercise + +```{r} + +``` + +### 6.12.3 Exercise + +```{r} + +``` + +### 6.12.4 Exercise + +```{r} + +``` + +## 6.13 Solutions + +Solutions to the above exercises! + + + +Great job! Chapter 6 DONE. Next: Chapter 8! 
+ +------------------------------------------------------------------------ + +# Chapter 8: Working with categorical outcome variables + + + +## 8.1 Factors + +## 8.2 The Question + +## 8.3 Get the data + +```{r} +# Get the data from the boot package (that includes tools for bootstrapping methods): +meldata <- boot::melanoma # Survival from Malignant Melanoma +``` + +## 8.4 Check the data + +```{r} +library(tidyverse) +library(finalfit) +theme_set(theme_bw()) +meldata %>% glimpse() + +meldata %>% ff_glimpse() +``` + +## 8.5 Recode the data + +```{r} +meldata <- meldata %>% + mutate(sex.factor = # Make new variable + sex %>% # from existing variable + factor() %>% # convert to factor + fct_recode( # forcats function + "Female" = "0", # new on left, old on right + "Male" = "1") %>% + ff_label("Sex"), # Optional label for finalfit + + # same thing but more condensed code: + ulcer.factor = factor(ulcer) %>% + fct_recode("Present" = "1", + "Absent" = "0") %>% + ff_label("Ulcerated tumour"), + + status.factor = factor(status) %>% + fct_recode("Died melanoma" = "1", + "Alive" = "2", + "Died - other causes" = "3") %>% + ff_label("Status")) + +View(meldata) # take a look at the data! +``` + +## 8.6 Should I convert a continuous variable to a categorical variable? + +```{r} +# Summary of age +meldata$age %>% + summary() + +meldata %>% + ggplot(aes(x = age)) + + geom_histogram() +``` + +### 8.6.1 Equal intervals vs quantiles + +```{r} +meldata <- meldata %>% + mutate( + age.factor = + age %>% + cut(4) + ) + +meldata$age.factor %>% + summary() +``` + +8.6.1 continued... + +```{r} +meldata <- meldata %>% + mutate( + age.factor = + age %>% + Hmisc::cut2(g=4) # Note, cut2 comes from the Hmisc package + ) + +meldata$age.factor %>% + summary() + +View(meldata) # take a look at the data! +``` + +8.6.1 continued... 
+ +```{r} +meldata <- meldata %>% + mutate( + age.factor = + age %>% + cut(breaks = c(4,20,40,60,95), include.lowest = TRUE) %>% + fct_recode( + "≤20" = "[4,20]", + "21 to 40" = "(20,40]", + "41 to 60" = "(40,60]", + ">60" = "(60,95]" + ) %>% + ff_label("Age (years)") + ) +head(meldata$age.factor) + +View(meldata) # take a look at the data! +``` + +## 8.7 Plot the data + +```{r} +p1 <- meldata %>% + ggplot(aes(x = ulcer.factor, fill = status.factor)) + + geom_bar() + + theme(legend.position = "none") + +p2 <- meldata %>% + ggplot(aes(x = ulcer.factor, fill = status.factor)) + + geom_bar(position = "fill") + + ylab("proportion") + +library(patchwork) +p1 + p2 +``` + +8.7 continued... + +```{r} +p1 <- meldata %>% + ggplot(aes(x = ulcer.factor, fill = status.factor)) + + geom_bar(position = position_stack(reverse = TRUE)) + + theme(legend.position = "none") + +p2 <- meldata %>% + ggplot(aes(x = ulcer.factor, fill = status.factor)) + + geom_bar(position = position_fill(reverse = TRUE)) + + ylab("proportion") + +library(patchwork) +p1 + p2 +``` + +8.7 continued... + +```{r} +p1 <- meldata %>% + ggplot(aes(x = ulcer.factor, fill=status.factor)) + + geom_bar(position = position_stack(reverse = TRUE)) + + facet_grid(sex.factor ~ age.factor) + + theme(legend.position = "none") + +p2 <- meldata %>% + ggplot(aes(x = ulcer.factor, fill=status.factor)) + + geom_bar(position = position_fill(reverse = TRUE)) + + facet_grid(sex.factor ~ age.factor)+ + theme(legend.position = "bottom") + + ylab("proportion") # this line was missing in the book + +p1 / p2 +``` + +## 8.8 Group factor levels together - fct_collapse() + +```{r} +meldata <- meldata %>% + mutate( + status_dss = fct_collapse( # dss - disease specific survival + status.factor, + "Alive" = c("Alive", "Died - other causes")) + ) + +View(meldata) # take a look at the data! 
+``` + +## 8.9 Change the order of values within a factor - fct_relevel() + +```{r} +# dss - disease specific survival +meldata$status_dss %>% levels() + +meldata <- meldata %>% + mutate(status_dss = status_dss %>% + fct_relevel("Alive") + ) + +meldata$status_dss %>% levels() +``` + +## 8.10 Summarising factors with finalfit + +```{r} +library(finalfit) +meldata %>% + summary_factorlist(dependent = "status_dss", + explanatory = "ulcer.factor") +``` + +8.10 continued... + +```{r} +library(finalfit) +meldata %>% + summary_factorlist(dependent = "status_dss", + explanatory = + c("ulcer.factor", "age.factor", + "sex.factor", "thickness") + ) +``` + +## 8.11 Pearson's chi-squared and Fisher's exact tests + +### 8.11.1 Base R + +```{r} +table(meldata$ulcer.factor, meldata$status_dss) + +# both give same result + +with(meldata, table(ulcer.factor, status_dss)) +``` + +8.11.1 continued... + +```{r} +library(magrittr) +meldata %$% # note $ sign here + table(ulcer.factor, status_dss) + +meldata %$% + table(ulcer.factor, status_dss) %>% + prop.table(margin = 1) # 1: row, 2: column etc. 
+ +meldata %$% # note $ sign here + table(ulcer.factor, status_dss) %>% + chisq.test() + +library(broom) +meldata %$% # note $ sign here + table(ulcer.factor, status_dss) %>% + chisq.test() %>% + tidy() +``` + +## 8.12 Fisher's exact test + +```{r} +meldata %$% # note $ sign here + table(age.factor, status_dss) %>% + chisq.test() + +meldata %$% # note $ sign here + table(age.factor, status_dss) %>% + fisher.test() +``` + +## 8.13 Chi-squared / Fisher's exact test using finalfit + +```{r} +library(finalfit) +meldata %>% + summary_factorlist(dependent = "status_dss", + explanatory = "ulcer.factor", + p = TRUE) + +meldata %>% + summary_factorlist(dependent = "status_dss", + explanatory = + c("ulcer.factor", "age.factor", + "sex.factor", "thickness"), + p = TRUE) + +meldata %>% + summary_factorlist(dependent = "status_dss", + explanatory = + c("ulcer.factor", "age.factor", + "sex.factor", "thickness"), + p = TRUE, + p_cat = "fisher") +``` + +(The code chunk in 8.13, below the text "Further options can be included" does not work...) + +## 8.14 Exercises + +### 8.14.1 Exercise + +```{r} + +``` + +### 8.14.2 Exercise + +```{r} +meldata %>% + count(ulcer.factor, status.factor) %>% + group_by(status.factor) %>% + mutate(total = sum(n)) %>% + mutate(percentage = round(100*n/total, 1)) %>% + mutate(count_perc = paste0(n, " (", percentage, ")")) %>% + select(-total, -n, -percentage) %>% + spread(status.factor, count_perc) +``` + +### 8.14.3 Exercise + +(Solutions to these exercises are not included in the book.) + +Gooood job! Chapter 8 DONE. 
+ +------------------------------------------------------------------------ diff --git a/Exercises/Exercise2.Rmd b/Exercises/Exercise2.Rmd new file mode 100644 index 000000000..608d24758 --- /dev/null +++ b/Exercises/Exercise2.Rmd @@ -0,0 +1,620 @@ +--- +title: "**Introduction to Open Data Science, Exercise Set 2**" + +subtitle: "**Regression and model validation**" + +output: + html_document: + theme: flatly + highlight: haddock + toc: true + toc_depth: 2 + number_section: false +--- + + +This set consists of a few numbered exercises. +Go to each exercise in turn and do as follows: + +1. Read the brief description of the exercise. +2. Run the (possible) pre-exercise-code chunk. +3. Follow the instructions to fix the R code! + +## 2.0 INSTALL THE REQUIRED PACKAGES FIRST! + +One or more extra packages (in addition to `tidyverse`) will be needed below. + +```{r} +# Select (with mouse or arrow keys) the install.packages("...") and +# run it (by Ctrl+Enter / Cmd+Enter): + +library(dplyr) +library(tidyverse) +library(GGally) +library(ggplot2) +library(gapminder) +library(finalfit) +library(broom) + + +``` + + +## 2.1 Reading data from the web + +The first step of data analysis with R is reading data into R. This is done with a function. Which function and function arguments to use to do this, depends on the original format of the data. + +Conveniently in R, the same functions for reading data can usually be used whether the data is saved locally on your computer or somewhere else behind a web URL. + +After the correct function has been identified and data read into R, the data will usually be in R `data.frame` format. The dimensions of a data frame are ($n$,$d$), where $n$ is the number of rows (the observations) and $d$ the number of columns (the variables). + +**The purpose of this course is to expose you to some basic and more advanced tasks of programming and data analysis with R.** + +```{r, echo=FALSE} +# Pre-exercise-code (Run this code chunk first! 
Do NOT edit it.) + +# Click the green arrow ("Run Current Chunk") in the upper-right corner of this chunk. This will initialize the R objects needed in the exercise. Then move to Instructions of the exercise to start working. + +# (No pre-exercise code in this exercise! Just go on!) + +``` + +### Instructions +- Read the `lrn14` data frame to memory with `read.table()`. There is information related to the data [here](http://www.helsinki.fi/~kvehkala/JYTmooc/JYTOPKYS3-meta.txt) +- Use `dim()` on the data frame to look at the dimensions of the data. How many rows and columns does the data have? +- Look at the structure of the data with `str()`. + +Hint: +- For both functions you can pass a data frame as the first (unnamed) argument. + +### R code +```{r} +# This is a code chunk in RStudio editor. +# Work with the exercise in this chunk, step-by-step. Fix the R code! + +# read the data into memory +lrn14 <- read.table("http://www.helsinki.fi/~kvehkala/JYTmooc/JYTOPKYS3-meta.txt", sep="\t", header=TRUE) + +# Look at the dimensions of the data + +# Look at the structure of the data +#use .txt file to import data set for better description. +# Preliminary results available at http://www.slideshare.net/kimmovehkalahti/the-relationship-between-learning-approaches-and-students-achievements-in-an-introductory-statistics-course-in-finland +#Total respondents n=183, total question n=60, so 184 rows including heading and 60 columns +#The code as respective column heading represents a question related to the survey and number. Each SN is a respondent and the answers to each question are given in a Likert scale (0-5). + +dim(lrn14) +str(lrn14) +``` + + +## 2.2 Scaling variables + +The next step is [wrangling the data](https://en.wikipedia.org/wiki/Data_wrangling) into a format that is easy to analyze. We will wrangle our data for the next few exercises. + +A neat thing about R is that many operations are *vectorized*. 
It means that a single operation can affect all elements of a vector. This is often convenient. + +The column `Attitude` in `lrn14` is a sum of 10 questions related to students' attitude towards statistics, each measured on the [Likert scale](https://en.wikipedia.org/wiki/Likert_scale) (1-5). Here we'll scale the combination variable back to the 1-5 scale. + +```{r, echo=FALSE} +# Pre-exercise-code (Run this code chunk first! Do NOT edit it.) + +# Click the green arrow ("Run Current Chunk") in the upper-right corner of this chunk. This will initialize the R objects needed in the exercise. Then move to Instructions of the exercise to start working. + +# read the data into memory +lrn14 <- read.table("http://www.helsinki.fi/~kvehkala/JYTmooc/JYTOPKYS3-data.txt", sep="\t", header=TRUE) + +``` + +### Instructions +- Execute the example codes to see how vectorized division works +- Use vector division to create a new column `attitude` in the `lrn14` data frame, where each observation of `Attitude` is scaled back to the original scale of the questions, by dividing it with the number of questions. + +Hint: +- Assign 'Attitude divided by 10' to the new column 'attitude'. + +### R code +```{r} +lrn14$attitude <- lrn14$Attitude / 10 +``` + + +## 2.3 Combining variables + +Our data includes many questions that can be thought to measure the same *dimension*. You can read more about the data and the variables [here](http://www.helsinki.fi/~kvehkala/JYTmooc/JYTOPKYS3-meta.txt). Here we'll combine multiple questions into combination variables. 
Useful functions for summation with data frames in R are + +function | description +------------- | ---------- +`colSums(df)` | returns a sum of each column in `df` +`rowSums(df)` | returns a sum of each row in `df` +`colMeans(df)` | returns the mean of each column in `df` +`rowMeans(df)` | returns the mean of each row in `df` + +We'll combine the use of `rowMeans()` with the `select()` function from the **dplyr** library to average the answers of selected questions. See how it is done from the example codes. + +```{r, echo=FALSE} +# Pre-exercise-code (Run this code chunk first! Do NOT edit it.) + +# Click the green arrow ("Run Current Chunk") in the upper-right corner of this chunk. This will initialize the R objects needed in the exercise. Then move to Instructions of the exercise to start working. + +# read the data into memory +lrn14 <- read.table("http://www.helsinki.fi/~kvehkala/JYTmooc/JYTOPKYS3-data.txt", sep="\t", header=TRUE) +lrn14$attitude <- lrn14$Attitude / 10 +``` + +### Instructions +- Access the **dplyr** library +- Execute the example codes to create the combination variables 'deep' and 'surf' as columns in `lrn14` +- Select the columns related to strategic learning from `lrn14` +- Create the combination variable 'stra' as a column in `lrn14` + +Hints: +- Columns related to strategic learning are in the object `strategic_questions`. Use it for selecting the correct columns. +- Use the function `rowMeans()` identically to the examples + +### R code +```{r} +# Work with the exercise in this chunk, step-by-step. Fix the R code! 
+# lrn14 is available + +# Access the dplyr library +library(dplyr) + +# questions related to deep, surface and strategic learning +deep_questions <- c("D03", "D11", "D19", "D27", "D07", "D14", "D22", "D30","D06", "D15", "D23", "D31") +surface_questions <- c("SU02","SU10","SU18","SU26", "SU05","SU13","SU21","SU29","SU08","SU16","SU24","SU32") +strategic_questions <- c("ST01","ST09","ST17","ST25","ST04","ST12","ST20","ST28") + + +# select the columns related to deep learning +deep_columns <- select(lrn14, one_of(deep_questions)) +# and create column 'deep' by averaging +lrn14$deep <- rowMeans(deep_columns) + +# select the columns related to surface learning +surface_columns <- select(lrn14, one_of(surface_questions)) +# and create column 'surf' by averaging +lrn14$surf <- rowMeans(surface_columns) + +# select the columns related to strategic learning +strategic_columns <- select(lrn14, one_of(strategic_questions)) +# and create column 'stra' by averaging +lrn14$stra <- rowMeans(strategic_columns) + +``` + + +## 2.4 Selecting columns + +Often it is convenient to work with only a certain column or a subset of columns of a bigger data frame. There are many ways to select columns of data frame in R and you saw one of them in the previous exercise: `select()` from **dplyr***. + +**dplyr** is a popular library for *data wrangling*. There are also convenient [data wrangling cheatsheets by RStudio](https://www.rstudio.com/resources/cheatsheets/) to help you get started (dplyr, tidyr etc.) + +```{r, echo=FALSE} +# Pre-exercise-code (Run this code chunk first! Do NOT edit it.) + +# Click the green arrow ("Run Current Chunk") in the upper-right corner of this chunk. This will initialize the R objects needed in the exercise. Then move to Instructions of the exercise to start working. 
+ +lrn14 <- read.table("http://www.helsinki.fi/~kvehkala/JYTmooc/JYTOPKYS3-data.txt", sep="\t", header=TRUE) +lrn14$attitude <- lrn14$Attitude / 10 +deep_questions <- c("D03", "D11", "D19", "D27", "D07", "D14", "D22", "D30","D06", "D15", "D23", "D31") +lrn14$deep <- rowMeans(lrn14[, deep_questions]) +surface_questions <- c("SU02","SU10","SU18","SU26", "SU05","SU13","SU21","SU29","SU08","SU16","SU24","SU32") +lrn14$surf <- rowMeans(lrn14[, surface_questions]) +strategic_questions <- c("ST01","ST09","ST17","ST25","ST04","ST12","ST20","ST28") +lrn14$stra <- rowMeans(lrn14[, strategic_questions]) +``` + + +### Instructions +- Access the **dplyr** library +- Create object `keep_columns` +- Use `select()` (possibly together with `one_of()`) to create a new data frame `learning2014` with the columns named in `keep_columns`. +- Look at the structure of the new dataset + +Hint: +- See the previous exercise or the data wrangling cheatsheet for help on how to select columns + +### R code +```{r} +# Work with the exercise in this chunk, step-by-step. Fix the R code! + +# lrn14 is available + +# access the dplyr library +library(dplyr) + +# choose a handful of columns to keep +keep_columns <- c("gender","Age","attitude", "deep", "stra", "surf", "Points") + +# select the 'keep_columns' to create a new dataset +learning2014 <- select(lrn14,all_of(keep_columns)) + +# see the structure of the new dataset + +print(learning2014) +``` +## 2.5 Modifying column names + +Sometimes you want to rename your column. You could do this by creating copies of the columns with new names, but you can also directly get and set the column names of a data frame, using the function `colnames()`. + +The **dplyr** library has a `rename()` function, which can also be used. Remember [the cheatsheets](https://www.rstudio.com/resources/cheatsheets/). 
+ +### Instructions +- Print out the column names of `learning2014` +- Change the name of the second column to 'age' +- Change the name of 'Points' to 'points' +- Print out the column names again to see the changes + +Hint: +- You can use `colnames()` similarily to the example. Which index matches the column 'Points'? + +### R code + +```{r} +print(names(learning2014)) +colnames(learning2014)[2] <- "age" +learning2014 <- rename(learning2014, points = Points) +``` + +```{r} +print(dim(learning2014)) #check the dimension now (must have 166 rown and 7) +``` + +## 2.6 Excluding observations + +Often your data includes outliers or other observations which you wish to remove before further analysis. Or perhaps you simply wish to work with some subset of your data. + +In the **learning2014** data the variable 'points' denotes the students exam points in a statistics course exam. If the student did not attend an exam, the value of 'points' will be zero. We will remove these observations from the data. + +### R code +```{r, echo=FALSE} +learning2014 <- learning2014[learning2014$points > 0,] +``` + +### Instructions +- Access the **dplyr** library +- As an example, create object `male_students` by selecting the male students from `learning2014` +- Override `learning2014` and select rows where the 'points' variable is greater than zero. +- If you do not remember how logical comparison works in R, see the 'Logical comparison' exercise from the course 'R Short and Sweet'. + +Hint: +- The "greater than" logical operator is `>` + +```{r} +dim(lrn14) +dim(learning2014) + +#Export csv file +setwd("~/Documents/GitHub/IODS-project") +write_csv(learning2014, 'learning2014.csv') + +``` + +## 2.7 Visualizations with ggplot2 + +[**ggplot2**](http://ggplot2.org/) is a popular library for creating stunning graphics with R. It has some advantages over the basic plotting system in R, mainly consistent use of function arguments and flexible plot alteration. 
ggplot2 is an implementation of Leland Wilkinson's *Grammar of Graphics* — a general scheme for data visualization. + +In ggplot2, plots may be created via the convenience function `qplot()` where arguments and defaults are meant to be similar to base R's `plot()` function. More complex plotting capacity is available via `ggplot()`, which exposes the user to more explicit elements of the grammar. (from [wikipedia](https://en.wikipedia.org/wiki/Ggplot2)) + +RStudio has a [cheatsheet](https://www.rstudio.com/resources/cheatsheets/) for data visualization with ggplot2. + +```{r, echo=FALSE} +# Pre-exercise-code (Run this code chunk first! Do NOT edit it.) + +# Click the green arrow ("Run Current Chunk") in the upper-right corner of this chunk. This will initialize the R objects needed in the exercise. Then move to Instructions of the exercise to start working. + +learning2014 <- read.table("https://raw.githubusercontent.com/KimmoVehkalahti/Helsinki-Open-Data-Science/master/datasets/learning2014.txt", + sep = ",", header = T) +``` + +### Instructions + +- Access the **ggplot2** library +- Initialize the plot with data and aesthetic mappings +- Adjust the plot initialization: Add an aesthetic element to the plot by defining `col = gender` inside `aes()`. +- Define the visualization type (points) +- Draw the plot to see how it looks at this point +- *Add* a regression line to the plot +- *Add* the title "Student's attitude versus exam points" with `ggtitle("")` to the plot with regression line +- Draw the plot again to see the changes + +Hints: +- Use `+` to add the title to the plot +- The plot with regression line is saved in the object `p3` +- You can draw the plot by typing the object name where the plot is saved + +### R code +```{r} +# Work with the exercise in this chunk, step-by-step. Fix the R code! 
# learning2014 is available

# Access the ggplot2 library
library(ggplot2)

# initialize plot with data and aesthetic mapping, colouring the points by gender
p1 <- ggplot(learning2014, aes(x = attitude, y = points, col = gender))

# define the visualization type (points)
p2 <- p1 + geom_point()

# draw the plot
p2

# add a regression line and a title
p3 <- p2 + geom_smooth(method = "lm") + ggtitle("Student's attitude versus exam points")

# draw the plot
p3

# Let's try an overview summary with ggpairs() (wrap() and ggpairs() come from GGally)
library(GGally)
p <- ggpairs(learning2014, mapping = aes(col = gender, alpha = 0.3), lower = list(combo = wrap("facethist", bins = 20)))

# draw the plot!
p

```

## 2.8 Exploring a data frame

Often the most interesting features of your data are the relationships between the variables. If there are only a handful of variables saved as columns in a data frame, it is possible to visualize all of these relationships neatly in a single plot.

Base R offers a fast plotting function `pairs()`, which draws all possible scatter plots from the columns of a data frame, resulting in a scatter plot matrix. Libraries **GGally** and **ggplot2** together offer a slow but more detailed look at the variables, their distributions and relationships.

```{r, echo=FALSE}
# Pre-exercise-code (Run this code chunk first! Do NOT edit it.)

# Click the green arrow ("Run Current Chunk") in the upper-right corner of this chunk. This will initialize the R objects needed in the exercise. Then move to Instructions of the exercise to start working.

learning2014 <- read.table("https://raw.githubusercontent.com/KimmoVehkalahti/Helsinki-Open-Data-Science/master/datasets/learning2014.txt",
                           sep = ",", header = T)
```

### Instructions

- Draw a scatter matrix of the variables in learning2014 (other than gender)
- Adjust the code: Add the argument `col` to the `pairs()` function, defining the colour with the 'gender' variable in learning2014.
- Draw the plot again to see the changes.
- Access the **ggplot2** and **GGally** libraries and create the plot `p` with `ggpairs()`.
- Draw the plot.
Note that the function is a bit slow.
- Adjust the argument `mapping` of `ggpairs()` by defining `col = gender` inside `aes()`.
- Draw the plot again.
- Adjust the code a little more: add another aesthetic element `alpha = 0.3` inside `aes()`.
- See the difference between the plots?

Hints:
- You can use `$` to access a column of a data frame.
- Remember to separate function arguments with a comma
- You can draw the plot `p` by simply typing its name: just like printing R objects.

### R code
```{r}
# Work with the exercise in this chunk, step-by-step. Fix the R code!
# learning2014 is available

# draw a scatter plot matrix of the variables in learning2014.
# [-1] excludes the first column (gender), which is used for the colours instead
pairs(learning2014[-1], col = learning2014$gender)

# access the GGally and ggplot2 libraries
library(GGally)
library(ggplot2)

# create a more advanced plot matrix with ggpairs()
p <- ggpairs(learning2014, mapping = aes(col = gender, alpha = 0.3), lower = list(combo = wrap("facethist", bins = 20)))

# draw the plot
p

```


## 2.9 Simple regression

[Regression analysis](https://en.wikipedia.org/wiki/Regression_analysis) with R is easy once you have your data in a neat data frame. You can simply use the `lm()` function to fit a linear model. The first argument of `lm()` is a `formula`, which defines the target variable and the explanatory variable(s).

The formula should be `y ~ x`, where `y` is the target (or outcome) variable and `x` the explanatory variable (predictor). The second argument of `lm()` is `data`, which should be a data frame where `y` and `x` are columns.

The output of `lm()` is a linear model object, which can be saved for later use. The generic function `summary()` can be used to print out a summary of the model.

```{r, echo=FALSE}
# Pre-exercise-code (Run this code chunk first! Do NOT edit it.)

# Click the green arrow ("Run Current Chunk") in the upper-right corner of this chunk. This will initialize the R objects needed in the exercise.
Then move to Instructions of the exercise to start working.

learning2014 <- read.table("https://raw.githubusercontent.com/KimmoVehkalahti/Helsinki-Open-Data-Science/master/datasets/learning2014.txt",
                           sep = ",", header = T)
```

### Instructions

- Create a scatter plot of 'points' versus 'attitude'.
- Fit a regression model where 'points' is the target and 'attitude' is the explanatory variable
- Print out the summary of the linear model object

Hints:
- Replace `1` with the name of the explanatory variable in the formula inside `lm()`
- Use `summary()` on the model object to print out a summary

### R code
```{r}
# Work with the exercise in this chunk, step-by-step. Fix the R code!
# learning2014 is available

# a scatter plot of points versus attitude
library(ggplot2)
qplot(attitude, points, data = learning2014) + geom_smooth(method = "lm")

# fit a linear model: 'points' is the target, 'attitude' the explanatory variable
my_model <- lm(points ~ attitude, data = learning2014)

# print out a summary of the model
summary(my_model)

```


## 2.10 Multiple regression

When there is more than one explanatory variable in the linear model, it is called multiple regression. In R, it is easy to include more than one explanatory variable in your linear model. This is done by simply defining more explanatory variables with the `formula` argument of `lm()`, as below

```
y ~ x1 + x2 + ..
```
Here `y` is again the target variable and `x1, x2, ..` are the explanatory variables.

```{r, echo=FALSE}
# Pre-exercise-code (Run this code chunk first! Do NOT edit it.)

# Click the green arrow ("Run Current Chunk") in the upper-right corner of this chunk. This will initialize the R objects needed in the exercise. Then move to Instructions of the exercise to start working.
+ +learning2014 <- read.table("https://raw.githubusercontent.com/KimmoVehkalahti/Helsinki-Open-Data-Science/master/datasets/learning2014.txt", + sep = ",", header = T) +``` + +### Instructions + +- Draw a plot matrix of the learning2014 data with `ggpairs()` +- Fit a regression model where `points` is the target variable and both `attitude` and `stra` are the explanatory variables. +- Print out a summary of the model. +- Adjust the code: Add one more explanatory variable to the model. Based on the plot matrix, choose the variable with the third highest (absolute) correlation with the target variable and use that as the third variable. +- Print out a summary of the new model. + +Hint: +- The variable with the third highest absolute correlation with `points` is `surf`. + +### R code +```{r} +# Work with the exercise in this chunk, step-by-step. Fix the R code! +# learning2014 is available + +library(GGally) +library(ggplot2) +# create an plot matrix with ggpairs() +ggpairs(learning2014, lower = list(combo = wrap("facethist", bins = 20))) + +# create a regression model with multiple explanatory variables +my_model2 <- lm(points ~ attitude + stra, data = learning2014) + +# print out a summary of the model +summary(my_model2) + +``` + + +## 2.11 Graphical model validation + +R makes it easy to graphically explore the validity of your model assumptions. If you give a linear model object as the first argument to the `plot()` function, the function automatically assumes you want diagnostic plots and will produce them. You can check the help page of plotting an lm object by typing `?plot.lm` or `help(plot.lm)` to the R console. + +In the plot function you can then use the argument `which` to choose which plots you want. 
`which` must be an integer vector corresponding to the following list of plots: + +which | graphic +----- | -------- +1 | Residuals vs Fitted values +2 | Normal QQ-plot +3 | Standardized residuals vs Fitted values +4 | Cook's distances +5 | Residuals vs Leverage +6 | Cook's distance vs Leverage + +
We will focus on plots 1, 2 and 5: Residuals vs Fitted values, Normal QQ-plot and Residuals vs Leverage.

```{r, echo=FALSE}
# Pre-exercise-code (Run this code chunk first! Do NOT edit it.)

# Click the green arrow ("Run Current Chunk") in the upper-right corner of this chunk. This will initialize the R objects needed in the exercise. Then move to Instructions of the exercise to start working.

learning2014 <- read.table("https://raw.githubusercontent.com/KimmoVehkalahti/Helsinki-Open-Data-Science/master/datasets/learning2014.txt",
                           sep = ",", header = T)
```

### Instructions

- Create the linear model object `my_model2`
- Produce the following diagnostic plots using the `plot()` function: Residuals vs Fitted values, Normal QQ-plot and Residuals vs Leverage using the argument `which`.
- Before the call to the `plot()` function, add the following: `par(mfrow = c(2,2))`. This will place the following 4 graphics to the same plot. Execute the code again to see the effect.

Hint:
- You can combine integers to an integer vector with `c()`. For example: `c(1,2,3)`.

### R code
```{r}
# Work with the exercise in this chunk, step-by-step. Fix the R code!
# learning2014 is available

# create a regression model with multiple explanatory variables
my_model2 <- lm(points ~ attitude + stra, data = learning2014)

# place the next plots into a 2 x 2 grid in the same figure
par(mfrow = c(2,2))

# draw diagnostic plots using the plot() function. Choose the plots 1, 2 and 5
plot(my_model2, which = c(1, 2, 5))

```


## 2.12 Making predictions

Okay, so let's assume that we have a linear model which seems to fit our standards. What can we do with it?

The model quantifies the relationship between the explanatory variable(s) and the dependent variable. The model can also be used for predicting the dependent variable based on new observations of the explanatory variable(s).
+ +In R, predicting can be done using the `predict()` function. (see `?predict`). The first argument of predict is a model object and the argument `newdata` (a data.frame) can be used to make predictions based on new observations. One or more columns of `newdata` should have the same name as the explanatory variables in the model object. + +```{r, echo=FALSE} +# Pre-exercise-code (Run this code chunk first! Do NOT edit it.) + +# Click the green arrow ("Run Current Chunk") in the upper-right corner of this chunk. This will initialize the R objects needed in the exercise. Then move to Instructions of the exercise to start working. + +learning2014 <- read.table("https://raw.githubusercontent.com/KimmoVehkalahti/Helsinki-Open-Data-Science/master/datasets/learning2014.txt", + sep = ",", header = T) +``` + +### Instructions + +- Create object `m` and print out a summary of the model +- Create object `new_attitudes` +- Adjust the code: Create a new data frame with a column named 'attitude' holding the new attitudes defined in `new_attitudes` +- Print out the new data frame +- `predict()` the new student's exam points based on their attitudes, using the `newdata` argument + +Hints: +- Type `attitude = new_attitudes` inside the `data.frame()` function. +- Give the `new_data` data.frame as the `newdata` argument for `predict()` + +### R code +```{r} +# Work with the exercise in this chunk, step-by-step. Fix the R code! 
+# learning2014 is available + +# Create model object m +m <- lm(points ~ attitude, data = learning2014) + +# print out a summary of the model +summary(m) + +# New observations +new_attitudes <- c("Mia" = 3.8, "Mike"= 4.4, "Riikka" = 2.2, "Pekka" = 2.9) +new_data <- data.frame(attitude = new_attitudes) + +# Print out the new data +summary(new_data) + +# Predict the new students exam points based on attitude +predict(m, newdata = new_data) + + +``` + +**Awesome work!** + + + diff --git a/Exercises/Exercise3.Rmd b/Exercises/Exercise3.Rmd new file mode 100644 index 000000000..668110817 --- /dev/null +++ b/Exercises/Exercise3.Rmd @@ -0,0 +1,745 @@ + +## 3.0 Installing packages + +```{r} +install.packages("boot") +install.packages("readr") +``` + + +## 3.1 More datasets + +We will be combining, wrangling and analysing two new data sets retrieved from the [UCI Machine Learning Repository](https://archive.ics.uci.edu/ml/datasets.html), a great source for open data. + +The data are from two identical questionnaires related to secondary school student alcohol consumption in Portugal. Read about the data and the variables [here](https://archive.ics.uci.edu/ml/datasets/Student+Performance). + +R offers the convenient `paste()` function which makes it easy to combine characters. Let's utilize it to get our hands on the data! + +```{r, echo=FALSE} +# Pre-exercise-code (Run this code chunk first! Do NOT edit it.) + +# Click the green arrow ("Run Current Chunk") in the upper-right corner of this chunk. This will initialize the R objects needed in the exercise. Then move to Instructions of the exercise to start working. + +url <- "https://raw.githubusercontent.com/KimmoVehkalahti/Helsinki-Open-Data-Science/master/datasets/" +``` + +### Instructions +- Create and print out the object `url_math`. +- Create object `math` by reading the math class questionaire data from the web address defined in `url_math`. +- Create and print out `url_por`. 
- Adjust the code: similarly to `url_math`, make `url_por` into a valid web address using `paste()` and the `url` object.
- Create object `por` by reading the Portuguese class questionnaire data from the web address defined in `url_por`.
- Print out the names of the columns in both data sets.

Hint:
- You can see the `paste()` functions help page with `?paste` or `help(paste)`

### R code
```{r}
# This is a code chunk in RStudio editor.
# Work with the exercise in this chunk, step-by-step. Fix the R code!

url <- "https://raw.githubusercontent.com/KimmoVehkalahti/Helsinki-Open-Data-Science/master/datasets/"

# web address for math class data
# 'url' already ends in "/", so paste with an empty separator
url_math <- paste(url, "student-mat.csv", sep = "")

# print out the address
url_math

# read the math class questionnaire data into memory
math <- read.table(url_math, sep = ";", header = TRUE)

# web address for Portuguese class data
url_por <- paste(url, "student-por.csv", sep = "")

# print out the address
url_por

# read the Portuguese class questionnaire data into memory
por <- read.table(url_por, sep = ";", header = TRUE)

# look at the column names of both data sets
colnames(math); colnames(por)


```


## 3.2 Joining two datasets

There are multiple students who have answered both questionnaires in our two datasets. Unfortunately we do not have a single identification variable to identify these students. However, we can use a bunch of background questions together for identification.

Combining two data sets is easy if the data have a mutual identifier column or if a combination of mutual columns can be used as identifiers. (That's not always the case.)

Here we'll use `inner_join()` function from the dplyr package to combine the data - remember the [dplyr etc. cheatsheets!](https://www.rstudio.com/resources/cheatsheets/). This means that we'll only keep the students who answered the questionnaire in both math and Portuguese classes.
+ +```{r, echo=FALSE} +# Pre-exercise-code (Run this code chunk first! Do NOT edit it.) + +# Click the green arrow ("Run Current Chunk") in the upper-right corner of this chunk. This will initialize the R objects needed in the exercise. Then move to Instructions of the exercise to start working. + +math <- read.table("https://raw.githubusercontent.com/KimmoVehkalahti/Helsinki-Open-Data-Science/master/datasets/student-mat.csv", sep=";", header=TRUE) +por <- read.table("https://raw.githubusercontent.com/KimmoVehkalahti/Helsinki-Open-Data-Science/master/datasets/student-por.csv", sep=";", header=TRUE) +``` + +### Instructions + +- Access the dplyr library and create the objects `free_cols` and `join_cols`: +- The first one (`free_cols`) will be a vector of the names of the six columns that vary in the data sets, namely: failures, paid, absences, G1, G2, and G3. +- The other one (`join_cols`) will be a vector of the names of the rest of the variables; use the handy `setdiff()` function to obtain that vector, based on the `free_cols`. +- Adjust the code: define the argument `by` in the `inner_join()` function to join the `math` and `por` data frames. Use the columns defined in `join_cols`. +- Print out the column names of the joined data set. +- Adjust the code again: add the argument `suffix` to `inner_join()` and give it a vector of two strings: ".math" and ".por". +- Join the data sets again and print out the new column names. +- Use the `glimpse()` function (from dplyr) to look at the joined data. Which data types are present? + +Hints: +- You can create a vector with `c()`. Comma will separate the values. +- Remember to use quotes when creating string or character objects. + +### R code +```{r} +# Work with the exercise in this chunk, step-by-step. Fix the R code! 
# math and por data sets are available

# access the dplyr package
library(dplyr)

# give the columns that vary in the two data sets
free_cols <- c("failures", "paid", "absences", "G1", "G2", "G3")
free_cols

# the rest of the columns are common identifiers used for joining the data sets
join_cols <- setdiff(colnames(por), free_cols)

# join the two data sets by the selected identifiers,
# suffixing the varying columns by their questionnaire of origin
math_por <- inner_join(math, por, by = join_cols, suffix = c(".math", ".por"))

# look at the column names of the joined data set
colnames(math_por)

# glimpse at the joined data set
glimpse(math_por)

```


## 3.3 The if-else structure

The `math_por` data frame now contains - in addition to the background variables used for joining `por` and `math` - two possibly different answers to the same questions for each student. To fix this, you'll use programming to combine these 'duplicated' answers by either:

- taking the rounded average (**if** the two variables are numeric)
- simply choosing the first answer (**else**).

You'll do this by using a combination of a `for`-loop and an `if`-`else` structure.

The `if()` function takes a single logical condition as an argument and performs an action only if that condition is true. `if` can then be combined with `else`, which handles the cases where the condition is false. Schematically:

```
if(condition) {
  do something
} else {
  do something else
}
```

```{r, echo=FALSE}
# Pre-exercise-code (Run this code chunk first! Do NOT edit it.)

# Click the green arrow ("Run Current Chunk") in the upper-right corner of this chunk. This will initialize the R objects needed in the exercise. Then move to Instructions of the exercise to start working.

library(dplyr)
math <- read.table("https://raw.githubusercontent.com/KimmoVehkalahti/Helsinki-Open-Data-Science/master/datasets/student-mat.csv", sep=";", header=TRUE)
por <- read.table("https://raw.githubusercontent.com/KimmoVehkalahti/Helsinki-Open-Data-Science/master/datasets/student-por.csv", sep=";", header=TRUE)
free_cols <- c("failures","paid","absences","G1","G2","G3")
join_cols <- setdiff(colnames(por), free_cols)
math_por <- inner_join(math, por, by = join_cols, suffix = c(".math", ".por"))
```

### Instructions
- Print out the column names of `math_por` (use `colnames()`)
- Adjust the code: Create the data frame `alc` by selecting only the columns in `math_por` which were used for joining the two questionnaires. The names of those columns are available in the `join_cols` object.
- Print out the object that you created earlier for the columns that varied (`free_cols`).
- Execute the `for` loop (don't mind the "change me!").
- Take a `glimpse()` at the `alc` data frame. As you can see, it's not ready yet...
- Adjust the code inside the `for` loop: if the first of the two selected columns is not numeric, add the first column to the `alc` data frame.
- Execute the modified `for` loop and `glimpse()` at the new data again.

Hint:
- Inside the for-loop, use `first_col` in exchange for the "change me!".

### R code
```{r}
# Work with the exercise in this chunk, step-by-step. Fix the R code!
# dplyr, math_por, join_cols are available

# print out the column names of 'math_por'
colnames(math_por)

# create a new data frame with only the joined columns
alc <- select(math_por, all_of(join_cols))

# print out the columns not used for joining (those that varied in the two data sets)
free_cols

# for every column name not used for joining...

for(col_name in free_cols) {
  # select two columns from 'math_por' with the same original name
  two_cols <- select(math_por, starts_with(col_name))
  # select the first column vector of those two columns
  first_col <- select(two_cols, 1)[[1]]

  # then, enter the if-else structure!
  # if that first column vector is numeric...
  if(is.numeric(first_col)) {
    # take a rounded average of each row of the two columns and
    # add the resulting vector to the alc data frame
    alc[col_name] <- round(rowMeans(two_cols))
  } else { # else (if the first column vector was not numeric)...
    # add the first column vector to the alc data frame
    alc[col_name] <- first_col
  }
}

# glimpse at the new combined data
glimpse(alc)

```


## 3.4 Mutations

Mutating a data frame means adding new variables as mutations of the existing ones. The `mutate()` function is also from the dplyr package, which belongs to the tidyverse packages. The tidyverse includes several packages that work well together, such as dplyr and ggplot2.

The tidyverse functions have a lot of similarities. For example, the first argument of the tidyverse functions is usually `data`. They also have other consistent features which makes them work well together and easy to use.

Let's now create some new variables in the joined data set!

```{r, echo=FALSE}
# Pre-exercise-code (Run this code chunk first! Do NOT edit it.)

# Click the green arrow ("Run Current Chunk") in the upper-right corner of this chunk. This will initialize the R objects needed in the exercise. Then move to Instructions of the exercise to start working.
+ +library(dplyr) +math <- read.table("https://raw.githubusercontent.com/KimmoVehkalahti/Helsinki-Open-Data-Science/master/datasets/student-mat.csv", sep=";", header=TRUE) +por <- read.table("https://raw.githubusercontent.com/KimmoVehkalahti/Helsinki-Open-Data-Science/master/datasets/student-por.csv", sep=";", header=TRUE) +free_cols <- c("failures","paid","absences","G1","G2","G3") +join_cols <- setdiff(colnames(por), free_cols) +math_por <- inner_join(math, por, by = join_cols, suffix = c(".math", ".por")) +alc <- select(math_por, all_of(join_cols)) +for(col_name in free_cols) { + two_cols <- select(math_por, starts_with(col_name)) + first_col <- select(two_cols, 1)[[1]] + if(is.numeric(first_col)) { + alc[col_name] <- round(rowMeans(two_cols)) + } else { + alc[col_name] <- first_col + } +} +library(ggplot2) +``` + +### Instructions +- Mutate `alc` by creating the new column `alc_use` by averaging weekday and weekend alcohol consumption. +- Draw a bar plot of `alc_use`. +- Define a new asthetic element to the bar plot of `alc_use` by defining `fill = sex`. Draw the plot again. +- Adjust the code: Mutate `alc` by creating a new column `high_use`, which is true if `alc_use` is greater than 2 and false otherwise. +- Initialize a ggplot object with `high_use` on the x-axis and then draw a bar plot. +- Add this element to the latter plot (using `+`): `facet_wrap("sex")`. + +Hints: +- Use the `>` operator to test if the values of a vector are greater than some value. +- Add the argument `aes(x = high_use)` to the `ggplot()` function to initialize a plot with `high_use` on the x-axis. + +### R code +```{r} +# Work with the exercise in this chunk, step-by-step. Fix the R code! 
# alc is available

# access the tidyverse packages dplyr and ggplot2
library(dplyr); library(ggplot2)

# define a new column alc_use by combining weekday and weekend alcohol use
alc <- mutate(alc, alc_use = (Dalc + Walc) / 2)

# initialize a plot of alcohol use, filling the bars by sex
g1 <- ggplot(data = alc, aes(x = alc_use, fill = sex))

# define the plot as a bar plot and draw it
g1 + geom_bar()

# define a new logical column 'high_use': TRUE when alc_use is greater than 2
alc <- mutate(alc, high_use = alc_use > 2)

# initialize a plot of 'high_use'
g2 <- ggplot(data = alc, aes(x = high_use))

# draw a bar plot of high_use by sex
g2 + geom_bar() + facet_wrap("sex")

```


## 3.5 So many plots

You are probably curious to find out how the distributions of some of the other variables in the data look like. Well, why don't we visualize all of them!

You'll also meet another new tidyverse toy, the pipe-operator: `%>%`.

The pipe (`%>%`) takes the result produced on its left side and uses it as the first argument in the function on its right side. Since the first argument of the tidyverse functions is usually `data`, this allows for some cool chaining of commands.

We'll look at `%>%` more closely in the next exercise. But now, let's draw some plots.

```{r, echo=FALSE}
# Pre-exercise-code (Run this code chunk first! Do NOT edit it.)

# Click the green arrow ("Run Current Chunk") in the upper-right corner of this chunk. This will initialize the R objects needed in the exercise. Then move to Instructions of the exercise to start working.

library(readr)
alc <- read_csv("https://raw.githubusercontent.com/KimmoVehkalahti/Helsinki-Open-Data-Science/master/datasets/alc.csv", show_col_types=FALSE)
```

### Instructions
- Access the tidyverse libraries tidyr, dplyr and ggplot2
- Take a glimpse at the `alc` data
- Apply the `gather()` function on `alc` and then take a glimpse at the resulting data directly after, utilizing the pipe (`%>%`). What does gather do?
+- Take a more detailed look similarly, but using the `View()` function, and browse the data. Can you now see better what happens when you use the `gather()` function? +- Draw a plot of each variable in the `alc` data by first changing the values into names-value pairs and then visualizing them with ggplot. Define the plots as bar plots by adding the element `geom_bar()`, (using `+`). + +Hint: +- Add the code `+ geom_bar()` to the line where the plots are drawn. + +### R code +```{r} +# Work with the exercise in this chunk, step-by-step. Fix the R code! +# alc is available + +# access the tidyverse libraries tidyr, dplyr, ggplot2 +library(tidyr); library(dplyr); library(ggplot2) + +# glimpse at the alc data + + +# use gather() to gather columns into key-value pairs and then glimpse() at the resulting data +gather(alc) %>% glimpse + +# it may help to take a closer look by View() and browse the data +gather(alc) %>% View + +# draw a bar plot of each variable +gather(alc) %>% ggplot(aes(value)) + facet_wrap("key", scales = "free") + + +``` + + +## 3.6 The pipe: summarising by group + +The pipe operator, `%>%`, takes the result of the left-hand side and uses it as the first argument of the function on the right-hand side. For example: + +``` +1:10 %>% mean() # result: 5.5 +``` + +The parentheses of the 'target' function (here mean) can be dropped unless one wants to specify more arguments for it. + +``` +1:10 %>% mean # result: 5.5 +``` + +Chaining operations with the pipe is great fun, so let's try it! + +Utilizing the pipe, you'll apply the functions `group_by()` and `summarise()` on your data. The first one splits the data to groups according to a grouping variable (a factor, for example). The latter can be combined with any summary function such as `mean()`, `min()`, `max()` to summarize the data. + +```{r, echo=FALSE} +# Pre-exercise-code (Run this code chunk first! Do NOT edit it.) 
+ +# Click the green arrow ("Run Current Chunk") in the upper-right corner of this chunk. This will initialize the R objects needed in the exercise. Then move to Instructions of the exercise to start working. + +library(readr) +alc <- read_csv("https://raw.githubusercontent.com/KimmoVehkalahti/Helsinki-Open-Data-Science/master/datasets/alc.csv", show_col_types=FALSE) +``` + +### Instructions +- Access the tidyverse libraries dplyr and ggplot2 +- Execute the sample code to see the counts of males and females in the data +- Adjust the code to calculate means of the grades of the students: inside `summarise()`, after the definition of `count`, define `mean_grade` by using `mean()` on the variable `G3`. +- Adjust the code: After `sex`, add `high_use` as another grouping variable. Execute the code again. + +Hints: +- Remember to separate inputs inside functions with a comma. Here the first input of `summarise` is `count`, and the second one should be `mean_grade`. +- Also separate the grouping variables with a comma. + +### R code +```{r} +# Work with the exercise in this chunk, step-by-step. Fix the R code! +# alc is available + +# access the tidyverse libraries dplyr and ggplot2 +library(dplyr); library(ggplot2) + +# produce summary statistics by group +alc %>% group_by(sex) %>% summarise(count = n()) + + +``` + + +## 3.7 Box plots by groups + +[Box plots](https://en.wikipedia.org/wiki/Box_plot) are an excellent way of displaying and comparing distributions. A box plot visualizes the 25th, 50th and 75th percentiles (the box), the typical range (the whiskers) and the outliers of a variable. + +The whiskers extending from the box can be computed by several techniques. 
The default (in base R and ggplot) is to extend them to reach to a data point that is no more than 1.5*IQR away from the box, where IQR is the inter quartile range defined as + +`IQR = 75th percentile - 25th percentile` + +Values outside the whiskers can be considered as outliers, unusually distant observations. For more information on IQR, see [wikipedia](https://en.wikipedia.org/wiki/Interquartile_range), for example. + +```{r, echo=FALSE} +# Pre-exercise-code (Run this code chunk first! Do NOT edit it.) + +# Click the green arrow ("Run Current Chunk") in the upper-right corner of this chunk. This will initialize the R objects needed in the exercise. Then move to Instructions of the exercise to start working. + +library(readr) +alc <- read_csv("https://raw.githubusercontent.com/KimmoVehkalahti/Helsinki-Open-Data-Science/master/datasets/alc.csv", show_col_types=FALSE) +``` + +### Instructions + +- Initialize a plot of student grades (`G3`), with `high_use` grouping the grade distributions on the x-axis. Draw the plot as a box plot. +- Add an aesthetic element to the plot by defining `col = sex` inside `aes()` +- Define a similar (box) plot of the variable `absences` grouped by `high_use` on the x-axis and the aesthetic `col = sex`. +- Add a main title to the last plot with `ggtitle("title here")`. Use "Student absences by alcohol consumption and sex" as a title, for example. +- Does high use of alcohol have a connection to school absences? + +Hints: +- In ggplot, you can add stuff to the initialized plot with the `+` operator, e.g. `+ ylab("text here")`. Same goes for titles. + +### R code +```{r} +# Work with the exercise in this chunk, step-by-step. Fix the R code! 
+library(ggplot2) + +# initialize a plot of high_use and G3 +g1 <- ggplot(alc, aes(x = high_use, y = G3)) + +# define the plot as a boxplot and draw it +g1 + geom_boxplot() + ylab("grade") + +# initialize a plot of high_use and absences + + +# define the plot as a box plot and draw it + + +``` + + +## 3.8 Learning a logistic regression model + +We will now use [logistic regression](https://en.wikipedia.org/wiki/Logistic_regression) to identify factors related to higher than average student alcohol consumption. You will also attempt to learn to identify (predict) students who consume high amounts of alcohol using background variables and school performance. + +Because logistic regression can be used to classify observations into one of two groups (by giving the group probability) it is a [binary classification](https://en.wikipedia.org/wiki/Binary_classification) method. You will meet more classification methods in the next week's exercises. + +```{r, echo=FALSE} +# Pre-exercise-code (Run this code chunk first! Do NOT edit it.) + +# Click the green arrow ("Run Current Chunk") in the upper-right corner of this chunk. This will initialize the R objects needed in the exercise. Then move to Instructions of the exercise to start working. + +library(readr) +alc <- read_csv("https://raw.githubusercontent.com/KimmoVehkalahti/Helsinki-Open-Data-Science/master/datasets/alc.csv", show_col_types=FALSE) +library(dplyr) +``` + +### Instructions + +- Use `glm()` to fit a logistic regression model with `high_use` as the target variable and `failures` and `absences` as the predictors. +- Print out a summary of the model. +- Add another explanatory variable to the model after absences: 'sex'. Repeat the above. +- Use `coef()` on the model object to print out the coefficients of the model. + +Hint: +- Use the `summary()` function to print out a summary. + +### R code +```{r} +# Work with the exercise in this chunk, step-by-step. Fix the R code! 
+# alc is available + +# find the model with glm() +m <- glm(high_use ~ failures + absences, data = alc, family = "binomial") + +# print out a summary of the model + + +# print out the coefficients of the model + + +``` + + +## 3.9 From coefficients to odds ratios + +From the fact that the computational target variable in the logistic regression model is the log of odds, it follows that applying the exponent function to the modeled values gives the odds: + +$$\exp \left( log\left( \frac{p}{1 - p} \right) \right) = \frac{p}{1 - p}.$$ + +For this reason, the exponents of the coefficients of a logistic regression model can be interpreted as odds ratios between a unit change (vs. no change) in the corresponding explanatory variable. + +```{r, echo=FALSE} +# Pre-exercise-code (Run this code chunk first! Do NOT edit it.) + +# Click the green arrow ("Run Current Chunk") in the upper-right corner of this chunk. This will initialize the R objects needed in the exercise. Then move to Instructions of the exercise to start working. + +library(readr) +alc <- read_csv("https://raw.githubusercontent.com/KimmoVehkalahti/Helsinki-Open-Data-Science/master/datasets/alc.csv", show_col_types=FALSE) +library(dplyr) +``` + +### Instructions + +- Use `glm()` to fit a logistic regression model. +- Create the object `OR`: Use `coef()` on the model object to extract the coefficients of the model and then apply the `exp` function on the coefficients. +- Use `confint()` on the model object to compute confidence intervals for the coefficients. Exponentiate the values and assign the results to the object `CI`. (R does this quite fast, despite the "Waiting.." message) +- Combine and print out the odds ratios and their confidence intervals. Which predictor has the widest interval? Does any of the intervals contain 1 and why would that matter? + +Hints: +- You can get the confidence intervals with `confint(*model_object*)` +- The logistic regression model is saved in the object `m`. 
+- `coef(m) %>% exp` is the same as `exp( coef(m) )` +- You get odds ratios by exponentiating the logistic regression coefficients. + +### R code +```{r} +# Work with the exercise in this chunk, step-by-step. Fix the R code! +# alc is available + +# find the model with glm() +m <- glm(high_use ~ failures + absences + sex, data = alc, family = "binomial") + +# compute odds ratios (OR) +OR <- coef(m) %>% exp + +# compute confidence intervals (CI) + + +# print out the odds ratios with their confidence intervals +cbind(OR, CI) + + +``` + + +## 3.10 Binary predictions (1) + +When you have a linear model, you can make predictions. A very basic question is, of course, how well does our model actually predict the target variable. Let's take a look! + +The `predict()` function can be used to make predictions with a model object. If `predict()` is not given any new data, it will use the data used for finding (fitting, learning, training) the model to make predictions. + +In the case of a binary response variable, the 'type' argument of `predict()` can be used to get the predictions as probabilities (instead of log of odds, the default). + +```{r, echo=FALSE} +# Pre-exercise-code (Run this code chunk first! Do NOT edit it.) + +# Click the green arrow ("Run Current Chunk") in the upper-right corner of this chunk. This will initialize the R objects needed in the exercise. Then move to Instructions of the exercise to start working. + +library(readr) +alc <- read_csv("https://raw.githubusercontent.com/KimmoVehkalahti/Helsinki-Open-Data-Science/master/datasets/alc.csv", show_col_types=FALSE) +library(dplyr) +``` + +### Instructions + +- Fit the logistic regression model with `glm()`. +- Create object `probabilities` by using `predict()` on the model object. +- Mutate the alc data: add a column 'probability' with the predicted probabilities. +- Mutate the data again: add a column 'prediction' which is true if the value of 'probability' is greater than 0.5. 
+- Look at the last ten observations of the data, along with the predictions. +- Use `table()` to create a cross table of the columns 'high_use' versus 'prediction' in `alc`. This is sometimes called a 'confusion matrix'. + +Hints: +- Remember to use the [dplyr cheat sheet](https://www.rstudio.com/wp-content/uploads/2015/02/data-wrangling-cheatsheet.pdf) by RStudio +- The `$` sign can be used to access columns of a data frame + +### R code +```{r} +# Work with the exercise in this chunk, step-by-step. Fix the R code! +# alc is available + +# fit the model +m <- glm(high_use ~ failures + absences + sex, data = alc, family = "binomial") + +# predict() the probability of high_use +probabilities <- predict(m, type = "response") + +library(dplyr) +# add the predicted probabilities to 'alc' +alc <- mutate(alc, probability = probabilities) + +# use the probabilities to make a prediction of high_use +alc <- mutate(alc, prediction = "change me!") + +# see the last ten original classes, predicted probabilities, and class predictions +select(alc, failures, absences, sex, high_use, probability, prediction) %>% tail(10) + +# tabulate the target variable versus the predictions +table(high_use = alc$high_use, prediction = "change me!") + + +``` + + +## 3.11 Binary predictions (2) + +Let's continue to explore the predictions of our model. + +```{r, echo=FALSE} +# Pre-exercise-code (Run this code chunk first! Do NOT edit it.) + +# Click the green arrow ("Run Current Chunk") in the upper-right corner of this chunk. This will initialize the R objects needed in the exercise. Then move to Instructions of the exercise to start working. 
+ +library(readr) +alc <- read_csv("https://raw.githubusercontent.com/KimmoVehkalahti/Helsinki-Open-Data-Science/master/datasets/alc.csv", show_col_types=FALSE) +library(dplyr) +m <- glm(high_use ~ sex + failures + absences, data = alc, family = "binomial") +alc <- mutate(alc, probability = predict(m, type = "response")) +alc <- mutate(alc, prediction = probability > 0.5) +``` + +### Instructions + +- Initialize the ggplot object and define `probability` as the x axis and `high_use` as the y axis. +- Use `geom_point()` to draw the plot. +- Add the aesthetic element `col = prediction` and draw the plot again. +- Use `table()` to create a cross table of 'high_use' versus 'prediction' +- Adjust the code: Use `%>%` to apply the `prop.table()` function on the output of `table()` +- Adjust the code: Use `%>%` to apply the `addmargins()` function on the output of `prop.table()` + +Hint: +- Recall that the pipe (`%>%`) assigns the output of the function on its left side to the function on its right side. The idea is to chain the three commands `table`, `prop.table` and `addmargins` (in that order). + +### R code +```{r} +# Work with the exercise in this chunk, step-by-step. Fix the R code! +# alc is available + +# access dplyr and ggplot2 +library(dplyr); library(ggplot2) + +# initialize a plot of 'high_use' versus 'probability' in 'alc' +g <- ggplot(alc, aes(x = "change me!", y = "change me!")) + +# define the geom as points and draw the plot + + +# tabulate the target variable versus the predictions +table(high_use = alc$high_use, prediction = alc$prediction) + + +``` + + +## 3.12 Accuracy and loss functions + +A simple measure of performance in binary classification is accuracy: the average number of correctly classified observations. + +Classification methods such as logistic regression aim to (approximately) minimize the incorrectly classified observations. 
The mean of incorrectly classified observations can be thought of as a penalty (loss) function for the classifier. Less penalty = good. + +Since we know how to make predictions with our model, we can also compute the average number of incorrect predictions. + +```{r, echo=FALSE} +# Pre-exercise-code (Run this code chunk first! Do NOT edit it.) + +# Click the green arrow ("Run Current Chunk") in the upper-right corner of this chunk. This will initialize the R objects needed in the exercise. Then move to Instructions of the exercise to start working. + +library(readr) +alc <- read_csv("https://raw.githubusercontent.com/KimmoVehkalahti/Helsinki-Open-Data-Science/master/datasets/alc.csv", show_col_types=FALSE) +library(dplyr) +m <- glm(high_use ~ sex + failures + absences, data = alc, family = "binomial") +alc <- mutate(alc, probability = predict(m, type = "response")) +alc <- mutate(alc, prediction = probability > 0.5) +``` + +### Instructions + +- Define the loss function `loss_func` +- Execute the call to the loss function with `prob = 0`, meaning you define the probability of `high_use` as zero for each individual. What is the interpretation of the resulting proportion? +- Adjust the code: change the `prob` argument in the loss function to `prob = 1`. What kind of a prediction does this equal to? What is the interpretation of the resulting proportion? +- Adjust the code again: change the `prob` argument by giving it the prediction probabilities in `alc` (the column `probability`). What is the interpretation of the resulting proportion? + +Hints: +- Select the whole code of `loss_func` to execute it. +- You can access the `probability` column in `alc` by using the `$`-mark. + +### R code +```{r} +# Work with the exercise in this chunk, step-by-step. Fix the R code! 
+# the logistic regression model m and dataset alc with predictions are available + +# define a loss function (mean prediction error) +loss_func <- function(class, prob) { + n_wrong <- abs(class - prob) > 0.5 + mean(n_wrong) +} + +# call loss_func to compute the average number of wrong predictions in the (training) data +loss_func(class = alc$high_use, prob = 0) + + +``` + + +## 3.13 Cross-validation + +[Cross-validation](https://en.wikipedia.org/wiki/Cross-validation_(statistics)) is a method of testing a predictive model on unseen data. In cross-validation, the value of a penalty (loss) function (mean prediction error) is computed on data not used for finding the model. Low value = good. + +Cross-validation gives a good estimate of the actual predictive power of the model. It can also be used to compare different models or classification methods. + +```{r, echo=FALSE} +# Pre-exercise-code (Run this code chunk first! Do NOT edit it.) + +# Click the green arrow ("Run Current Chunk") in the upper-right corner of this chunk. This will initialize the R objects needed in the exercise. Then move to Instructions of the exercise to start working. + +library(readr) +alc <- read_csv("https://raw.githubusercontent.com/KimmoVehkalahti/Helsinki-Open-Data-Science/master/datasets/alc.csv", show_col_types=FALSE) +library(dplyr) +m <- glm(high_use ~ sex + failures + absences, data = alc, family = "binomial") +alc <- mutate(alc, probability = predict(m, type = "response")) +alc <- mutate(alc, prediction = probability > 0.5) +``` + +### Instructions + +- Define the loss function `loss_func` and compute the mean prediction error for the training data: The `high_use` column in `alc` is the target and the `probability` column has the predictions. +- Perform leave-one-out cross-validation and print out the mean prediction error for the testing data. (`nrow(alc)` gives the observation count in `alc` and using `K = nrow(alc)` defines the leave-one-out method. 
The `cv.glm` function from the 'boot' library computes the error and stores it in `delta`. See `?cv.glm` for more information.) +- Adjust the code: Perform 10-fold cross validation. Print out the mean prediction error for the testing data. Is the prediction error higher or lower on the testing data compared to the training data? Why? + +Hint: +- The `K` argument in `cv.glm` tells how many folds you will have. + +### R code +```{r} +# Work with the exercise in this chunk, step-by-step. Fix the R code! +# the logistic regression model m and dataset alc (with predictions) are available + +# define a loss function (average prediction error) +loss_func <- function(class, prob) { + n_wrong <- abs(class - prob) > 0.5 + mean(n_wrong) +} + +# compute the average number of wrong predictions in the (training) data + + +# K-fold cross-validation +library(boot) +cv <- cv.glm(data = alc, cost = loss_func, glmfit = m, K = nrow(alc)) + +# average number of wrong predictions in the cross validation +cv$delta[1] + +``` + +**GOOD JOB!** diff --git a/Exercises/Exercise4.Rmd b/Exercises/Exercise4.Rmd new file mode 100644 index 000000000..bb08808a1 --- /dev/null +++ b/Exercises/Exercise4.Rmd @@ -0,0 +1,547 @@ +--- +title: "**Introduction to Open Data Science, Exercise Set 4**" + +subtitle: "**Clustering and classification**" + +output: + html_document: + theme: flatly + highlight: haddock + toc: true + toc_depth: 2 + number_section: false +--- + +This set consists of a few numbered exercises. +Go to each exercise in turn and do as follows: + +1. Read the brief description of the exercise. +2. Run the (possible) pre-exercise-code chunk. +3. Follow the instructions to fix the R code! + +## 4.0 INSTALL THE REQUIRED PACKAGES FIRST! + +One or more extra packages (in addition to `tidyverse`) will be needed below. 
+ +```{r} +# Select (with mouse or arrow keys) the install.packages("...") and +# run it (by Ctrl+Enter / Cmd+Enter): + +# install.packages(c("MASS", "corrplot")) +``` + + +## 4.1 Datasets inside R + +R has many (usually small) datasets already loaded in. There are also datasets included in the package installations. Some of the datasets are quite famous (like the [Iris](https://en.wikipedia.org/wiki/Iris_flower_data_set) flower data) and they are frequently used for teaching purposes or to demonstrate statistical methods. + +We will be using the [Boston](https://stat.ethz.ch/R-manual/R-devel/library/MASS/html/Boston.html) dataset from the MASS package. Let's see how it looks like! + +```{r, echo=FALSE} +# Pre-exercise-code (Run this code chunk first! Do NOT edit it.) + +# Click the green arrow ("Run Current Chunk") in the upper-right corner of this chunk. This will initialize the R objects needed in the exercise. Then move to Instructions of the exercise to start working. + +# (no pre-code in this exercise!) +``` + +### Instructions +- Load the `Boston` dataset from MASS +- Explore the `Boston` dataset. Look at the structure with `str()` and use `summary()` to see the details of the variables. +- Draw the plot matrix with `pairs()` + +Hint: +- You can draw the `pairs()` plot by typing the object name where it is saved. + +### R code +```{r} +# This is a code chunk in RStudio editor. +# access the MASS package +library(MASS) + +# load the data +data("Boston") + +# explore the dataset + + + +# plot matrix of the variables + + + +``` + + +## 4.2 Correlations plot + +It is often interesting to look at the correlations between variables in the data. The function `cor()` can be used to create the correlation matrix. A more visual way to look at the correlations is to use `corrplot()` function (from the corrplot package). + +Use the corrplot to visualize the correlation between variables of the Boston dataset. + +**Note:** You should first install the package `corrplot`. 
+ +```{r, echo=FALSE} +# Pre-exercise-code (Run this code chunk first! Do NOT edit it.) + +# Click the green arrow ("Run Current Chunk") in the upper-right corner of this chunk. This will initialize the R objects needed in the exercise. Then move to Instructions of the exercise to start working. + +library(MASS) +library(tidyr) +library(corrplot) +data("Boston") +``` + +### Instructions + +- Calculate the correlation matrix and save it as `cor_matrix`. Print the matrix to see how it looks like. +- Adjust the code: use the pipe (`%>%`) to round the matrix. Rounding can be done with the `round()` function. Use the first two digits. Print the matrix again. +- Plot the rounded correlation matrix +- Adjust the code: add argument `type = "upper"` to the plot. Print the plot again. +- Adjust the code little more: add arguments `cl.pos = "b"`, `tl.pos = "d"` and `tl.cex = 0.6` to the plot. Print the plot again. +- See more of corrplot [here](https://cran.r-project.org/web/packages/corrplot/vignettes/corrplot-intro.html) + +Hints: +- For correlation matrices, see `?cor` +- The pipe (`%>%`) takes the data before the pipe and applies the functionality assigned in the right. For example `data_column %>% summary()` creates a summary of the data_column. + +### R code +```{r} +# Work with the exercise in this chunk, step-by-step. Fix the R code! +# Boston dataset is available + +# calculate the correlation matrix and round it +cor_matrix <- cor(Boston) + +# print the correlation matrix + + +# visualize the correlation matrix +library(corrplot) +corrplot(cor_matrix, method="circle") + +``` + + +## 4.3 Scale the whole dataset + +Usually the R datasets do not need much data wrangling as they are already in a good shape. But we will need to do little adjustments. + +For later use, we will need to scale the data. In the scaling we subtract the column means from the corresponding columns and divide the difference with standard deviation. 
+ +$$scaled(x) = \frac{x - mean(x)}{ sd(x)}$$ + +The Boston data contains only numerical values, so we can use the function `scale()` to standardize the whole dataset. + +```{r, echo=FALSE} +# Pre-exercise-code (Run this code chunk first! Do NOT edit it.) + +# Click the green arrow ("Run Current Chunk") in the upper-right corner of this chunk. This will initialize the R objects needed in the exercise. Then move to Instructions of the exercise to start working. + +library(MASS) +data("Boston") +``` + +### Instructions +- Use the `scale()` function on the `Boston` dataset. Save the scaled data to `boston_scaled` object. +- Use `summary()` to look at the scaled variables. Note the means of the variables. +- Find out the class of the scaled object by executing the `class()` function. +- Later we will want the data to be a data frame. Use `as.data.frame()` to convert the `boston_scaled` to a data frame format. Keep the object name as `boston_scaled`. + +Hint: +- See `?scale` + +### R code +```{r} +# Work with the exercise in this chunk, step-by-step. Fix the R code! +# Boston dataset is available + +# center and standardize variables +boston_scaled <- "change me!" + +# summaries of the scaled variables + + +# class of the boston_scaled object +class(boston_scaled) + +# change the object to data frame + + +``` + + +## 4.4 Creating a factor variable + +We can create a categorical variable from a continuous one. There are many ways to to do that. Let's choose the variable `crim` (per capita crime rate by town) to be our factor variable. We want to cut the variable by [quantiles](https://en.wikipedia.org/wiki/Quantile) to get the high, low and middle rates of crime into their own categories. + +See how it's done below! + +```{r, echo=FALSE} +# Pre-exercise-code (Run this code chunk first! Do NOT edit it.) + +# Click the green arrow ("Run Current Chunk") in the upper-right corner of this chunk. This will initialize the R objects needed in the exercise. 
+ Then move to Instructions of the exercise to start working. + +library(MASS) +data("Boston") +boston_scaled <- as.data.frame(scale(Boston)) +boston_scaled$crim <- as.numeric(boston_scaled$crim) +``` + +### Instructions +- Look at the summary of the scaled variable `crim` +- Use the function `quantile()` on the scaled crime rate variable and save the results to `bins`. Print the results. +- Create categorical crime vector with the `cut()` function. Set the `breaks` argument to be the quantile vector you just created. +- Use the function `table()` on the `crime` object +- Adjust the code of `cut()` by adding the `labels` argument in the function. Create a string vector with the values `"low"`, `"med_low"`, `"med_high"`, `"high"` (in that order) and use it to set the labels. +- Do the table of the `crime` object again +- Execute the last lines of code to remove the original crime rate variable and add the new one to the scaled Boston dataset. + +Hints: +- You can create a vector with `c()` +- Separate the values with comma in a vector +- Remember that strings need the quotes around them +- `table(*object_name*)` creates a cross tabulation of the object + +### R code +```{r} +# Work with the exercise in this chunk, step-by-step. Fix the R code! +# Boston and boston_scaled are available + +# summary of the scaled crime rate + + +# create a quantile vector of crim and print it +bins <- quantile(boston_scaled$crim) +bins + +# create a categorical variable 'crime' +crime <- cut(boston_scaled$crim, breaks = "change me!", include.lowest = TRUE) + +# look at the table of the new factor crime + + +# remove original crim from the dataset +boston_scaled <- dplyr::select(boston_scaled, -crim) + +# add the new categorical value to scaled data +boston_scaled <- data.frame(boston_scaled, crime) + + +``` + + +## 4.5 Divide and conquer: train and test sets + +When we want to use a statistical method to predict something, it is important to have data to test how well the predictions fit. 
Splitting the original data to test and train sets allows us to check how well our model works. + +The training of the model is done with the train set and prediction on new data is done with the test set. This way you have true classes / labels for the test data, and you can calculate how well the model performed in prediction. + +Time to split our data! + +```{r, echo=FALSE} +# Pre-exercise-code (Run this code chunk first! Do NOT edit it.) + +# Click the green arrow ("Run Current Chunk") in the upper-right corner of this chunk. This will initialize the R objects needed in the exercise. Then move to Instructions of the exercise to start working. + +library(dplyr) +library(MASS) +boston_scaled <- read.table("https://raw.githubusercontent.com/KimmoVehkalahti/Helsinki-Open-Data-Science/master/datasets/boston_scaled.txt", + sep=",", header = T) +``` + +### Instructions +- Use the function `nrow()` on the `boston_scaled` to get the number of rows in the dataset. Save the number of rows in `n`. +- Execute the code to choose randomly 80% of the rows and save the row numbers to `ind` +- Create `train` set by selecting the row numbers that are saved in `ind`. +- Create `test` set by subtracting the rows that are used in the train set +- Take the crime classes from the `test` and save them as `correct_classes` +- Execute the code to remove `crime` from `test` set + +Hints: +- You can get the number of rows with `nrow(*name_of_the_dataframe*)` +- `train` and `test` are data frames, so you can access their columns with `$` mark + +### R code +```{r} +# Work with the exercise in this chunk, step-by-step. Fix the R code! +# boston_scaled is available + +# number of rows in the Boston dataset +n <- "change me!" + +# choose randomly 80% of the rows +ind <- sample(n, size = n * 0.8) + +# create train set +train <- boston_scaled[ind,] + +# create test set +test <- boston_scaled[-ind,] + +# save the correct classes from test data +correct_classes <- "change me!" 
+ +# remove the crime variable from test data +test <- dplyr::select(test, -crime) + +``` + + +## 4.6 Linear discriminant analysis + +[Linear discriminant analysis](https://en.wikipedia.org/wiki/Linear_discriminant_analysis) is a classification (and dimension reduction) method. It finds the (linear) combination of the variables that separate the target variable classes. The target can be binary or multiclass variable. + +Linear discriminant analysis is closely related to many other methods, such as principal component analysis (we will look into that next week) and the already familiar logistic regression. + +LDA can be visualized with a biplot. We will talk more about biplots next week. The LDA biplot arrow function used in the exercise is (with slight changes) taken from [this](http://stackoverflow.com/questions/17232251/how-can-i-plot-a-biplot-for-lda-in-r) Stack Overflow message thread. + +```{r, echo=FALSE} +# Pre-exercise-code (Run this code chunk first! Do NOT edit it.) + +# Click the green arrow ("Run Current Chunk") in the upper-right corner of this chunk. This will initialize the R objects needed in the exercise. Then move to Instructions of the exercise to start working. + +library(MASS) +boston_scaled <- read.table("https://raw.githubusercontent.com/KimmoVehkalahti/Helsinki-Open-Data-Science/master/datasets/boston_scaled.txt", + sep=",", header = T) +ind <- sample(nrow(boston_scaled), size = nrow(boston_scaled) * 0.8) +train <- boston_scaled[ind,] +test <- boston_scaled[-ind,] +correct_classes <- test$crime +test <- dplyr::select(test, -crime) +``` + +### Instructions +- Fit a linear discriminant analysis with the function `lda()`. The function takes a formula (like in regression) as a first argument. Use the `crime` as a target variable and all the other variables as predictors. Hint! You can type `target ~ .` where the dot means all other variables in the data. 
+- Print the `lda.fit` object +- Create a numeric vector of the train set's crime classes (for plotting purposes) +- Use the function `plot()` on the `lda.fit` model. The argument `dimen` can be used to choose how many discriminants are used. +- Adjust the code: add arguments `col = classes` and `pch = classes` to the plot. +- Execute the `lda.arrows()` function (if you haven't done that already). Draw the plot with the lda arrows. Note that in DataCamp you will need to select both lines of code and execute them at the same time for the `lda.arrows()` function to work. +- You can change the `myscale` argument in `lda.arrows()` to see more clearly which way the arrows are pointing. + +Hints: +- The formula looks like `target ~ .` +- The target variable in here is `crime` +- You can change a factor to numeric with `as.numeric()` +- Remember to execute the `lda.arrows()` code together with the plot in DataCamp. Otherwise the arrows won't work. + +### R code +```{r} +# Work with the exercise in this chunk, step-by-step. Fix the R code! +# data train is available + +# linear discriminant analysis +lda.fit <- lda("change me!", data = train) + +# print the lda.fit object +lda.fit + +# the function for lda biplot arrows +lda.arrows <- function(x, myscale = 1, arrow_heads = 0.1, color = "red", tex = 0.75, choices = c(1,2)){ + heads <- coef(x) + arrows(x0 = 0, y0 = 0, + x1 = myscale * heads[,choices[1]], + y1 = myscale * heads[,choices[2]], col=color, length = arrow_heads) + text(myscale * heads[,choices], labels = row.names(heads), + cex = tex, col=color, pos=3) +} + +# target classes as numeric +classes <- as.numeric(train$crime) + +# plot the lda results +plot("change me!", dimen = 2) +lda.arrows(lda.fit, myscale = 1) + +``` + + +## 4.7 Predict LDA + +Like in the regression, the function `predict()` can be used to predict values based on a model. The function arguments are almost the same. You can see the help page of the prediction function for LDA with `?predict.lda`. 
+ +We split our data earlier so that we have the test set and the correct class labels. See how the LDA model performs when predicting on new (test) data. + +```{r, echo=FALSE} +# Pre-exercise-code (Run this code chunk first! Do NOT edit it.) + +# Click the green arrow ("Run Current Chunk") in the upper-right corner of this chunk. This will initialize the R objects needed in the exercise. Then move to Instructions of the exercise to start working. + +library(MASS) +boston_scaled <- read.table("https://raw.githubusercontent.com/KimmoVehkalahti/Helsinki-Open-Data-Science/master/datasets/boston_scaled.txt", + sep=",", header = T) +ind <- sample(nrow(boston_scaled), size = nrow(boston_scaled) * 0.8) +train <- boston_scaled[ind,] +test <- boston_scaled[-ind,] +correct_classes <- test$crime +test <- dplyr::select(test, -crime) + +lda.fit = lda(crime ~ ., data=train) +``` + +### Instructions +- Predict the crime classes with the `test` data. Like in regression, the `predict()` function takes the model object as a first argument. +- Create a table of the correct classes and the predicted ones. You can get the predicted classes with `lda.pred$class`. +- Look at the table. Did the classifier predict the crime rates correctly? + +Hints: +- `table(*object_name*)` creates a cross tabulation of the object +- You can get the predicted classes with `lda.pred$class` + +### R code +```{r} +# Work with the exercise in this chunk, step-by-step. Fix the R code! +# lda.fit, correct_classes and test are available + +# predict classes with test data +lda.pred <- predict(lda.fit, newdata = "change me!") + +# cross tabulate the results +table(correct = "change me!", predicted = "change me!") + +``` + + +## 4.8 Towards clustering: distance measures + +Similarity or dissimilarity of objects can be measured with distance measures. There are many different measures for different types of data. 
The most common or "normal" distance measure is [Euclidean distance](https://en.wikipedia.org/wiki/Euclidean_distance). + +There are functions that calculate the distances in R. In this exercise, we will be using the base R's `dist()` function. The function creates a distance matrix that is saved as dist object. The distance matrix is usually square matrix containing the pairwise distances of the observations. So with large datasets, the computation of distance matrix is time consuming and storing the matrix might take a lot of memory. + +```{r, echo=FALSE} +# Pre-exercise-code (Run this code chunk first! Do NOT edit it.) + +# Click the green arrow ("Run Current Chunk") in the upper-right corner of this chunk. This will initialize the R objects needed in the exercise. Then move to Instructions of the exercise to start working. + +library(MASS) +data("Boston") +``` + +### Instructions +- Load the MASS package and the `Boston` dataset from it +- Create `dist_eu` by calling the `dist()` function on the Boston dataset. Note that by default, the function uses Euclidean distance measure. +- Look at the summary of the `dist_eu` +- Next create object `dist_man` that contains the Manhattan distance matrix of the Boston dataset +- Look at the summary of the `dist_man` + +Hints: +- `data(*name_of_the_dataset*)` can be used to load dataset from R package +- See `?dist`. The argument you will need to change the distance measure is called `method` +- Remember that strings need the quotes in R + +### R code +```{r} +# Work with the exercise in this chunk, step-by-step. Fix the R code! +# Boston dataset is available + +# euclidean distance matrix +dist_eu <- "change me!" + +# look at the summary of the distances + + +# manhattan distance matrix +dist_man <- "change me!" + +# look at the summary of the distances + + +``` + + +## 4.9 K-means clustering + +[K-means](https://en.wikipedia.org/wiki/K-means_clustering) is maybe the most used and known clustering method. 
It is an unsupervised method, that assigns observations to groups or **clusters** based on similarity of the objects. In the previous exercise we got a hang of **distances**. The `kmeans()` function counts the distance matrix automatically, but it is good to know the basics. Let's cluster a bit! + +```{r, echo=FALSE} +# Pre-exercise-code (Run this code chunk first! Do NOT edit it.) + +# Click the green arrow ("Run Current Chunk") in the upper-right corner of this chunk. This will initialize the R objects needed in the exercise. Then move to Instructions of the exercise to start working. + +library(MASS) +library(ggplot2) +data("Boston") +set.seed(13) +``` + +### Instructions + +- First change the centers in the `kmeans()` function to be `4` and execute the clustering code +- Plot the Boston data with `pairs()`. Adjust the code by adding the `col` argument. Set the color based on the clusters that k-means produced. You can access the cluster numbers with `km$cluster`. What variables do seem to effect the clustering results? Note: With `pairs()` you can reduce the number of pairs to see the plots more clearly. On line 7, just replace `Boston` with for example `Boston[6:10]` to pair up 5 columns (columns 6 to 10). +- Try a different number of clusters: `1`, `2` and `3` (leave it to `3`). Visualize the results. + +Hints: +- You can change the number of the cluster you want to have with the `centers` argument in `kmeans()` +- See `?kmeans` +- You can access the cluster numbers with `km$cluster` + +### R code +```{r} +# Work with the exercise in this chunk, step-by-step. Fix the R code! +# Boston dataset is available + +# k-means clustering +km <- kmeans(Boston, centers = "change me!") + +# plot the Boston dataset with clusters +pairs(Boston, col = "change me!") + + + +``` + + +## 4.10 K-means: determine the k + +K-means needs the number of clusters as an argument. 
There are many ways to look at the optimal number of clusters and a good way might depend on the data you have. + +One way to determine the number of clusters is to look at how the total of within cluster sum of squares (WCSS) behaves when the number of cluster changes (the calculation of total WCSS was explained in the video before). When you plot the number of clusters and the total WCSS, the optimal number of clusters is when the total WCSS drops radically. + +K-means might produce different results every time, because it randomly assigns the initial cluster centers. The function `set.seed()` can be used to deal with that. + +```{r, echo=FALSE} +# Pre-exercise-code (Run this code chunk first! Do NOT edit it.) + +# Click the green arrow ("Run Current Chunk") in the upper-right corner of this chunk. This will initialize the R objects needed in the exercise. Then move to Instructions of the exercise to start working. + +library(MASS) +library(ggplot2) +data("Boston") +``` + +### Instructions +- Set the max number of clusters (`k_max`) to be 10 +- Execute the code to calculate total WCSS. This might take a while. +- Visualize the total WCSS when the number of cluster goes from 1 to 10. The optimal number of clusters is when the value of total WCSS changes radically. In this case, two clusters would seem optimal. +- Run `kmeans()` again with two clusters and visualize the results + +Hints: +- Simply adjust the part of the code that says "change me!" +- You can change the number of the cluster you want to have with the `centers` argument in `kmeans()` +- Numeric values do not need quotes in R + +### R code +```{r} +# Work with the exercise in this chunk, step-by-step. Fix the R code! +# MASS, ggplot2 and Boston dataset are available +set.seed(123) + +# determine the number of clusters +k_max <- "change me!" 
+ +# calculate the total within sum of squares +twcss <- sapply(1:k_max, function(k){kmeans(Boston, k)$tot.withinss}) + +# visualize the results +qplot(x = 1:k_max, y = twcss, geom = 'line') + +# k-means clustering +km <- kmeans(Boston, centers = "change me!") + +# plot the Boston dataset with clusters +pairs(Boston, col = km$cluster) + +``` + +**Well done!** diff --git a/Exercises/Exercise5.Rmd b/Exercises/Exercise5.Rmd new file mode 100644 index 000000000..947f906c5 --- /dev/null +++ b/Exercises/Exercise5.Rmd @@ -0,0 +1,497 @@ +--- +title: "**Introduction to Open Data Science, Exercise Set 5**" + +subtitle: "**Dimensionality reduction techniques**" + +output: + html_document: + theme: flatly + highlight: haddock + toc: true + toc_depth: 2 + number_section: false +--- + + +This set consists of a few numbered exercises. +Go to each exercise in turn and do as follows: + +1. Read the brief description of the exercise. +2. Run the (possible) pre-exercise-code chunk. +3. Follow the instructions to fix the R code! + +## 5.0 INSTALL THE REQUIRED PACKAGES FIRST! + +One or more extra packages (in addition to `tidyverse`) will be needed below. + +```{r} +# Select (with mouse or arrow keys) the install.packages("...") and +# run it (by Ctrl+Enter / Cmd+Enter): + +# install.packages("FactoMineR") +``` + + +## 5.1 Meet the human data + +We will be using the `human` dataset to introduce Principal Components Analysis (PCA). The data originates from the United Nations Development Programme. See [their data page](https://hdr.undp.org/data-center/human-development-index) for more information. For a nice overview see also the [calculating the human development indices pdf](https://hdr.undp.org/system/files/documents//technical-notes-calculating-human-development-indices.pdf). + +Most of the variable names have been shortened and two new variables have been computed. 
See the meta file for the modified data [here](https://github.com/KimmoVehkalahti/Helsinki-Open-Data-Science/blob/master/datasets/human_meta.txt) for descriptions. + +```{r, echo=FALSE} +# Pre-exercise-code (Run this code chunk first! Do NOT edit it.) + +# Click the green arrow ("Run Current Chunk") in the upper-right corner of this chunk. This will initialize the R objects needed in the exercise. Then move to Instructions of the exercise to start working. + +# (no pre-code in this exercise!) +``` + +### Instructions +- Read the `human` data into memory +- Print out the (column) names of the data +- Look at the structure of the data +- Print out summaries of the variables in the data + +Hints: +- Use `str()` to see structure +- Use `summary()` to compute summaries + +### R code +```{r} +# This is a code chunk in RStudio editor. + +# read the human data +human <- read.table("https://raw.githubusercontent.com/KimmoVehkalahti/Helsinki-Open-Data-Science/master/datasets/human1.txt", + sep =",", header = T) + +# look at the (column) names of human +names(human) + +# look at the structure of human + + +# print out summaries of the variables + + +``` + + +## 5.2 String manipulation + +Sometimes a variable is coded in a way that is not natural for R to understand. For example, large integers can sometimes be coded with a comma to separate thousands. In these cases, R interprets the variable as a **factor** or a **character.** + +In some cases you could use the `dec` argument in `read.table()` to get around this, but if the data also includes decimals separated by a dot, this is not an option. To get rid of the unwanted commas, we need *string manipulation*. + +In R, strings are of the basic type character and they can be created by using quotation marks or specific functions. 
There are quite a few functions in Base R that can be used to manipulate characters, but there is also a bit more consistent and simple tidyverse package **stringr.**
+
+```{r, echo=FALSE}
+# Pre-exercise-code (Run this code chunk first! Do NOT edit it.)
+
+# Click the green arrow ("Run Current Chunk") in the upper-right corner of this chunk. This will initialize the R objects needed in the exercise. Then move to Instructions of the exercise to start working.
+
+library(tidyr)
+human <- read.table("https://raw.githubusercontent.com/KimmoVehkalahti/Helsinki-Open-Data-Science/master/datasets/human1.txt",
+                    sep =",", header = T)
+```
+
+### Instructions
+- Access the stringr package
+- Look at the structure of the Gross National Income (GNI) variable in `human`
+- Execute the sample code where the comma is removed from each value of GNI.
+- Adjust the code: Use the pipe operator (`%>%`) to convert the resulting vector to numeric with `as.numeric`.
+
+Hints:
+- Use `$` to access a single column of a data frame.
+- Use `str()` to look at the structure of any object
+- Add the pipe operator and `as.numeric` to the row where `str_replace()` is used
+- The previous exercise sets have more information and examples related to the pipe.
+
+### R code
+```{r}
+# Work with the exercise in this chunk, step-by-step. Fix the R code!
+# tidyr package and human are available
+
+# access the stringr package (part of `tidyverse`)
+library(stringr)
+
+# look at the structure of the GNI column in 'human'
+
+
+# remove the commas from GNI and print out a numeric version of it
+str_replace(human$GNI, pattern=",", replace ="")
+
+```
+
+
+## 5.3 Dealing with not available (NA) values
+
+In R, NA stands for not available, which means that the data point is missing. If a variable you wish to analyse contains missing values, there are usually two main options:
+
+- Remove the observations with missing values
+- Replace the missing values with actual values using an *imputation* technique. 
+ +We will use the first option, which is the simplest solution. + +```{r, echo=FALSE} +# Pre-exercise-code (Run this code chunk first! Do NOT edit it.) + +# Click the green arrow ("Run Current Chunk") in the upper-right corner of this chunk. This will initialize the R objects needed in the exercise. Then move to Instructions of the exercise to start working. + +library(dplyr) +# read data +human <- read.table("https://raw.githubusercontent.com/KimmoVehkalahti/Helsinki-Open-Data-Science/master/datasets/human1.txt", + sep =",", header = T) +human$GNI <- gsub(",", "", human$GNI) %>% as.numeric +``` + +### Instructions +- Create a smaller version of the human data by selecting the variables defined in `keep` +- Use complete.cases() on human to print out a logical "completeness indicator" vector +- Adjust the code: Define `comp` as the completeness indicator and print out the resulting data frame. When is the indicator `FALSE` and when is it `TRUE`? (hint: `?complete.cases()`). +- `filter()` out all the rows with any `NA` values. Right now, `TRUE` is recycled so that nothing is filtered out. + +Hints: +- Use `complete.cases()` on 'human' again to define the 'comp' column +- Use the logical vector created by complete.cases to filter out the rows with `NA` values. + +### R code +```{r} +# Work with the exercise in this chunk, step-by-step. Fix the R code! +# human with modified GNI is available + +library(dplyr) +# columns to keep +keep <- c("Country", "Edu2.FM", "Labo.FM", "Life.Exp", "Edu.Exp", "GNI", "Mat.Mor", "Ado.Birth", "Parli.F") + +# select the 'keep' columns +human <- select(human, one_of(keep)) + +# print out a completeness indicator of the 'human' data +complete.cases(human) + +# print out the data along with a completeness indicator as the last column +data.frame(human[-1], comp = "change me!") + +# filter out all rows with NA values +human_ <- filter(human, TRUE) # modify the "TRUE", see instructions above! 
+ + +``` + + +## 5.4 Excluding observations + +Besides missing values, there might be other reasons to exclude observations. In our human data, there are a few data points which have been computed from other observations. We want to remove them before further analysis. + +The basic way in R to reference the rows or columns of a data frame is to use brackets (`[,]`) along with indices or names. A comma is used to separate row and column references. In the examples below, `df` is a data frame. + +``` +df[,] # select every row and every column +df[1:5, ] # select first five rows +df[, c(2, 5)] # select 2nd and 5th columns +``` + +```{r, echo=FALSE} +# Pre-exercise-code (Run this code chunk first! Do NOT edit it.) + +# Click the green arrow ("Run Current Chunk") in the upper-right corner of this chunk. This will initialize the R objects needed in the exercise. Then move to Instructions of the exercise to start working. + +library(dplyr) +# read data +human <- read.table("https://raw.githubusercontent.com/KimmoVehkalahti/Helsinki-Open-Data-Science/master/datasets/human1.txt", + sep =",", header = T) +human$GNI <- gsub(",", "", human$GNI) %>% as.numeric +keep <- c("Country", "Edu2.FM", "Labo.FM", "Life.Exp", "Edu.Exp", "GNI", "Mat.Mor", "Ado.Birth", "Parli.F") +human <- select(human, one_of(keep)) +human <- filter(human, complete.cases(human)) +``` + +### Instructions +- Use `tail()` to print out the last 10 observations of `human` (hint: `?tail`). What are the last 10 country names? +- Create object `last` +- Create data frame `human_` by selecting rows from the 1st to `last` from `human`. +- Define the rownames in `human_` by the Country column + +Hint: +- Use `1:last` to select rows from 1 to `last` + +### R code +```{r} +# Work with the exercise in this chunk, step-by-step. Fix the R code! 
+# human without NA is available + +# look at the last 10 observations of human + + +# define the last indice we want to keep +last <- nrow(human) - 7 + +# choose everything until the last 7 observations +human_ <- human["change me!", ] + +# add countries as rownames +rownames(human_) <- human_$Country + +``` + + +## 5.5 Exploring the countries + +Now that we have sufficiently wrangled the 'human' data for further analysis, let's explore the variables and their relationships more closely. + +A simple pairs plot or a more informative generalized pairs plot from the **GGally** package is a good way of visualizing a reasonably sized data frame. + +To study linear connections, correlations also can be computed with the `cor()` function and then visualized with the corrplot function from the **corrplot** package. + +```{r, echo=FALSE} +# Pre-exercise-code (Run this code chunk first! Do NOT edit it.) + +# Click the green arrow ("Run Current Chunk") in the upper-right corner of this chunk. This will initialize the R objects needed in the exercise. Then move to Instructions of the exercise to start working. + +library(dplyr) +# read data +human <- read.table("https://raw.githubusercontent.com/KimmoVehkalahti/Helsinki-Open-Data-Science/master/datasets/human1.txt", + sep =",", header = T) +human$GNI <- gsub(",", "", human$GNI) %>% as.numeric +keep <- c("Country", "Edu2.FM", "Labo.FM", "Life.Exp", "Edu.Exp", "GNI", "Mat.Mor", "Ado.Birth", "Parli.F") +human <- select(human, one_of(keep)) +human <- filter(human, complete.cases(human)) +rownames(human) <- human$Country +last <- nrow(human) - 7 +human <- human[1:last, ] +library(corrplot) +``` + +### Instructions +- Create the data frame `human_` by removing the `Country` variable from `human` (the countries are still the row names) +- Access the GGally package and visualize all the `human_` variables with `ggpairs()`. 
+- Compute and print out the correlation matrix of `human_` +- Adjust the code: use the pipe operator (`%>%`) and visualize the correlation matrix with `corrplot()`. + +Hint: +- The pipe assigns the output on its left as the first argument to the function name on its right. Use it on the same line where the correlation matrix is computed + +### R code +```{r} +# Work with the exercise in this chunk, step-by-step. Fix the R code! +# modified human, dplyr and the corrplot functions are available + +# remove the Country variable +human_ <- select(human, -Country) + +# Access GGally +library(GGally) + +# visualize the 'human_' variables + +# Access corrplot +library(corrplot) + +# compute the correlation matrix and visualize it with corrplot +cor(human_) + + +``` + + +## 5.6 PCA with R + +[Principal Component Analysis](https://en.wikipedia.org/wiki/Principal_component_analysis) (PCA) can be performed by two sightly different matrix decomposition methods from linear algebra: the [Eigenvalue Decomposition](https://en.wikipedia.org/wiki/Eigendecomposition_of_a_matrix) and the [Singular Value Decomposition](https://en.wikipedia.org/wiki/Singular_value_decomposition) (SVD). + +There are two functions in the default package distribution of R that can be used to perform PCA: `princomp()` and `prcomp()`. The `prcomp()` function uses the SVD and is the preferred, more numerically accurate method. + +Both methods quite literally *decompose* a data matrix into a product of smaller matrices, which let's us extract the underlying **principal components**. This makes it possible to approximate a lower dimensional representation of the data by choosing only a few principal components. + +```{r, echo=FALSE} +# Pre-exercise-code (Run this code chunk first! Do NOT edit it.) + +# Click the green arrow ("Run Current Chunk") in the upper-right corner of this chunk. This will initialize the R objects needed in the exercise. Then move to Instructions of the exercise to start working. 
+ +human <- read.table("https://raw.githubusercontent.com/KimmoVehkalahti/Helsinki-Open-Data-Science/master/datasets/human2.txt", + sep =",", header = T) +``` + +### Instructions +- Create `human_std` by standardizing the variables in `human`. +- Print out summaries of the standardized variables. What are the means? Do you know the standard deviations? (hint: `?scale`) +- Use `prcomp()` to perform principal component analysis on the standardized data. Save the results in the object `pca_human` +- Use `biplot()` to draw a biplot of `pca_human` (Click next to "Plots" to view it larger) +- Experiment with the argument `cex` of `biplot()`. It should be a vector of length 2 and it can be used to scale the labels in the biplot. Try for example `cex = c(0.8, 1)`. Which number affects what? +- Add the argument `col = c("grey40", "deeppink2")` + +Hint: +- Use the `summary()` function to compute summaries of the variables in a data frame + +### R code +```{r} +# Work with the exercise in this chunk, step-by-step. Fix the R code! +# modified human is available + +# standardize the variables +human_std <- scale(human) + +# print out summaries of the standardized variables + + +# perform principal component analysis (with the SVD method) +pca_human <- prcomp(human_std) + +# draw a biplot of the principal component representation and the original variables +biplot(pca_human, choices = 1:2) + +``` + + +## 5.7 A biplot of PCA + +A biplot is a way of visualizing the connections between two representations of the same data. First, a simple scatter plot is drawn where the observations are represented by two principal components (PC's). Then, arrows are drawn to visualize the connections between the original variables and the PC's. The following connections hold: + +- The angle between the arrows can be interpreted as the correlation between the variables. +- The angle between a variable and a PC axis can be interpreted as the correlation between the two. 
+- The length of the arrows are proportional to the standard deviations of the variables. + +```{r, echo=FALSE} +# Pre-exercise-code (Run this code chunk first! Do NOT edit it.) + +# Click the green arrow ("Run Current Chunk") in the upper-right corner of this chunk. This will initialize the R objects needed in the exercise. Then move to Instructions of the exercise to start working. + +library(dplyr) +human <- read.table("https://raw.githubusercontent.com/KimmoVehkalahti/Helsinki-Open-Data-Science/master/datasets/human2.txt", + sep =",", header = T) +human_std <- scale(human) +pca_human <- prcomp(human_std) +``` + +### Instructions +- Create and print out a summary of `pca_human` (created in the previous exercise) +- Create object `pca_pr` and print it out +- Adjust the code: instead of proportions of variance, save the percentages of variance in the `pca_pr` object. Round the percentages to 1 digit. +- Execute the `paste0()` function. Then create a new object `pc_lab` by assigning the output to it. +- Draw the biplot again. Use the first value of the `pc_lab` vector as the label for the x-axis and the second value as the label for the y-axis. + +Hints: +- Percentages are proportions on a different scale. Multiplication by 100 changes proportions to percentages. +- Objects are created with the assign operator `<-` +- Brackets can be used to access values of a vector: `V[1]` gets the first value of `V` + +### R code +```{r} +# Work with the exercise in this chunk, step-by-step. Fix the R code! 
+# pca_human, dplyr are available + +# create and print out a summary of pca_human +s <- summary(pca_human) + + +# rounded percentanges of variance captured by each PC +pca_pr <- round(1*s$importance[2, ], digits = 5) + +# print out the percentages of variance + + +# create object pc_lab to be used as axis labels +paste0(names(pca_pr), " (", pca_pr, "%)") + +# draw a biplot +biplot(pca_human, cex = c(0.8, 1), col = c("grey40", "deeppink2"), xlab = NA, ylab = NA) + + +``` + + +## 5.8 It's tea time! + +The [Factominer](https://cran.r-project.org/web/packages/FactoMineR/index.html) package contains functions dedicated to multivariate explanatory data analysis. It contains for example methods *(Multiple) Correspondence analysis* , *Multiple Factor analysis* as well as PCA. + +In the next exercises we are going to use the `tea` dataset. The dataset contains the answers of a questionnaire on tea consumption. + +Let's dwell in teas for a bit! + +```{r, echo=FALSE} +# Pre-exercise-code (Run this code chunk first! Do NOT edit it.) + +# Click the green arrow ("Run Current Chunk") in the upper-right corner of this chunk. This will initialize the R objects needed in the exercise. Then move to Instructions of the exercise to start working. + +library(dplyr) +library(tidyr) +library(ggplot2) +tea <- read.csv("https://raw.githubusercontent.com/KimmoVehkalahti/Helsinki-Open-Data-Science/master/datasets/tea.csv", stringsAsFactors = TRUE) +``` + +### Instructions +- Create the `keep_columns` object. Then `select()` the columns from `tea` to create a new dataset. Save the new data as `tea_time`. +- Look at the summaries and structure of the `tea_time` data. +- Visualize the dataset. Define the plot type by adding `geom_bar()` after initialization of the ggplot. +- Adjust the code: the labels of the x-axis are showing poorly. Make the plot more readable by adding `theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 8))` after barplot code. 
+ +Hints: +- `str()` and `summary()`. +- Use the `+` mark to add functions to `ggplot()` + +### R code +```{r} +# Work with the exercise in this chunk, step-by-step. Fix the R code! +# the tea dataset is available + +library(dplyr) +library(tidyr) +# column names to keep in the dataset +keep_columns <- c("Tea", "How", "how", "sugar", "where", "lunch") + +# select the 'keep_columns' to create a new dataset +tea_time <- select(tea, "change me!") + +# look at the summaries and structure of the data + + +# visualize the dataset +library(ggplot2) +pivot_longer(tea_time, cols = everything()) %>% + ggplot(aes(value)) + facet_wrap("name", scales = "free") + + +``` + + +## 5.9 Multiple Correspondence Analysis + +[Multiple Correspondence Analysis](https://en.wikipedia.org/wiki/Multiple_correspondence_analysis) (MCA) is a method to analyze qualitative data and it is an extension of Correspondence analysis (CA). MCA can be used to detect patterns or structure in the data as well as in dimension reduction. + +**Note:** You should first install the package `FactoMineR`. + +```{r, echo=FALSE} +# Pre-exercise-code (Run this code chunk first! Do NOT edit it.) + +# Click the green arrow ("Run Current Chunk") in the upper-right corner of this chunk. This will initialize the R objects needed in the exercise. Then move to Instructions of the exercise to start working. + +tea_time <- read.csv("https://raw.githubusercontent.com/KimmoVehkalahti/Helsinki-Open-Data-Science/master/datasets/tea_time.csv", stringsAsFactors = TRUE) +library(FactoMineR) +``` + +### Instructions +- Do multiple correspondence analysis with the function `MCA()`. Give `tea_time` as the functions first argument. Note that the `MCA()` function visualizes the analysis by default, and the plots can be turned off with the argument `graph = FALSE`. +- Look at the summary of the model. +- Plot the variables of the model. You can either plot the variables or the individuals or both. 
You can change which one to plot with the `invisible` argument. +- Adjust the code: add argument `habillage = "quali"` (how French!) to the plot. Do you notice what changes? + +Hint: +- See the FactoMineR [documentation](https://cran.r-project.org/web/packages/FactoMineR/FactoMineR.pdf) for help + +### R code +```{r} +# Work with the exercise in this chunk, step-by-step. Fix the R code! +# tea_time is available + +# multiple correspondence analysis +library(FactoMineR) +mca <- MCA("change me!", graph = FALSE) + +# summary of the model + + +# visualize MCA +plot("change me!", invisible=c("ind"), graph.type = "classic") + +``` + +**Great work!!** diff --git a/Exercises/Exercise6.Rmd b/Exercises/Exercise6.Rmd new file mode 100644 index 000000000..09250e8d5 --- /dev/null +++ b/Exercises/Exercise6.Rmd @@ -0,0 +1,787 @@ +--- +title: "**Introduction to Open Data Science, Exercise Set 6**" + +subtitle: "**Analysis of longitudinal data**" + +output: + html_document: + theme: flatly + highlight: haddock + toc: true + toc_depth: 2 + number_section: false +--- + + +This set consists of a few numbered exercises. +Go to each exercise in turn and do as follows: + +1. Read the brief description of the exercise. +2. Run the (possible) pre-exercise-code chunk. +3. Follow the instructions to fix the R code! + +## 6.0 INSTALL THE REQUIRED PACKAGES FIRST! + +One or more extra packages (in addition to `tidyverse`) will be needed below. + +```{r} +# Select (with mouse or arrow keys) the install.packages("...") and +# run it (by Ctrl+Enter / Cmd+Enter): + +# install.packages("lme4") +``` + + +## 6.1 Meet and Repeat: PART I + +Many studies in the behavioral sciences involve several measurement or observations of the response variable of interest on each subject in the study. For example, the response variable may be measured under a number of different experimental conditions or on a number of different occasions over time; such data are labelled repeated measures or *longitudinal data*. 
In the first part (I) of these exercises useful methods for the graphical exploration of this type of data are described and a simple method for their analysis is introduced, with the warning that although simple the method should be used only in the initial stage of dealing with the data; more appropriate methods will be discussed in part II.
+
+In the first part we will delve into the BPRS data, in which 40 male subjects were randomly assigned to one of two treatment groups and each subject was rated on the brief psychiatric rating scale (BPRS) measured before treatment began (week 0) and then at weekly intervals for eight weeks. The BPRS assesses the level of 18 symptom constructs such as hostility, suspiciousness, hallucinations and grandiosity; each of these is rated from one (not present) to seven (extremely severe). The scale is used to evaluate patients suspected of having schizophrenia.
+
+```{r, echo=FALSE}
+# Pre-exercise-code (Run this code chunk first! Do NOT edit it.)
+
+# Click the green arrow ("Run Current Chunk") in the upper-right corner of this chunk. This will initialize the R objects needed in the exercise. Then move to Instructions of the exercise to start working.
+
+# (no pre-code in this exercise!)
+```
+
+### Instructions
+- Read the `BPRS` data into memory
+- Print out the (column) names of the data
+- Look at the structure of the data
+- Print out summaries of the variables in the data
+- Pay special attention to the structure of the data
+
+Hints:
+- Use `str()` to see structure
+- Use `summary()` to compute summaries
+
+### R code
+```{r}
+# This is a code chunk in RStudio editor. 
+ +# Read the BPRS data +BPRS <- read.table("https://raw.githubusercontent.com/KimmoVehkalahti/MABS/master/Examples/data/BPRS.txt", sep =" ", header = T) + +# Look at the (column) names of BPRS +names(BPRS) + +# Look at the structure of BPRS + + +# Print out summaries of the variables + + + +``` + + +## 6.2 Graphical displays of longitudinal data: The magical pivot_longer() + +To be able to study the possible differences in the bprs value between the treatment groups and the possible change of the value in time, we don't want the weeks to be individual variables. The `pivot_longer()` function is used to transform the dataset accordingly. + +The `pivot_longer()` function takes multiple columns and collapses them into key-value pairs, so that we can have the weeks as values of a new variable week. You can find more information about pivot_longer in the package documentation with `?pivot_longer` or in the dplyr cheatsheet. + +Our `weeks` are in a bit inconvenient form as characters, so we somehow need to extract the week numbers from the character vector `weeks`. + +With the `substr()` function we can extract a part of longer character object. We simply supply it with a character object or vector, *start position*, as in the position of the first letter to extract, and *stop position*, as in the position of the last letter to extract. For example `substr("Hello world!", 1, 5)` would return "Hello". + +The `arrange()` function is also used for information purposes although it is not necessary for the analyses: it simply allows the final table to be ordered according to a variable (e.g. Time or week number to respect the chronology). + +```{r, echo=FALSE} +# Pre-exercise-code (Run this code chunk first! Do NOT edit it.) + +# Click the green arrow ("Run Current Chunk") in the upper-right corner of this chunk. This will initialize the R objects needed in the exercise. Then move to Instructions of the exercise to start working. 
+ +BPRS <- read.table("https://raw.githubusercontent.com/KimmoVehkalahti/MABS/master/Examples/data/BPRS.txt", sep =" ", header = T) +``` + +### Instructions +- Factor variables treatment and subject +- Use `pivot_longer()` to convert BPRS to a long form +- Use `mutate()` and `substr()` to create column `week` by extracting the week number from column `weeks` +- Glimpse the data using `glimpse()` + +Hints: +- Use `pivot_longer()` to convert the data to a long form (the cols arguments ask for the variables to be kept *i.e.* not pivoted) +- Use `mutate()` and `substr()` to create `week` + +### R code +```{r} +# Work with the exercise in this chunk, step-by-step. Fix the R code! +# The data BPRS is available + +# Access the packages dplyr and tidyr +library(dplyr) +library(tidyr) + +# Factor treatment & subject +BPRS$treatment <- factor(BPRS$treatment) +BPRS$subject <- factor(BPRS$subject) + +# Convert to long form +BPRSL <- pivot_longer(BPRS, cols = -c(treatment, subject), + names_to = "weeks", values_to = "bprs") %>% + arrange(weeks) #order by weeks variable + +# Extract the week number +BPRSL <- BPRSL %>% + mutate(week = as.integer(substr("change me!"))) + +# Take a glimpse at the BPRSL data +glimpse(BPRSL) + +``` + + +## 6.3 Individuals on the plot + +Graphical displays of data are almost always useful for exposing patterns in the data, particularly when these are unexpected; this might be of great help in suggesting which class of models might be most sensibly applied in the later more formal analysis. + +To begin we shall plot the BPRS values for all 40 men, differentiating between the treatment groups into which the men have been randomized. This simple graph makes a number of features of the data readily apparent. + +REMEMBER: In `ggplot2` or `dplyr` syntax, you generally do not need to "quote" variable names! + +```{r, echo=FALSE} +# Pre-exercise-code (Run this code chunk first! Do NOT edit it.) 
+ +# Click the green arrow ("Run Current Chunk") in the upper-right corner of this chunk. This will initialize the R objects needed in the exercise. Then move to Instructions of the exercise to start working. + +library(dplyr); library(tidyr) +BPRS <- read.table("https://raw.githubusercontent.com/KimmoVehkalahti/MABS/master/Examples/data/BPRS.txt", sep =" ", header = T) +BPRS$treatment <- factor(BPRS$treatment) +BPRS$subject <- factor(BPRS$subject) +BPRSL <- pivot_longer(BPRS, cols=-c(treatment,subject),names_to = "weeks",values_to = "bprs") %>% arrange(weeks) +BPRSL <- BPRSL %>% mutate(week = as.integer(substr(weeks,5,5))) +rm(BPRS) +``` + +### Instructions +- Draw the plot with `week` on the x-axis and `bprs` on the y-axis +- Inspect the plot. See how both the BPRS-score and the variability between individuals decrease over the eight weeks time + +### R code +```{r} +# Work with the exercise in this chunk, step-by-step. Fix the R code! +# BPRSL is available + +#Access the package ggplot2 +library(ggplot2) + +# Draw the plot +ggplot(BPRSL, aes(x = "change me!", y = "change me too!", linetype = subject)) + + geom_line() + + scale_linetype_manual(values = rep(1:10, times=4)) + + facet_grid(. ~ treatment, labeller = label_both) + + theme(legend.position = "none") + + scale_y_continuous(limits = c(min(BPRSL$bprs), max(BPRSL$bprs))) + +``` + + +## 6.4 The Golden Standardise + +An important effect we want to take notice is how the men who have higher BPRS values at the beginning tend to have higher values throughout the study. This phenomenon is generally referred to as tracking. + +The tracking phenomenon can be seen more clearly in a plot of the standardized values of each +observation, i.e., the values obtained by subtracting the relevant occasion mean from the original observation and then dividing by the corresponding visit standard deviation. 
+ +$$standardised(x) = \frac{x - mean(x)}{ sd(x)}$$ + +REMEMBER: In `ggplot2` or `dplyr` syntax, you generally do not need to "quote" variable names! + +```{r, echo=FALSE} +# Pre-exercise-code (Run this code chunk first! Do NOT edit it.) + +# Click the green arrow ("Run Current Chunk") in the upper-right corner of this chunk. This will initialize the R objects needed in the exercise. Then move to Instructions of the exercise to start working. + +library(dplyr); library(tidyr); library(ggplot2) +BPRS <- read.table("https://raw.githubusercontent.com/KimmoVehkalahti/MABS/master/Examples/data/BPRS.txt", sep =" ", header = T) +BPRS$treatment <- factor(BPRS$treatment) +BPRS$subject <- factor(BPRS$subject) +BPRSL <- pivot_longer(BPRS, cols=-c(treatment,subject),names_to = "weeks",values_to = "bprs") %>% arrange(weeks) +BPRSL <- BPRSL %>% mutate(week = as.integer(substr(weeks,5,5))) +rm(BPRS) +``` + +### Instructions +- Assign `week` as the grouping variable +- Standardise the variable `bprs` +- Glimpse the data now with the standardised `brps` +- Plot the data now with the standardised `brps` + +Hint: +- Standardise the `bprs` by grouping variable `week` + +### R code +```{r} +# Work with the exercise in this chunk, step-by-step. Fix the R code! +# BPRSL is available + +library(dplyr) +library(tidyr) +# Standardise the variable bprs +BPRSL <- BPRSL %>% + group_by("change me!") %>% + mutate(stdbprs = "change me!") %>% + ungroup() + +# Glimpse the data +glimpse(BPRSL) + +# Plot again with the standardised bprs +library(ggplot2) +ggplot(BPRSL, aes(x = week, y = stdbprs, linetype = subject)) + + geom_line() + + scale_linetype_manual(values = rep(1:10, times=4)) + + facet_grid(. 
~ treatment, labeller = label_both) + + scale_y_continuous(name = "standardized bprs") + +``` + + +## 6.5 Good things come in Summary graphs + +With large numbers of observations, graphical displays of individual response profiles are of little use and investigators then commonly produce graphs showing average (mean) profiles for each treatment group along with some indication of the variation of the observations at each time point, in this case the standard error of mean + +$$se = \frac{sd(x)}{\sqrt{n}}$$ + +```{r, echo=FALSE} +# Pre-exercise-code (Run this code chunk first! Do NOT edit it.) + +# Click the green arrow ("Run Current Chunk") in the upper-right corner of this chunk. This will initialize the R objects needed in the exercise. Then move to Instructions of the exercise to start working. + +library(dplyr); library(tidyr); library(ggplot2) +BPRS <- read.table("https://raw.githubusercontent.com/KimmoVehkalahti/MABS/master/Examples/data/BPRS.txt", sep =" ", header = T) +BPRS$treatment <- factor(BPRS$treatment) +BPRS$subject <- factor(BPRS$subject) +BPRSL <- pivot_longer(BPRS, cols=-c(treatment,subject),names_to = "weeks",values_to = "bprs") %>% arrange(weeks) +BPRSL <- BPRSL %>% mutate(week = as.integer(substr(weeks,5,5))) +rm(BPRS) +BPRSL <- BPRSL %>% + group_by(week) %>% + mutate( stdbprs = (bprs - mean(bprs))/sd(bprs) ) %>% + ungroup() +``` + +### Instructions +- Create the summary data `BPRSS` with the mean and standard error of the variable `bprs` +- Glimpse the data +- Plot the mean profiles (with `geom_errorbar()` line commented out) +- Uncomment the `geom_errorbar()` line and plot the mean profiles again +- Note the considerable overlap in the mean profiles of the two treatment groups suggesting there might be little difference between the two groups in respect to the mean BPRS values + +Hint: +- Calculate the summary variables `mean` and `se` inside the `summarise()` function + +### R code +```{r} +# Work with the exercise in this chunk, 
step-by-step. Fix the R code! +# BPRSL is available + +# Number of subjects (per group): +n <- 20 + +library(dplyr) +library(tidyr) +# Summary data with mean and standard error of bprs by treatment and week +BPRSS <- BPRSL %>% + group_by(treatment, week) %>% + summarise( mean = "change me!", se = "change me!" ) %>% + ungroup() + +# Glimpse the data +glimpse(BPRSS) + +# Plot the mean profiles +library(ggplot2) +ggplot(BPRSS, aes(x = week, y = mean, linetype = treatment, shape = treatment)) + + geom_line() + + scale_linetype_manual(values = c(1,2)) + + geom_point(size=3) + + scale_shape_manual(values = c(1,2)) + + #geom_errorbar(aes(ymin=mean-se, ymax=mean+se, linetype="1"), width=0.3) + + theme(legend.position = c(0.8,0.8)) + + scale_y_continuous(name = "mean(bprs) +/- se(bprs)") + +``` + + +## 6.6 Find the outlaw... Outlier! + +As an example of the summary measure approach we will look into the post treatment values of the BPRS. The mean of weeks 1 to 8 will be our summary measure. First calculate this measure and then look at boxplots of the measure for each treatment group. See how the mean summary measure is more variable in the second treatment group and its distribution in this group is somewhat skew. The boxplot of the second group also reveals an outlier, a subject whose mean BPRS score of the eight weeks is over 70. It might bias the conclusions from further comparisons of the groups, so we shall remove that subject from the data. Without the outlier, try to figure which treatment group might have the lower the eight-week mean. Think, considering the variation, how can we be sure? + +```{r, echo=FALSE} +# Pre-exercise-code (Run this code chunk first! Do NOT edit it.) + +# Click the green arrow ("Run Current Chunk") in the upper-right corner of this chunk. This will initialize the R objects needed in the exercise. Then move to Instructions of the exercise to start working. 
+ +library(dplyr); library(tidyr); library(ggplot2) +BPRS <- read.table("https://raw.githubusercontent.com/KimmoVehkalahti/MABS/master/Examples/data/BPRS.txt", sep =" ", header = T) +BPRS$treatment <- factor(BPRS$treatment) +BPRS$subject <- factor(BPRS$subject) +BPRSL <- pivot_longer(BPRS, cols=-c(treatment,subject),names_to = "weeks",values_to = "bprs") %>% arrange(weeks) +BPRSL <- BPRSL %>% mutate(week = as.integer(substr(weeks,5,5))) +rm(BPRS) +BPRSL <- BPRSL %>% + group_by(week) %>% + mutate( stdbprs = (bprs - mean(bprs))/sd(bprs) ) %>% + ungroup() +``` + +### Instructions +- Create the summary data BPRSL8S +- Glimpse the data +- Draw the boxplot and observe the outlier +- Find a suitable threshold value and use `filter()` to exclude the outlier to form a new data BPRSL8S1 +- Glimpse and draw a boxplot of the new data to check the outlier has been dealt with + +### R code +```{r} +# Work with the exercise in this chunk, step-by-step. Fix the R code! +# BPRSL is available + +library(dplyr) +library(tidyr) +# Create a summary data by treatment and subject with mean as the summary variable (ignoring baseline week 0) +BPRSL8S <- BPRSL %>% + filter(week > 0) %>% + group_by(treatment, subject) %>% + summarise( mean=mean(bprs) ) %>% + ungroup() + +# Glimpse the data +glimpse(BPRSL8S) + +# Draw a boxplot of the mean versus treatment +library(ggplot2) +ggplot(BPRSL8S, aes(x = treatment, y = mean)) + + geom_boxplot() + + stat_summary(fun = "mean", geom = "point", shape=23, size=4, fill = "white") + + scale_y_continuous(name = "mean(bprs), weeks 1-8") + +# Create a new data by filtering the outlier and adjust the ggplot code the draw the plot again with the new data +BPRSL8S1 <- "change me!" + +``` + + +## 6.7 T for test and A for Anova + +Although the informal graphical material presented up to now has all indicated a lack of difference in the two treatment groups, most investigators would still require a formal test for a difference. 
Consequently we shall now apply a t-test to assess any difference between the treatment groups, and also calculate a confidence interval for this difference. We use the data created in the previous exercise, with the outlier removed. The t-test confirms the lack of any evidence for a group difference. Also the 95% confidence interval is wide and includes zero, allowing for similar conclusions to be made.
+
+Baseline measurements of the outcome variable in a longitudinal study are often correlated with the chosen summary measure, and using such measures in the analysis can often lead to substantial gains in precision when used appropriately as a covariate in an analysis of covariance. We can illustrate the analysis on the data using the BPRS value corresponding to time zero, taken prior to the start of treatment, as the baseline covariate. We see that the baseline BPRS is strongly related to the BPRS values taken after treatment has begun, but there is still no evidence of a treatment difference even after conditioning on the baseline value.
+
+```{r, echo=FALSE}
+# Pre-exercise-code (Run this code chunk first! Do NOT edit it.)
+
+# Click the green arrow ("Run Current Chunk") in the upper-right corner of this chunk. This will initialize the R objects needed in the exercise. Then move to Instructions of the exercise to start working. 
+ +library(dplyr); library(tidyr) +BPRS <- read.table("https://raw.githubusercontent.com/KimmoVehkalahti/MABS/master/Examples/data/BPRS.txt", sep =" ", header = T) +BPRS$treatment <- factor(BPRS$treatment) +BPRS$subject <- factor(BPRS$subject) +BPRSL <- pivot_longer(BPRS, cols=-c(treatment,subject),names_to = "weeks",values_to = "bprs") %>% arrange(weeks) +BPRSL <- BPRSL %>% mutate(week = as.integer(substr(weeks,5,5))) +BPRSL <- BPRSL %>% + group_by(week) %>% + mutate( stdbprs = (bprs - mean(bprs))/sd(bprs) ) %>% + ungroup() +BPRSL8S <- BPRSL %>% + filter(week > 0) %>% + group_by(treatment, subject) %>% + summarise( mean=mean(bprs) ) %>% + ungroup() +rm(BPRSL) +BPRSL8S1 <- BPRSL8S %>% + filter(mean < 60) +``` + +### Instructions +- Perform a two-sample t-test and observe the differences as seen in in the boxplots of the previous exercise +- Add the baseline from the original data as a new variable to the summary data +- Fit the linear model with `mean` as the target and `baseline` + `treatment` as the response from the `BPRSL8S1` (Remember the `lm()` formula `y` ~ `x1` + `x2`) +- Compute the analysis of variance table for the fitted model and pay close attention to the significance of `baseline` + +Hints: +- Perform the t-test +- Fit the linear model +- Compute `anova()` + +### R code +```{r} +# Work with the exercise in this chunk, step-by-step. Fix the R code! 
+# BPRSL8S & BPRSL8S1 datasets are available + +# Perform a two-sample t-test +t.test(mean ~ treatment, data = BPRSL8S1, var.equal = TRUE) + +library(dplyr) +library(tidyr) +# Add the baseline from the original data as a new variable to the summary data +BPRSL8S2 <- BPRSL8S %>% + mutate(baseline = BPRS$week0) + +# Fit the linear model with the mean as the response +fit <- lm("Linear model formula here!", data = BPRSL8S2) + +# Compute the analysis of variance table for the fitted model with anova() + + +``` + + +## 6.8 Meet and Repeat: PART II + +Longitudinal data, where a response variable is measured on each subject on several different occasions poses problems for their analysis because the repeated measurements on each subject are very likely to be correlated rather than independent. In PART II of these exercises methods for dealing with longitudinal data which aim to account for the correlated nature of the data and where the response is assumed to be normally distributed are discussed. + +To investigate the use of linear mixed effects models in practice, we shall use data from a nutrition study conducted in three groups of rats. The groups were put on different diets, and each animal’s body weight (grams) was recorded repeatedly (approximately) weekly, except in week seven when two recordings were taken) over a 9-week period. The question of most interest is whether the growth profiles of the three groups differ. + +```{r, echo=FALSE} +# Pre-exercise-code (Run this code chunk first! Do NOT edit it.) + +# Click the green arrow ("Run Current Chunk") in the upper-right corner of this chunk. This will initialize the R objects needed in the exercise. Then move to Instructions of the exercise to start working. 
+ +library(dplyr) +``` + +### Instructions +- Read the `RATS` data into memory +- Factor variables `ID` and `group` +- Glimpse the data + +Hints: +- Factor variables `ID` and `group` with `factor()` +- Glimpse the data with `glimpse()` + +### R code +```{r} +# Work with the exercise in this chunk, step-by-step. Fix the R code! + +# read in the RATS data +RATS <- read.table("https://raw.githubusercontent.com/KimmoVehkalahti/MABS/master/Examples/data/rats.txt", header = TRUE, sep = '\t') + +library(dplyr) +# Factor variables ID and Group + +# Glimpse the data + +``` + + +## 6.9 Linear Mixed Effects Models + +Again, to be able to study the differences between the variables of interest, that is the weight of the individual rats, and the groups as well as the change of the weight in time, we want to *pivot* the data to a long form. + +This time we need to extract the number of days as an integer variable. + +```{r, echo=FALSE} +# Pre-exercise-code (Run this code chunk first! Do NOT edit it.) + +# Click the green arrow ("Run Current Chunk") in the upper-right corner of this chunk. This will initialize the R objects needed in the exercise. Then move to Instructions of the exercise to start working. + +library(dplyr); library(tidyr) +RATS <- read.table("https://raw.githubusercontent.com/KimmoVehkalahti/MABS/master/Examples/data/rats.txt", header = TRUE, sep = '\t') +RATS$ID <- factor(RATS$ID) +RATS$Group <- factor(RATS$Group) +``` + +### Instructions +- Assign `names_to` as `WD` and `values_to` as `Weight` and convert the data to a long form +- Mutate a new variable `Time` by extracting the number of the day from `WD` +- `glimpse()` the data + +Hints: +- Assign `names_to` as `WD` and `values_to` as `Weight` +- Use `substr()` to extract the number of the day. Check what is the maximum number of digits in the numbers after `WD`. +- `glimpse()` the data + +### R code +```{r} +# Work with the exercise in this chunk, step-by-step. Fix the R code! 
+# RATS is available + +library(dplyr) +library(tidyr) +# Convert data to long form +RATSL <- pivot_longer(RATS, cols = -c(ID, Group), + names_to = "change me!", + values_to = "change me!") %>% + mutate(Time = as.integer(substr("change me!"))) %>% + arrange(Time) + +# Glimpse the data +glimpse(RATSL) + +``` + + +## 6.10 Plot first, ask questions later + +To begin, we shall ignore the repeated-measures structure of the data and assume that all the observations are independent of one another. Now if we simply ignore that the sets of 11 weights come from the same rat, we have a data set consisting of 176 weights, times, and group memberships that we see can easily be analyzed using multiple linear regression. To begin, we will plot the data, identifying the observations in each group but ignoring the longitudinal nature of the data. + +We'll start with a simple plot and continue by adding some styling elements. Feel free to experiment! + +```{r, echo=FALSE} +# Pre-exercise-code (Run this code chunk first! Do NOT edit it.) + +# Click the green arrow ("Run Current Chunk") in the upper-right corner of this chunk. This will initialize the R objects needed in the exercise. Then move to Instructions of the exercise to start working. + +library(dplyr); library(tidyr); library(ggplot2) +RATS <- read.table("https://raw.githubusercontent.com/KimmoVehkalahti/MABS/master/Examples/data/rats.txt", header = TRUE, sep = '\t') +RATS$ID <- factor(RATS$ID) +RATS$Group <- factor(RATS$Group) +RATSL <- pivot_longer(RATS, cols=-c(ID,Group), names_to = "WD",values_to = "Weight") %>% mutate(Time = as.integer(substr(WD,3,4))) %>% arrange(Time) +``` + +### Instructions +- Check the dimensions of RATSL +- Draw the `Weight` against `Time` plot +- Add line type aesthetics to differentiate the rat groups by assigning `aes(linetype = Group)` as an argument to `geom_line()` +- Add x-axis label and breaks by adding `scale_x_continuous(name = "Time (days)", breaks = seq(0, 60, 10))` to the plot. 
+- Add y-axis label by adding `scale_y_continuous(name = "Weight (grams)")` +- Change the position of the legend by adding `theme(legend.position = "top")`. +- Observe the difference between the weights of the rats in Group 1 and those in the other two groups + +Hints: +- Use `dim()` to check the dimensions +- Draw the plot with the designated style elements. To add a new style element add `+`to the end of the previous line and add the new element on a new indented line. + +### R code +```{r} +# Work with the exercise in this chunk, step-by-step. Fix the R code! +# RATSL is available + +library(dplyr) +library(tidyr) +# Check the dimensions of the data + + +# Plot the RATSL data +library(ggplot2) +ggplot(RATSL, aes(x = Time, y = Weight, group = ID)) + + geom_line() + + + + +``` + + +## 6.11 Holding on to independence: The Linear model + +Continuing to ignore the repeated-measures structure of the data, we will fit a multiple linear regression model with weight as response and `Time` and `Group` as explanatory variables. + +Recall again from *Multiple regression* exercises that this is done by defining explanatory variables with the `formula` argument of `lm()`, as below + +``` +y ~ x1 + x2 + .. +``` +Here `y` is again the target variable and `x1, x2, ..` are the explanatory variables. + +```{r, echo=FALSE} +# Pre-exercise-code (Run this code chunk first! Do NOT edit it.) + +# Click the green arrow ("Run Current Chunk") in the upper-right corner of this chunk. This will initialize the R objects needed in the exercise. Then move to Instructions of the exercise to start working. 
+ +library(dplyr); library(tidyr) +RATS <- read.table("https://raw.githubusercontent.com/KimmoVehkalahti/MABS/master/Examples/data/rats.txt", header = TRUE, sep = '\t') +RATS$ID <- factor(RATS$ID) +RATS$Group <- factor(RATS$Group) +RATSL <- pivot_longer(RATS, cols=-c(ID,Group), names_to = "WD",values_to = "Weight") %>% mutate(Time = as.integer(substr(WD,3,4))) %>% arrange(Time) +``` + +### Instructions +- Create a regression model with `Weight` as the response variable and `Time` and `Group` as explanatory variables +- Print out the summary of the model +- Observe 1) How Group2 and Group3 differ from Group1 +conditional on `Time` and 2) The significance of the regression on `Time` + +Hints: +- Use `lm()` with the formula `Weight ~ Time + Group` +- Use `summary()` to print the summary of the model + +### R code +```{r} +# Work with the exercise in this chunk, step-by-step. Fix the R code! +# RATS and RATSL are available + +# create a regression model RATS_reg +RATS_reg <- "Regression model here!" + +# print out a summary of the model + +``` + + +## 6.12 The Random Intercept Model + +The previous model assumes independence of the repeated measures of weight, and this assumption is highly unlikely. So, now we will move on to consider both some more appropriate graphics and appropriate models. + +To begin the more formal analysis of the rat growth data, we will first fit the *random intercept model* for the same two explanatory variables: `Time` and `Group`. Fitting a random intercept model allows the linear regression fit for each rat to differ in *intercept* from other rats. + +We will use the `lme4` package which offers efficient tools for fitting linear and generalized linear mixed-effects models. The first argument is the `formula` object describing both the fixed-effects and random effects part of the model, with the response on the left of a ~ operator and the terms, separated by + operators, on the right. 
Note the random-effects terms distinguished by vertical bars (|). + +**Note:** You should first install the package `lme4`. + +```{r, echo=FALSE} +# Pre-exercise-code (Run this code chunk first! Do NOT edit it.) + +# Click the green arrow ("Run Current Chunk") in the upper-right corner of this chunk. This will initialize the R objects needed in the exercise. Then move to Instructions of the exercise to start working. + +library(dplyr); library(tidyr) +RATS <- read.table("https://raw.githubusercontent.com/KimmoVehkalahti/MABS/master/Examples/data/rats.txt", header = TRUE, sep = '\t') +RATS$ID <- factor(RATS$ID) +RATS$Group <- factor(RATS$Group) +RATSL <- pivot_longer(RATS, cols=-c(ID,Group), names_to = "WD",values_to = "Weight") %>% mutate(Time = as.integer(substr(WD,3,4))) %>% arrange(Time) +``` + +### Instructions +- Access the `lme4` package +- Fit the random intercept model with the rat `ID` as the random effect +- Print out the summary of the model +- Pay attention to variability (standard deviation) of the rat `ID` + +Hints: +- Fit the random intercept model with the rat `ID` as the random effect +- Print out the summary of the model with `summary()` + +### R code +```{r} +# Work with the exercise in this chunk, step-by-step. Fix the R code! +# RATS and RATSL are available + +# access library lme4 +library(lme4) + +# Create a random intercept model +RATS_ref <- lmer(Weight ~ Time + Group + (1 | ID), data = RATSL, REML = FALSE) + +# Print the summary of the model + + +``` + + +## 6.13 Slippery slopes: Random Intercept and Random Slope Model + +Now we can move on to fit the *random intercept and random slope model* to the rat growth data. Fitting a random intercept and random slope model allows the linear regression fits for each individual to differ in intercept but also in slope. This way it is possible to account for the individual differences in the rats' growth profiles, but also the effect of time. 
+ +```{r, echo=FALSE} +# Pre-exercise-code (Run this code chunk first! Do NOT edit it.) + +# Click the green arrow ("Run Current Chunk") in the upper-right corner of this chunk. This will initialize the R objects needed in the exercise. Then move to Instructions of the exercise to start working. + +library(dplyr); library(tidyr); library(lme4); library(ggplot2) +RATS <- read.table("https://raw.githubusercontent.com/KimmoVehkalahti/MABS/master/Examples/data/rats.txt", header = TRUE, sep = '\t') +RATS$ID <- factor(RATS$ID) +RATS$Group <- factor(RATS$Group) +RATSL <- pivot_longer(RATS, cols=-c(ID,Group), names_to = "WD",values_to = "Weight") %>% mutate(Time = as.integer(substr(WD,3,4))) %>% arrange(Time) +RATS_ref <- lmer(Weight ~ Time + Group + (1 | ID), data = RATSL, REML = FALSE) +``` + +### Instructions +- Fit the random intercept and slope model with `Time` and `ID` as the random effects +- Print the summary of the model +- Compute the analysis of variance tables of the models `RATS_ref` and `RATS_ref1` +- Pay attention to the chi-squared statistics and p-value of the likelihood ratio test between `RATS_ref1` and `RATS_ref`. The lower the value the better the fit against the comparison model. + +Hints: +- Print the summary of the model with `summary()` +- Compute the analysis of variance tables of the models `RATS_ref1` and `RATS_ref` with `anova()` + +### R code +```{r} +# Work with the exercise in this chunk, step-by-step. Fix the R code! +# RATS and RATSL are available + +# create a random intercept and random slope model +library(lme4) +RATS_ref1 <- lmer(Weight ~ Time + Group + (Time | ID), data = RATSL, REML = FALSE) + +# print a summary of the model + + +# perform an ANOVA test on the two models +anova(RATS_ref1, RATS_ref) + +``` + + +## 6.14 Time to interact: Random Intercept and Random Slope Model with interaction + +Finally, we can fit a random intercept and slope model that allows for a group × time interaction. 
+ +```{r, echo=FALSE} +# Pre-exercise-code (Run this code chunk first! Do NOT edit it.) + +# Click the green arrow ("Run Current Chunk") in the upper-right corner of this chunk. This will initialize the R objects needed in the exercise. Then move to Instructions of the exercise to start working. + +library(dplyr); library(tidyr); library(lme4); library(ggplot2) +RATS <- read.table("https://raw.githubusercontent.com/KimmoVehkalahti/MABS/master/Examples/data/rats.txt", header = TRUE, sep = '\t') +RATS$ID <- factor(RATS$ID) +RATS$Group <- factor(RATS$Group) +RATSL <- pivot_longer(RATS, cols=-c(ID,Group), names_to = "WD",values_to = "Weight") %>% mutate(Time = as.integer(substr(WD,3,4))) %>% arrange(Time) +RATS_ref <- lmer(Weight ~ Time + Group + (1 | ID), data = RATSL, REML = FALSE) +RATS_ref1 <- lmer(Weight ~ Time + Group + (Time | ID), data = RATSL, REML = FALSE) +``` + +### Instructions +- Write the same model as in the previous exercise but add `Time` * `Group` interaction. +- Print out the summary of the model +- Compute the analysis of variance tables of the models `RATS_ref2` and `RATS_ref1` +- Again pay attention to the likelihood ratio test chi-squared value and the according p-value. The lower the value the better the fit against the comparison model. +- Draw the plot of *observed* values of RATSL (this is the same plot drawn earlier) +- Create a vector of the fitted values of the model using the function `fitted()` +- Use for example `mutate()` to add the vector `Fitted` as a new column to RATSL +- Draw the plot of *fitted* values of RATSL + +Hints: +- Print the summary of the model with `summary()` +- Compute the analysis of variance tables of the models `RATS_ref1` and `RATS_ref` with `anova()` +- Create a vector of the fitted values of the model using the function `fitted()`. Supply it with the model `RATS_ref2` +- Use `mutate()` to add the vector `Fitted` as a new column to RATSL. 
+ +### R code +```{r} +# Work with the exercise in this chunk, step-by-step. Fix the R code! +# RATS and RATSL are available + +# create a random intercept and random slope model with the interaction +library(lme4) +RATS_ref2 <- "Write the model here" + +# print a summary of the model + + +# perform an ANOVA test on the two models +anova(RATS_ref2, RATS_ref1) + +# draw the plot of RATSL with the observed Weight values +ggplot(RATSL, aes(x = Time, y = Weight, group = ID)) + + geom_line(aes(linetype = Group)) + + scale_x_continuous(name = "Time (days)", breaks = seq(0, 60, 20)) + + scale_y_continuous(name = "Observed weight (grams)") + + theme(legend.position = "top") + +# Create a vector of the fitted values +Fitted <- "change me!" + +library(dplyr) +library(tidyr) +# Create a new column fitted to RATSL + + +# draw the plot of RATSL with the Fitted values of weight +library(ggplot2) +ggplot(RATSL, aes(x = Time, y = "change me!", group = ID)) + + geom_line(aes(linetype = Group)) + + scale_x_continuous(name = "Time (days)", breaks = seq(0, 60, 20)) + + scale_y_continuous(name = "Fitted weight (grams)") + + theme(legend.position = "top") + + +``` + +**Very well done!!!** diff --git a/IODS.Rproj b/IODS.Rproj new file mode 100644 index 000000000..8e3c2ebc9 --- /dev/null +++ b/IODS.Rproj @@ -0,0 +1,13 @@ +Version: 1.0 + +RestoreWorkspace: Default +SaveWorkspace: Default +AlwaysSaveHistory: Default + +EnableCodeIndexing: Yes +UseSpacesForTab: Yes +NumSpacesForTab: 2 +Encoding: UTF-8 + +RnwWeave: Sweave +LaTeX: pdfLaTeX diff --git a/R Data/Create_Human_Week5.R b/R Data/Create_Human_Week5.R new file mode 100644 index 000000000..890461166 --- /dev/null +++ b/R Data/Create_Human_Week5.R @@ -0,0 +1,88 @@ +#Name: Subam Kathet +#Date: 05 December 2022 + +#IODS Exercise set 5 - Dimensionality reduction techniques +#R-learning codes for week 5 data wrangling exercises + +#We will be working with the same human data wrangled during last weeks exercise session +#The csv file was 
saved in the project folder + +#Human data originates from United Nations Development Program +#Link: https://hdr.undp.org/data-center/human-development-index#/indicies/HDI + +#Overview of the data + ##calculating the human development indices-graphical presentation + # https://hdr.undp.org/system/files/documents//technical-notes-calculating-human-development-indices.pdf + +#Lets start the data wrangling exercise + +#packages +library(tidyverse) +library(dplyr) +library(ggplot2) + +##Step 1## Importing data set into R +#Lets import last week's csv file +human <- read_csv("human.csv") +dim("human") +str("human") +summary("human") +#195 observation and 19 variables, looks great !! + +##Step 2## Mutate the data +#Transform the Gross National Income (GNI) variable to numeric (using string manipulation) + +library(stringr) +str(human$GNI) +str_replace(human$GNI, pattern=",", replace ="") %>% as.numeric + +#Check data now +summary(human$GNI) +print(human$GNI) + +##Step 3## Exclude unneeded variables +#keep only the columns matching the following variable names + +library(dplyr) +# columns to keep +keep <- c("SeEdu_FM", "LFR_FM", "Life_Exp", "Exp_Edu", "GNI", "MMR", "ABR", "%PR") +human <- select(human, one_of(keep)) +include <- complete.cases(human) + + +#Lets see now +str(human); dim(human) +#195 observation and 8 variables, great !! lets continue + + +##Step 4## Remove all rows with missing values + +data.frame(human[-1], comp = include) + +human_ <- filter(human, include) + +rownames(human_) <- human_$Country + +##Step 5## Remove the observations which relate to regions instead of countries + +tail(human_, n = 10) +last <- nrow(human_) - 7 +human_ <- human_[1:last, ] + +##Step 6## #Define the row names of the data by the country names and remove the country name column from the data +human_ +human_$GNI <- gsub(",", "", human_$GNI) %>% as.numeric +str(human_);dim(human_) + +#155 observations and 8 variables, great !! 
+ +##Step 7## Save the file +#save new data set to project folder +write.csv(human_, file="human_.csv") + +#check +read.csv('human_.csv', row.names = 1) + +#Data wrangling complete !! lets move on to the analysis + + diff --git a/R Data/Exercise 2_Rcodes.R b/R Data/Exercise 2_Rcodes.R new file mode 100644 index 000000000..70338eac9 --- /dev/null +++ b/R Data/Exercise 2_Rcodes.R @@ -0,0 +1,228 @@ +#Name: Subam Kathet +#Date: 14 November 2022 +#Description: R codes and some notes for exercise set 2 for IODS course + +library(dplyr) +library(tidyverse) +library(GGally) +library(ggplot2) + +# Use read.tabe to import the data set into R through the link + +lrn14 <- read.table("http://www.helsinki.fi/~kvehkala/JYTmooc/JYTOPKYS3-data.txt", sep="\t", header=TRUE) + +#use .txt file to import data set for better description. +# Preliminary results available at http://www.slideshare.net/kimmovehkalahti/the-relationship-between-learning-approaches-and-students-achievements-in-an-introductory-statistics-course-in-finland +#Total respondents n=183, total question n=60, so 184 rows including heading and 60 columns +#The data set is basically outomes of international survey on approaches to learning conducted from the social science department of university of helinki. +#The code as respective column heading represents a question related to the survey and number. Each SN is a respondents and the answers to each question are given in a Lickert scale (0-5). + +#Print the data set + +print(dim(lrn14)) +print(class(lrn14)) +print(str(lrn14)) + +# Week 2: Regression and model validation + +# 2.1 Reading data from the web + +lrn14 <- read.table("http://www.helsinki.fi/~kvehkala/JYTmooc/JYTOPKYS3-meta.txt", sep="\t", header=TRUE) + +# Look at the dimensions of the data + +# Look at the structure of the data +#use .txt file to import data set for better description. 
+# Preliminary results available at http://www.slideshare.net/kimmovehkalahti/the-relationship-between-learning-approaches-and-students-achievements-in-an-introductory-statistics-course-in-finland +#Total respondents n=183, total question n=60, so 184 rows including heading and 60 columns +#The code as respective column heading represents a question related to the survey and number. Each SN is a respondent and the answers to each question are given in a Likert scale (0-5). + +dim(lrn14) +str(lrn14) + +## 2.2 Scaling variables + +#The next step is [wrangling the data](https://en.wikipedia.org/wiki/Data_wrangling) into a format that is easy to analyze. We will wrangle our data for the next few exercises. +#A neat thing about R is that many operations are *vectorized*. It means that a single operation can affect all elements of a vector. This is often convenient. +#The column `Attitude` in `lrn14` is a sum of 10 questions related to students' attitude towards statistics, each measured on the [Likert scale](https://en.wikipedia.org/wiki/Likert_scale) (1-5). Here we'll scale the combination variable back to the 1-5 scale. 
+ +lrn14$attitude <- lrn14$Attitude / 10 + +## 2.3 Combining variables + +# questions related to deep, surface and strategic learning +deep_questions <- c("D03", "D11", "D19", "D27", "D07", "D14", "D22", "D30","D06", "D15", "D23", "D31") +surface_questions <- c("SU02","SU10","SU18","SU26", "SU05","SU13","SU21","SU29","SU08","SU16","SU24","SU32") +strategic_questions <- c("ST01","ST09","ST17","ST25","ST04","ST12","ST20","ST28") + +# select the columns related to deep learning +deep_columns <- select(lrn14, one_of(deep_questions)) +# and create column 'deep' by averaging +lrn14$deep <- rowMeans(deep_columns) + +# select the columns related to surface learning +surface_columns <- select(lrn14, one_of(surface_questions)) +# and create column 'surf' by averaging +lrn14$surf <- rowMeans(surface_columns) + +# select the columns related to strategic learning +strategic_columns <- select(lrn14, one_of(strategic_questions)) +# and create column 'stra' by averaging +lrn14$stra <- rowMeans(strategic_columns) + +## 2.4 Selecting columns + +library(dplyr) + +# choose a handful of columns to keep +keep_columns <- c("gender","Age","attitude", "deep", "stra", "surf", "Points") + +# select the 'keep_columns' to create a new dataset +learning2014 <- select(lrn14,all_of(keep_columns)) + +# see the structure of the new dataset + +print(learning2014) + +## 2.5 Modifying column names + +print(names(learning2014)) +colnames(learning2014)[2] <- "age" +learning2014 <- rename(learning2014, points = Points) +print(dim(learning2014)) #check the dimension now (must have 166 rown and 7) + +## 2.6 Excluding observations + +learning2014 <- learning2014[learning2014$points > 0,] +dim(lrn14) +dim(learning2014) + +#Export csv file +setwd("~/Documents/GitHub/IODS-project") +write_csv(learning2014, 'learning2014.csv') + +## 2.7 Visualizations with ggplot2 + +#[**ggplot2**](http://ggplot2.org/) is a popular library for creating stunning graphics with R. 
It has some advantages over the basic plotting system in R, mainly consistent use of function arguments and flexible plot alteration. ggplot2 is an implementation of Leland Wilkinson's *Grammar of Graphics* — a general scheme for data visualization. + +#In ggplot2, plots may be created via the convenience function `qplot()` where arguments and defaults are meant to be similar to base R's `plot()` function. More complex plotting capacity is available via `ggplot()`, which exposes the user to more explicit elements of the grammar. (from [wikipedia](https://en.wikipedia.org/wiki/Ggplot2)) + +#RStudio has a [cheatsheet](https://www.rstudio.com/resources/cheatsheets/) for data visualization with ggplot2. + +# initialize plot with data and aesthetic mapping +p1 <- ggplot(learning2014, aes(x = attitude, y = points)) + +# define the visualization type (points) +p2 <- p1 + geom_point() + +# draw the plot +p2 + +# add a regression line +p3 <- p2 + geom_smooth(method = "lm") + +# draw the plot +p3 + +#Lets try and overview summary +p <- ggpairs(learning2014, mapping = aes(col = gender, alpha = 0.3), lower = list(combo = wrap("facethist", bins = 20))) +# draw the plot! +p + + +## 2.8 Exploring a data frame + +#Often the most interesting feature of your data are the relationships between the variables. If there are only a handful of variables saved as columns in a data frame, it is possible to visualize all of these relationships neatly in a single plot. + +#Base R offers a fast plotting function `pairs()`, which draws all possible scatter plots from the columns of a data frame, resulting in a scatter plot matrix. Libraries **GGally** and **ggplot2** together offer a slow but more detailed look at the variables, their distributions and relationships. + + +### R code + +# Work with the exercise in this chunk, step-by-step. Fix the R code! +# learning2014 is available + +# draw a scatter plot matrix of the variables in learning2014. 
+# [-1] excludes the first column (gender) +pairs(learning2014[-1]) + +# access the GGally and ggplot2 libraries +library(GGally) +library(ggplot2) + +# create a more advanced plot matrix with ggpairs() +p <- ggpairs(learning2014, mapping = aes(), lower = list(combo = wrap("facethist", bins = 20))) + +## 2.9 Simple regression + + +# Work with the exercise in this chunk, step-by-step. Fix the R code! +# learning2014 is available + +# a scatter plot of points versus attitude +library(ggplot2) +qplot(attitude, points, data = learning2014) + geom_smooth(method = "lm") + +# fit a linear model +my_model <- lm(points ~ 1, data = learning2014) + +# print out a summary of the model +summary(my_model) + +## 2.10 Multiple regression +# Work with the exercise in this chunk, step-by-step. Fix the R code! +# learning2014 is available + +# create an plot matrix with ggpairs() +ggpairs(learning2014, lower = list(combo = wrap("facethist", bins = 20))) + +# create a regression model with multiple explanatory variables +my_model2 <- lm(points ~ attitude + stra, data = learning2014) + +# print out a summary of the model +summary(my_model2) + +## 2.11 Graphical model validation +# Work with the exercise in this chunk, step-by-step. Fix the R code! +# learning2014 is available + +# create a regression model with multiple explanatory variables +my_model2 <- lm(points ~ attitude + stra, data = learning2014) + +# draw diagnostic plots using the plot() function. 
Choose the plots 1, 2 and 5 +plot(my_model2, which = 1) + +plot(my_model2, which = 2) + +plot(my_model2, which = 3) + +plot(my_model2, which = 4) + +plot(my_model2, which = 5) + +plot(my_model2, which = 6) + +## 2.12 Making predictions + +# Create model object m +m <- lm(points ~ attitude, data = learning2014) + +# print out a summary of the model +summary(m) + +# New observations +new_attitudes <- c("Mia" = 3.8, "Mike"= 4.4, "Riikka" = 2.2, "Pekka" = 2.9) +new_data <- data.frame(attitude = new_attitudes) + +# Print out the new data +summary(new_data) + +# Predict the new students exam points based on attitude +predict(m, newdata = new_data) + + + + + + + diff --git a/R Data/create_alc.R b/R Data/create_alc.R new file mode 100644 index 000000000..14339e57f --- /dev/null +++ b/R Data/create_alc.R @@ -0,0 +1,106 @@ +#Name: Subam Kathet +#Date: 19 November 2022 + +#IODS Exercise set 3 - Logistic Regression +#R-learning codes for Logistic Regression + +#We will be working with two new data sets retrieved from the UCI Machine Learning Repository +#Resource for data base: https://archive.ics.uci.edu/ml/datasets.html +#The data are from two identical questionnaires related to secondary school student alcohol consumption in Portugal + +#This script consists of all the codes used during the data wrangling exercise. + +# LET'S START !!! + +#Two csv files has been downloaded in the course folder from the link in the description. 
+ +#Importing data set into R + +math <- read.table("student-mat.csv", sep = ";", header = TRUE) +por <- read.table("student-por.csv", sep = ";", header = TRUE) + + +# Let's see how the data looks + +##data set math +dim(math) +str(math) +colnames(math) + +##395 observation of 33 variables + +##data set por +dim(por) +str(por) +colnames(por) +## 649 observation of 33 variables + + +# Lets join the two columns + +#But before that we set the columns not to be used as identifier +free_cols <- c("failures", "paid", "absences", "G1", "G2", "G3") + +# Then set columns to be joined +join_cols <- setdiff(colnames(por), free_cols) + +# And finally join columns +math_por <- inner_join(math, por, by = join_cols, suffix = c(".math", ".por")) + +#Let's see how the data set looks now ! +str(math_por) +dim(math_por) + +#370 rows and 39 columns + +#Removing the duplicates + +alc <- select(math_por, all_of(join_cols)) + + +for(col_name in free_cols) { + + two_cols <- select(math_por, starts_with(col_name)) + + first_col <- select(two_cols, 1)[[1]] + + + if(is.numeric(first_col)) { + + alc[col_name] <- round(rowMeans(two_cols)) + } else { + alc[col_name] <- first_col + } +} + +# Lets see the data using "glimpse" +glimpse(alc) + +#And dimention +dim(alc) +#Dataset now consists of 370 rows and 33 columns + +#Joining two data sets and making changes + +## calculating weekday and weekend average alcohol consumption and adding column +alc <- mutate(alc, alc_use = (Dalc + Walc) / 2) + +## adding a new column "high use" if consumption value is greater than 2 +alc <- mutate(alc, high_use = alc_use > 2) + + +# Lets make .csv file and export this into the project folder +write_csv(alc, "alc.csv") +glimpse(alc) + +# Lets look at the final outcome +glimpse(math_por) +glimpse(alc) + +#Rows: 370 +#Columns: 35 + +#Everything looks fine !!! 
+ + + diff --git a/RATSL.csv b/RATSL.csv new file mode 100644 index 000000000..9f69b6a86 --- /dev/null +++ b/RATSL.csv @@ -0,0 +1,177 @@ +ID,Group,WD,Weight,Time +1,1,WD1,240,1 +2,1,WD1,225,1 +3,1,WD1,245,1 +4,1,WD1,260,1 +5,1,WD1,255,1 +6,1,WD1,260,1 +7,1,WD1,275,1 +8,1,WD1,245,1 +9,2,WD1,410,1 +10,2,WD1,405,1 +11,2,WD1,445,1 +12,2,WD1,555,1 +13,3,WD1,470,1 +14,3,WD1,535,1 +15,3,WD1,520,1 +16,3,WD1,510,1 +1,1,WD8,250,8 +2,1,WD8,230,8 +3,1,WD8,250,8 +4,1,WD8,255,8 +5,1,WD8,260,8 +6,1,WD8,265,8 +7,1,WD8,275,8 +8,1,WD8,255,8 +9,2,WD8,415,8 +10,2,WD8,420,8 +11,2,WD8,445,8 +12,2,WD8,560,8 +13,3,WD8,465,8 +14,3,WD8,525,8 +15,3,WD8,525,8 +16,3,WD8,510,8 +1,1,WD15,255,15 +2,1,WD15,230,15 +3,1,WD15,250,15 +4,1,WD15,255,15 +5,1,WD15,255,15 +6,1,WD15,270,15 +7,1,WD15,260,15 +8,1,WD15,260,15 +9,2,WD15,425,15 +10,2,WD15,430,15 +11,2,WD15,450,15 +12,2,WD15,565,15 +13,3,WD15,475,15 +14,3,WD15,530,15 +15,3,WD15,530,15 +16,3,WD15,520,15 +1,1,WD22,260,22 +2,1,WD22,232,22 +3,1,WD22,255,22 +4,1,WD22,265,22 +5,1,WD22,270,22 +6,1,WD22,275,22 +7,1,WD22,270,22 +8,1,WD22,268,22 +9,2,WD22,428,22 +10,2,WD22,440,22 +11,2,WD22,452,22 +12,2,WD22,580,22 +13,3,WD22,485,22 +14,3,WD22,533,22 +15,3,WD22,540,22 +16,3,WD22,515,22 +1,1,WD29,262,29 +2,1,WD29,240,29 +3,1,WD29,262,29 +4,1,WD29,265,29 +5,1,WD29,270,29 +6,1,WD29,275,29 +7,1,WD29,273,29 +8,1,WD29,270,29 +9,2,WD29,438,29 +10,2,WD29,448,29 +11,2,WD29,455,29 +12,2,WD29,590,29 +13,3,WD29,487,29 +14,3,WD29,535,29 +15,3,WD29,543,29 +16,3,WD29,530,29 +1,1,WD36,258,36 +2,1,WD36,240,36 +3,1,WD36,265,36 +4,1,WD36,268,36 +5,1,WD36,273,36 +6,1,WD36,277,36 +7,1,WD36,274,36 +8,1,WD36,265,36 +9,2,WD36,443,36 +10,2,WD36,460,36 +11,2,WD36,455,36 +12,2,WD36,597,36 +13,3,WD36,493,36 +14,3,WD36,540,36 +15,3,WD36,546,36 +16,3,WD36,538,36 +1,1,WD43,266,43 +2,1,WD43,243,43 +3,1,WD43,267,43 +4,1,WD43,270,43 +5,1,WD43,274,43 +6,1,WD43,278,43 +7,1,WD43,276,43 +8,1,WD43,265,43 +9,2,WD43,442,43 +10,2,WD43,458,43 +11,2,WD43,451,43 +12,2,WD43,595,43 +13,3,WD43,493,43 
+14,3,WD43,525,43 +15,3,WD43,538,43 +16,3,WD43,535,43 +1,1,WD44,266,44 +2,1,WD44,244,44 +3,1,WD44,267,44 +4,1,WD44,272,44 +5,1,WD44,273,44 +6,1,WD44,278,44 +7,1,WD44,271,44 +8,1,WD44,267,44 +9,2,WD44,446,44 +10,2,WD44,464,44 +11,2,WD44,450,44 +12,2,WD44,595,44 +13,3,WD44,504,44 +14,3,WD44,530,44 +15,3,WD44,544,44 +16,3,WD44,542,44 +1,1,WD50,265,50 +2,1,WD50,238,50 +3,1,WD50,264,50 +4,1,WD50,274,50 +5,1,WD50,276,50 +6,1,WD50,284,50 +7,1,WD50,282,50 +8,1,WD50,273,50 +9,2,WD50,456,50 +10,2,WD50,475,50 +11,2,WD50,462,50 +12,2,WD50,612,50 +13,3,WD50,507,50 +14,3,WD50,543,50 +15,3,WD50,553,50 +16,3,WD50,550,50 +1,1,WD57,272,57 +2,1,WD57,247,57 +3,1,WD57,268,57 +4,1,WD57,273,57 +5,1,WD57,278,57 +6,1,WD57,279,57 +7,1,WD57,281,57 +8,1,WD57,274,57 +9,2,WD57,468,57 +10,2,WD57,484,57 +11,2,WD57,466,57 +12,2,WD57,618,57 +13,3,WD57,518,57 +14,3,WD57,544,57 +15,3,WD57,555,57 +16,3,WD57,553,57 +1,1,WD64,278,64 +2,1,WD64,245,64 +3,1,WD64,269,64 +4,1,WD64,275,64 +5,1,WD64,280,64 +6,1,WD64,281,64 +7,1,WD64,284,64 +8,1,WD64,278,64 +9,2,WD64,478,64 +10,2,WD64,496,64 +11,2,WD64,472,64 +12,2,WD64,628,64 +13,3,WD64,525,64 +14,3,WD64,559,64 +15,3,WD64,548,64 +16,3,WD64,569,64 diff --git a/alc.csv b/alc.csv new file mode 100644 index 000000000..1a570b506 --- /dev/null +++ b/alc.csv @@ -0,0 +1,371 @@ +school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,reason,guardian,traveltime,studytime,schoolsup,famsup,activities,nursery,higher,internet,romantic,famrel,freetime,goout,Dalc,Walc,health,failures,paid,absences,G1,G2,G3,alc_use,high_use +GP,F,18,U,GT3,A,4,4,at_home,teacher,course,mother,2,2,yes,no,no,yes,yes,no,no,4,3,4,1,1,3,0,no,5,2,8,8,1,FALSE +GP,F,17,U,GT3,T,1,1,at_home,other,course,father,1,2,no,yes,no,no,yes,yes,no,5,3,3,1,1,3,0,no,3,7,8,8,1,FALSE +GP,F,15,U,LE3,T,1,1,at_home,other,other,mother,1,2,yes,no,no,yes,yes,yes,no,4,3,2,2,3,3,2,yes,8,10,10,11,2.5,TRUE +GP,F,15,U,GT3,T,4,2,health,services,home,mother,1,3,no,yes,yes,yes,yes,yes,yes,3,2,2,1,1,5,0,yes,1,14,14,14,1,FALSE 
+GP,F,16,U,GT3,T,3,3,other,other,home,father,1,2,no,yes,no,yes,yes,no,no,4,3,2,1,2,5,0,yes,2,8,12,12,1.5,FALSE +GP,M,16,U,LE3,T,4,3,services,other,reputation,mother,1,2,no,yes,yes,yes,yes,yes,no,5,4,2,1,2,5,0,yes,8,14,14,14,1.5,FALSE +GP,M,16,U,LE3,T,2,2,other,other,home,mother,1,2,no,no,no,yes,yes,yes,no,4,4,4,1,1,3,0,no,0,12,12,12,1,FALSE +GP,F,17,U,GT3,A,4,4,other,teacher,home,mother,2,2,yes,yes,no,yes,yes,no,no,4,1,4,1,1,1,0,no,4,8,9,10,1,FALSE +GP,M,15,U,LE3,A,3,2,services,other,home,mother,1,2,no,yes,no,yes,yes,yes,no,4,2,2,1,1,1,0,yes,0,16,17,18,1,FALSE +GP,M,15,U,GT3,T,3,4,other,other,home,mother,1,2,no,yes,yes,yes,yes,yes,no,5,5,1,1,1,5,0,yes,0,13,14,14,1,FALSE +GP,F,15,U,GT3,T,4,4,teacher,health,reputation,mother,1,2,no,yes,no,yes,yes,yes,no,3,3,3,1,2,2,0,yes,1,12,11,12,1.5,FALSE +GP,F,15,U,GT3,T,2,1,services,other,reputation,father,3,3,no,yes,yes,yes,yes,yes,no,5,2,2,1,1,4,0,no,2,10,12,12,1,FALSE +GP,M,15,U,LE3,T,4,4,health,services,course,father,1,1,no,yes,yes,yes,yes,yes,no,4,3,3,1,3,5,0,yes,1,13,14,13,2,FALSE +GP,M,15,U,GT3,T,4,3,teacher,other,course,mother,2,2,no,yes,no,yes,yes,yes,no,5,4,3,1,2,3,0,yes,1,11,11,12,1.5,FALSE +GP,M,15,U,GT3,A,2,2,other,other,home,other,1,3,no,yes,no,yes,yes,yes,yes,4,5,2,1,1,3,0,no,0,14,15,16,1,FALSE +GP,F,16,U,GT3,T,4,4,health,other,home,mother,1,1,no,yes,no,yes,yes,yes,no,4,4,4,1,2,2,0,no,5,16,16,16,1.5,FALSE +GP,F,16,U,GT3,T,4,4,services,services,reputation,mother,1,3,no,yes,yes,yes,yes,yes,no,3,2,3,1,2,2,0,yes,8,13,14,14,1.5,FALSE +GP,F,16,U,GT3,T,3,3,other,other,reputation,mother,3,2,yes,yes,yes,yes,yes,no,no,5,3,2,1,1,4,0,no,3,10,12,12,1,FALSE +GP,M,17,U,GT3,T,3,2,services,services,course,mother,1,1,no,yes,yes,yes,yes,yes,no,5,5,5,2,4,5,3,no,9,7,6,6,3,TRUE +GP,M,16,U,LE3,T,4,3,health,other,home,father,1,1,no,no,yes,yes,yes,yes,no,3,1,3,1,3,5,0,yes,5,10,11,11,2,FALSE +GP,M,15,U,GT3,T,4,3,teacher,other,reputation,mother,1,2,no,no,no,yes,yes,yes,no,4,4,1,1,1,1,0,no,0,12,14,14,1,FALSE 
+GP,M,15,U,GT3,T,4,4,health,health,other,father,1,1,no,yes,no,yes,yes,yes,no,5,4,2,1,1,5,0,yes,0,12,14,14,1,FALSE +GP,M,16,U,LE3,T,4,2,teacher,other,course,mother,1,2,no,no,yes,yes,yes,yes,no,4,5,1,1,3,5,0,no,1,14,14,15,2,FALSE +GP,M,16,U,LE3,T,2,2,other,other,reputation,mother,2,2,no,yes,yes,yes,yes,yes,no,5,4,4,2,4,5,0,no,1,12,12,11,3,TRUE +GP,F,15,R,GT3,T,2,4,services,health,course,mother,1,3,yes,yes,yes,yes,yes,yes,no,4,3,2,1,1,5,0,yes,2,10,10,9,1,FALSE +GP,F,16,U,GT3,T,2,2,services,services,home,mother,1,1,no,yes,no,no,yes,yes,no,1,2,2,1,3,5,1,yes,10,8,10,10,2,FALSE +GP,M,15,U,GT3,T,2,2,other,other,home,mother,1,1,no,yes,no,yes,yes,yes,no,4,2,2,1,2,5,0,yes,5,12,12,12,1.5,FALSE +GP,M,15,U,GT3,T,4,2,health,services,other,mother,1,1,no,no,no,yes,yes,yes,no,2,2,4,2,4,1,0,yes,2,13,14,13,3,TRUE +GP,M,16,U,LE3,A,3,4,services,other,home,mother,1,2,yes,yes,yes,yes,yes,yes,no,5,3,3,1,1,5,0,no,3,12,12,12,1,FALSE +GP,M,16,U,GT3,T,4,4,teacher,teacher,home,mother,1,2,no,yes,yes,yes,yes,yes,yes,4,4,5,5,5,5,0,yes,10,11,12,12,5,TRUE +GP,M,15,U,GT3,T,4,4,health,services,home,mother,1,2,no,yes,no,no,yes,yes,no,5,4,2,3,4,5,0,yes,0,10,11,12,3.5,TRUE +GP,M,15,U,GT3,T,4,4,services,services,reputation,mother,2,2,no,yes,yes,yes,yes,yes,no,4,3,1,1,1,5,0,no,1,16,16,16,1,FALSE +GP,M,15,R,GT3,T,4,3,teacher,at_home,course,mother,1,2,no,yes,yes,yes,yes,yes,yes,4,5,2,1,1,5,0,no,0,15,15,16,1,FALSE +GP,M,15,U,LE3,T,3,3,other,other,course,mother,1,2,no,no,yes,no,yes,yes,no,5,3,2,1,1,2,0,no,0,10,11,12,1,FALSE +GP,M,16,U,GT3,T,3,2,other,other,home,mother,1,1,no,yes,no,no,yes,yes,no,5,4,3,1,1,5,0,yes,2,12,13,14,1,FALSE +GP,F,15,U,GT3,T,2,3,other,other,other,father,2,1,no,yes,yes,yes,yes,no,no,3,5,1,1,1,5,0,no,2,10,9,8,1,FALSE +GP,M,15,U,LE3,T,4,3,teacher,services,home,mother,1,3,no,yes,yes,yes,yes,yes,no,5,4,3,1,1,4,0,no,1,14,15,16,1,FALSE +GP,M,16,R,GT3,A,4,4,other,teacher,reputation,mother,2,3,no,yes,yes,yes,yes,yes,yes,2,4,3,1,1,5,0,no,6,14,14,14,1,FALSE 
+GP,F,15,R,GT3,T,3,4,services,health,course,mother,1,3,yes,yes,yes,yes,yes,yes,no,4,3,2,1,1,5,0,yes,2,12,12,12,1,FALSE +GP,F,15,R,GT3,T,2,2,at_home,other,reputation,mother,1,1,yes,yes,yes,yes,yes,no,no,4,3,1,1,1,2,0,yes,8,14,13,12,1,FALSE +GP,F,16,U,LE3,T,2,2,other,other,home,mother,2,2,no,yes,yes,no,yes,yes,yes,3,3,3,1,2,3,0,no,20,9,10,10,1.5,FALSE +GP,M,15,U,LE3,T,4,4,teacher,other,home,other,1,1,no,yes,no,no,yes,yes,yes,5,4,3,2,4,5,0,no,8,11,12,12,3,TRUE +GP,M,15,U,GT3,T,4,4,services,teacher,course,father,1,2,no,yes,yes,yes,yes,yes,no,4,3,3,1,1,5,0,no,1,16,16,16,1,FALSE +GP,M,15,U,GT3,T,2,2,services,services,course,father,1,1,yes,yes,no,yes,yes,yes,no,5,4,1,1,1,1,0,no,0,8,9,10,1,FALSE +GP,F,16,U,LE3,T,2,2,other,at_home,course,father,2,2,yes,no,yes,yes,yes,yes,no,4,3,3,2,2,5,1,no,14,10,10,10,2,FALSE +GP,F,15,U,LE3,A,4,3,other,other,course,mother,1,2,yes,yes,yes,yes,yes,yes,yes,5,2,2,1,1,5,0,yes,6,9,10,8,1,FALSE +GP,F,16,U,LE3,A,3,3,other,services,home,mother,1,2,no,yes,no,yes,yes,yes,no,2,3,5,1,4,3,0,no,9,12,12,12,2.5,TRUE +GP,M,16,U,GT3,T,4,3,health,services,reputation,mother,1,4,no,no,yes,yes,yes,yes,no,4,2,2,1,1,2,0,no,3,18,18,18,1,FALSE +GP,M,15,U,GT3,T,4,2,teacher,other,home,mother,1,2,no,yes,no,yes,yes,no,no,4,3,3,2,2,5,0,yes,3,13,14,14,2,FALSE +GP,F,15,U,GT3,T,4,4,services,teacher,other,father,1,2,yes,yes,yes,no,yes,yes,no,4,4,4,1,1,3,0,no,2,10,10,10,1,FALSE +GP,F,16,U,LE3,T,2,2,services,services,course,mother,3,2,no,yes,no,yes,yes,yes,no,4,3,3,2,3,4,0,yes,1,13,13,13,2.5,TRUE +GP,F,15,U,LE3,T,4,2,health,other,other,mother,1,2,no,yes,no,yes,yes,yes,no,4,3,3,1,1,5,0,yes,1,14,14,14,1,FALSE +GP,M,15,U,LE3,A,4,2,health,health,other,father,2,1,no,no,no,yes,yes,no,no,5,5,5,3,4,5,0,no,5,10,10,10,3.5,TRUE +GP,F,15,U,GT3,T,4,4,services,services,course,mother,1,1,yes,yes,no,yes,yes,yes,no,3,3,4,2,3,5,0,yes,0,10,11,12,2.5,TRUE +GP,F,15,U,LE3,A,3,3,other,other,other,mother,1,1,no,no,no,yes,yes,yes,no,5,3,4,4,4,1,0,yes,3,12,12,13,4,TRUE 
+GP,F,16,U,GT3,A,2,1,other,other,other,mother,1,2,no,no,yes,yes,yes,yes,yes,5,3,4,1,1,2,0,yes,5,10,11,11,1,FALSE +GP,F,15,U,GT3,A,4,3,services,services,reputation,mother,1,2,no,yes,yes,yes,yes,yes,no,4,3,2,1,1,1,0,yes,0,14,14,15,1,FALSE +GP,M,15,U,GT3,T,4,4,teacher,health,reputation,mother,1,2,no,yes,yes,yes,yes,no,no,3,2,2,1,1,5,0,no,6,14,15,16,1,FALSE +GP,M,15,U,LE3,T,1,2,other,at_home,home,father,1,2,yes,yes,yes,yes,yes,yes,no,4,3,2,1,1,5,0,no,1,12,12,12,1,FALSE +GP,F,16,U,GT3,T,4,2,services,other,course,mother,1,2,no,yes,no,yes,yes,yes,no,4,2,3,1,1,5,0,no,2,16,16,16,1,FALSE +GP,F,16,R,GT3,T,4,4,health,teacher,other,mother,1,2,no,yes,yes,yes,yes,no,no,2,4,4,2,3,4,0,no,3,14,14,14,2.5,TRUE +GP,F,16,U,GT3,T,1,1,services,services,course,father,4,1,yes,yes,yes,no,yes,yes,yes,5,5,5,5,5,5,0,no,3,10,9,14,5,TRUE +GP,F,16,U,LE3,T,1,2,other,services,reputation,father,1,2,yes,no,yes,yes,yes,yes,no,4,4,3,1,1,1,0,no,2,10,12,10,1,FALSE +GP,F,16,U,GT3,T,4,3,teacher,health,home,mother,1,3,yes,yes,yes,yes,yes,yes,no,3,4,4,2,4,4,0,yes,1,12,11,11,3,TRUE +GP,F,15,U,LE3,T,4,3,services,services,reputation,father,1,2,yes,no,yes,yes,yes,yes,yes,4,4,4,2,4,2,0,no,0,12,11,11,3,TRUE +GP,F,16,U,LE3,T,4,3,teacher,services,course,mother,3,2,no,yes,yes,yes,yes,yes,no,5,4,3,1,2,1,0,no,2,16,15,16,1.5,FALSE +GP,M,15,U,GT3,A,4,4,other,services,reputation,mother,1,4,no,yes,yes,no,yes,yes,yes,1,3,3,5,5,3,0,no,2,12,12,12,5,TRUE +GP,F,16,U,GT3,T,3,1,services,other,course,mother,1,4,yes,yes,no,yes,yes,yes,no,4,3,3,1,2,5,0,yes,2,8,8,8,1.5,FALSE +GP,F,15,R,LE3,T,2,2,health,services,reputation,mother,2,2,yes,yes,no,yes,yes,yes,no,4,1,3,1,3,4,0,yes,1,10,10,10,2,FALSE +GP,F,15,R,LE3,T,3,1,other,other,reputation,father,2,4,no,yes,no,no,yes,yes,no,4,4,2,2,3,3,0,no,9,16,16,16,2.5,TRUE +GP,M,16,U,GT3,T,3,1,other,other,reputation,father,2,4,no,yes,no,yes,yes,yes,no,4,3,2,1,1,5,0,yes,1,13,13,13,1,FALSE +GP,M,15,U,GT3,T,4,2,other,other,course,mother,1,4,no,no,no,yes,yes,yes,no,3,3,3,1,1,3,0,no,0,10,10,10,1,FALSE 
+GP,F,15,R,GT3,T,1,1,other,other,reputation,mother,1,2,yes,yes,no,no,yes,yes,yes,3,3,4,2,4,5,1,no,2,10,8,8,3,TRUE +GP,M,16,U,GT3,T,3,1,other,other,reputation,mother,1,1,no,no,yes,yes,yes,no,no,5,3,2,2,2,5,0,no,1,12,12,14,2,FALSE +GP,F,16,U,GT3,T,3,3,other,services,home,mother,1,2,yes,yes,yes,yes,yes,yes,no,4,3,3,2,4,5,0,yes,29,11,12,11,3,TRUE +GP,M,15,U,GT3,T,4,3,teacher,other,home,mother,1,2,no,yes,yes,yes,yes,yes,no,4,3,3,2,3,5,0,yes,3,10,10,10,2.5,TRUE +GP,M,15,U,GT3,T,4,0,teacher,other,course,mother,2,4,no,no,yes,yes,yes,yes,no,3,4,3,1,1,1,0,no,4,12,11,10,1,FALSE +GP,F,16,U,GT3,T,2,2,other,other,reputation,mother,1,4,no,no,no,yes,yes,yes,yes,5,2,3,1,3,3,0,yes,0,12,12,12,2,FALSE +GP,M,17,U,GT3,T,2,1,other,other,home,mother,2,1,yes,yes,yes,yes,no,yes,no,4,5,1,1,1,3,3,no,1,8,8,10,1,FALSE +GP,F,16,U,GT3,T,3,4,at_home,other,course,mother,1,2,no,yes,no,yes,yes,yes,no,2,4,3,1,2,3,0,no,13,8,8,8,1.5,FALSE +GP,M,15,U,GT3,T,2,3,other,services,course,father,1,1,yes,yes,yes,no,yes,yes,yes,3,2,2,1,3,3,0,yes,1,10,12,12,2,FALSE +GP,M,15,U,GT3,T,2,3,other,other,home,mother,1,3,yes,no,no,no,yes,yes,no,5,3,2,1,2,5,0,yes,3,10,10,10,1.5,FALSE +GP,F,15,U,LE3,T,3,2,services,other,reputation,mother,1,2,no,yes,no,yes,yes,yes,no,4,4,4,1,1,5,0,yes,7,10,8,8,1,FALSE +GP,M,15,U,LE3,T,2,2,services,services,home,mother,2,2,no,no,yes,yes,yes,yes,no,5,3,3,1,3,4,0,yes,3,14,14,14,2,FALSE +GP,F,15,U,GT3,T,1,1,other,other,home,father,1,2,no,yes,yes,no,yes,yes,no,4,3,2,2,3,4,0,no,2,11,11,11,2.5,TRUE +GP,F,15,U,GT3,T,4,4,services,services,reputation,father,2,2,no,no,no,yes,yes,yes,yes,4,4,4,2,3,5,1,yes,5,10,10,10,2.5,TRUE +GP,F,16,U,LE3,T,2,2,at_home,other,course,mother,1,2,no,yes,no,yes,yes,no,no,4,3,4,1,2,2,0,no,5,10,9,8,1.5,FALSE +GP,F,15,U,GT3,T,4,2,other,other,reputation,mother,1,3,no,yes,yes,yes,yes,yes,no,5,3,3,1,3,1,0,no,4,14,14,14,2,FALSE +GP,M,16,U,GT3,T,2,2,services,other,reputation,father,2,2,no,no,yes,no,yes,yes,no,4,4,2,1,1,3,0,yes,9,12,10,10,1,FALSE 
+GP,M,16,U,LE3,A,4,4,teacher,health,reputation,mother,1,2,no,yes,no,yes,yes,no,no,4,1,3,3,5,5,0,no,12,8,8,8,4,TRUE +GP,F,16,U,GT3,T,3,3,other,other,home,mother,1,3,no,yes,no,yes,yes,yes,yes,4,3,3,1,3,4,0,yes,1,8,9,10,2,FALSE +GP,F,15,U,GT3,T,4,3,services,other,reputation,mother,1,1,no,no,yes,yes,yes,yes,no,4,5,5,1,3,1,0,yes,5,15,15,16,2,FALSE +GP,F,16,U,LE3,T,3,1,other,other,home,father,1,2,yes,yes,no,yes,yes,no,no,3,3,3,2,3,2,0,no,2,10,10,9,2.5,TRUE +GP,F,16,U,GT3,T,4,2,teacher,services,home,mother,2,2,no,yes,yes,yes,yes,yes,no,5,3,3,1,1,1,0,yes,1,12,12,12,1,FALSE +GP,M,15,U,LE3,T,2,2,services,health,reputation,mother,1,4,no,yes,yes,yes,yes,yes,no,4,3,4,1,1,4,0,no,4,11,12,13,1,FALSE +GP,F,15,R,GT3,T,1,1,at_home,other,home,mother,2,4,yes,yes,yes,yes,yes,yes,no,3,1,2,1,1,1,0,yes,3,10,12,12,1,FALSE +GP,M,16,R,GT3,T,4,3,services,other,reputation,mother,2,1,yes,yes,yes,no,yes,yes,no,3,3,3,1,1,4,0,no,4,10,13,13,1,FALSE +GP,F,16,U,GT3,T,2,1,other,other,course,mother,1,2,no,yes,no,yes,yes,no,yes,4,3,5,1,1,5,0,yes,1,10,10,11,1,FALSE +GP,F,16,U,GT3,T,4,4,other,other,reputation,mother,1,1,no,no,yes,no,yes,yes,no,5,3,4,1,2,1,0,no,5,12,14,14,1.5,FALSE +GP,F,16,U,GT3,T,4,3,other,at_home,course,mother,1,3,yes,yes,no,yes,yes,yes,no,5,3,5,1,1,3,0,yes,1,10,11,10,1,FALSE +GP,M,16,U,GT3,T,4,4,services,services,other,mother,1,1,yes,yes,yes,yes,yes,yes,no,4,5,5,5,5,4,0,yes,13,8,8,6,5,TRUE +GP,M,16,U,GT3,T,4,4,services,teacher,other,father,1,3,no,yes,yes,yes,yes,yes,yes,4,4,3,1,1,4,0,no,0,16,16,16,1,FALSE +GP,M,15,U,GT3,T,4,4,services,other,course,mother,1,1,no,yes,yes,no,yes,yes,no,5,3,3,1,1,5,0,no,3,11,13,13,1,FALSE +GP,F,15,U,GT3,T,3,2,services,other,home,mother,2,2,yes,yes,no,yes,yes,yes,no,4,3,5,1,1,2,0,yes,21,9,8,8,1,FALSE +GP,M,15,U,GT3,A,3,4,services,other,course,mother,1,2,no,yes,yes,yes,yes,yes,no,5,4,4,1,1,1,0,yes,0,16,17,17,1,FALSE +GP,F,15,U,GT3,A,3,3,other,health,reputation,father,1,4,yes,no,no,yes,yes,no,no,4,3,3,1,1,4,0,no,10,10,10,10,1,FALSE 
+GP,F,15,U,GT3,T,2,2,other,other,course,mother,1,4,yes,yes,no,yes,yes,yes,no,5,1,2,1,1,3,0,yes,6,8,9,9,1,FALSE +GP,M,16,U,GT3,T,3,3,services,other,home,father,1,3,no,yes,yes,yes,yes,yes,no,5,3,3,1,1,5,0,no,3,14,16,16,1,FALSE +GP,M,15,R,GT3,T,4,4,other,other,home,father,4,4,no,yes,yes,yes,yes,yes,yes,1,3,5,3,5,1,0,yes,7,11,12,12,4,TRUE +GP,F,16,U,LE3,T,4,4,health,health,other,mother,1,3,no,yes,yes,yes,yes,yes,yes,5,4,5,1,1,4,0,yes,3,14,15,15,1,FALSE +GP,M,15,U,LE3,A,4,4,teacher,teacher,course,mother,1,1,no,no,yes,yes,yes,yes,no,5,5,3,1,1,4,0,no,5,16,16,16,1,FALSE +GP,F,16,R,GT3,T,3,3,services,other,reputation,father,1,3,yes,yes,yes,yes,yes,yes,no,4,1,2,1,1,2,0,no,2,9,10,10,1,FALSE +GP,F,16,U,GT3,T,2,2,at_home,other,home,mother,1,2,yes,no,yes,yes,yes,yes,no,3,1,2,1,1,5,1,no,9,9,12,12,1,FALSE +GP,M,15,U,LE3,T,4,2,teacher,other,course,mother,1,1,no,no,no,yes,yes,yes,no,3,5,2,1,1,3,0,no,10,18,18,18,1,FALSE +GP,M,15,R,GT3,T,2,1,health,services,reputation,mother,1,2,no,no,yes,yes,yes,yes,yes,5,4,2,1,1,5,0,no,6,10,9,10,1,FALSE +GP,M,16,U,GT3,T,4,4,teacher,teacher,course,father,1,2,no,yes,yes,yes,yes,yes,no,5,4,4,1,2,5,0,no,4,16,14,15,1.5,FALSE +GP,M,15,U,GT3,T,4,4,other,teacher,reputation,father,2,2,no,yes,yes,yes,yes,no,no,4,4,3,1,1,2,0,no,3,14,14,15,1,FALSE +GP,M,16,U,GT3,T,3,3,other,services,home,father,2,1,no,no,yes,yes,yes,yes,no,5,4,2,1,1,5,0,no,3,14,14,14,1,FALSE +GP,M,17,R,GT3,T,1,3,other,other,course,father,3,2,no,yes,yes,yes,yes,yes,no,5,2,4,1,4,5,1,no,17,10,9,10,2.5,TRUE +GP,M,15,U,GT3,T,3,4,other,other,reputation,father,1,1,no,no,no,yes,yes,yes,no,3,4,3,1,2,4,0,no,4,14,13,14,1.5,FALSE +GP,F,15,U,GT3,T,1,2,at_home,services,course,mother,1,2,no,no,no,no,yes,yes,no,3,2,3,1,2,1,0,no,1,15,14,14,1.5,FALSE +GP,M,15,U,GT3,T,2,2,services,services,home,father,1,4,no,yes,yes,yes,yes,yes,no,5,5,4,1,2,5,0,yes,6,15,14,14,1.5,FALSE +GP,F,16,U,LE3,T,2,4,other,health,course,father,2,2,no,yes,yes,yes,yes,yes,yes,4,2,2,1,2,5,0,yes,2,14,12,13,1.5,FALSE 
+GP,M,16,U,GT3,T,4,4,health,other,course,mother,1,1,no,yes,yes,yes,yes,yes,no,3,4,4,1,4,5,0,no,11,13,12,13,2.5,TRUE +GP,F,16,U,GT3,T,2,2,other,other,home,mother,1,2,no,no,no,yes,yes,yes,yes,5,4,4,1,1,5,0,yes,0,10,9,10,1,FALSE +GP,M,15,U,GT3,T,3,4,services,services,home,father,1,1,yes,no,no,yes,yes,yes,no,5,5,5,3,2,5,0,no,1,11,11,10,2.5,TRUE +GP,F,15,U,LE3,A,3,4,other,other,home,mother,1,2,yes,no,yes,yes,yes,yes,yes,5,3,2,1,1,1,0,no,0,8,10,11,1,FALSE +GP,F,19,U,GT3,T,0,1,at_home,other,course,other,1,2,no,yes,no,no,no,no,no,3,4,2,1,1,5,2,no,1,8,9,10,1,FALSE +GP,M,16,R,GT3,T,4,4,teacher,teacher,course,mother,1,1,no,no,yes,yes,yes,yes,no,3,5,5,2,5,4,0,yes,8,16,16,16,3.5,TRUE +GP,F,15,R,GT3,T,3,4,services,teacher,course,father,2,3,no,yes,no,yes,yes,yes,yes,4,2,2,2,2,5,1,no,0,11,6,6,2,FALSE +GP,F,15,U,GT3,T,1,1,at_home,other,course,mother,3,1,no,yes,yes,no,yes,yes,yes,4,3,3,1,2,4,0,no,3,10,6,6,1.5,FALSE +GP,F,17,U,LE3,T,2,2,other,other,course,father,1,1,no,yes,no,yes,yes,yes,yes,3,4,4,1,3,5,0,no,7,12,12,12,2,FALSE +GP,F,16,U,GT3,A,3,4,services,other,course,father,1,1,no,no,no,yes,yes,yes,no,3,2,1,1,4,5,0,no,14,14,12,12,2.5,TRUE +GP,M,15,R,GT3,T,3,4,at_home,teacher,course,mother,4,2,no,yes,no,yes,yes,no,yes,5,3,3,1,1,5,0,no,1,10,6,6,1,FALSE +GP,F,15,U,GT3,T,4,4,services,at_home,course,mother,1,3,no,yes,yes,yes,yes,yes,yes,4,3,3,1,1,5,0,no,2,12,7,8,1,FALSE +GP,M,17,R,GT3,T,3,4,at_home,other,course,mother,3,2,no,no,no,yes,yes,no,no,5,4,5,2,4,5,0,no,1,10,4,5,3,TRUE +GP,F,16,U,GT3,A,3,3,other,other,course,other,2,1,no,yes,yes,no,yes,yes,yes,4,3,2,1,1,5,1,no,2,6,4,5,1,FALSE +GP,M,16,U,LE3,T,1,1,services,other,course,mother,1,2,no,no,no,yes,yes,no,yes,4,4,4,1,3,5,2,no,0,12,11,11,2,FALSE +GP,F,15,U,GT3,T,4,4,teacher,teacher,course,mother,2,1,no,no,yes,yes,yes,yes,no,4,3,2,1,1,5,0,no,3,14,15,14,1,FALSE +GP,M,15,U,GT3,T,4,3,teacher,services,course,father,2,4,yes,yes,no,yes,yes,yes,no,2,2,2,1,1,3,0,no,3,8,10,6,1,FALSE 
+GP,M,16,U,LE3,T,2,2,services,services,reputation,father,2,1,no,yes,yes,yes,yes,yes,no,2,3,3,2,2,2,1,no,6,10,10,10,2,FALSE +GP,F,15,U,GT3,T,4,4,teacher,services,course,mother,1,3,no,yes,yes,yes,yes,yes,no,4,2,2,1,1,5,0,yes,2,11,12,12,1,FALSE +GP,F,16,U,LE3,T,1,1,at_home,at_home,course,mother,1,1,no,no,no,yes,yes,yes,no,3,4,4,3,3,1,0,no,3,12,12,12,3,TRUE +GP,M,17,U,GT3,T,2,1,other,other,home,mother,1,1,no,yes,no,yes,yes,yes,no,5,4,5,1,2,5,2,no,11,7,4,3,1.5,FALSE +GP,F,15,U,GT3,T,1,1,other,services,course,father,1,2,no,yes,no,yes,yes,yes,no,4,4,2,1,2,5,0,yes,0,10,12,12,1.5,FALSE +GP,F,15,U,GT3,T,3,2,health,services,home,father,1,2,no,yes,no,yes,yes,yes,no,3,3,2,1,1,3,2,no,1,8,9,6,1,FALSE +GP,F,15,U,GT3,T,1,2,at_home,other,course,mother,1,2,no,yes,no,no,yes,yes,no,4,3,2,1,1,5,0,yes,4,12,12,12,1,FALSE +GP,M,16,U,GT3,T,4,4,teacher,teacher,course,mother,1,1,no,yes,no,yes,no,yes,yes,3,3,2,2,1,5,0,no,8,8,8,4,1.5,FALSE +GP,M,15,U,LE3,A,2,1,services,other,course,mother,4,1,no,no,no,yes,yes,yes,no,4,5,5,2,5,5,2,no,0,10,10,10,3.5,TRUE +GP,M,18,U,LE3,T,1,1,other,other,course,mother,1,1,no,no,no,yes,no,yes,yes,2,3,5,2,5,4,2,no,0,8,7,0,3.5,TRUE +GP,M,16,U,LE3,T,2,1,at_home,other,course,mother,1,1,no,no,yes,yes,yes,no,yes,4,4,4,3,5,5,1,no,6,10,12,12,4,TRUE +GP,F,15,R,GT3,T,3,3,services,services,reputation,other,2,3,no,yes,yes,yes,yes,yes,yes,4,2,1,2,3,3,1,yes,5,12,12,12,2.5,TRUE +GP,M,19,U,GT3,T,3,2,services,at_home,home,mother,1,1,no,yes,no,yes,no,yes,yes,4,5,4,1,1,4,2,no,3,8,4,6,1,FALSE +GP,F,17,U,GT3,T,4,4,other,teacher,course,mother,1,1,yes,yes,no,yes,yes,no,yes,4,2,1,1,1,4,0,no,0,12,12,12,1,FALSE +GP,M,15,R,GT3,T,2,3,at_home,services,course,mother,1,2,yes,no,yes,yes,yes,no,no,4,4,4,1,1,1,0,yes,1,9,8,8,1,FALSE +GP,M,17,R,LE3,T,1,2,other,other,reputation,mother,1,1,no,no,no,yes,yes,no,no,2,2,2,3,3,5,2,no,11,12,10,12,3,TRUE +GP,F,18,R,GT3,T,1,1,at_home,other,course,mother,3,1,no,yes,yes,no,yes,no,no,5,2,5,1,5,4,3,no,6,10,9,10,3,TRUE 
+GP,M,16,R,GT3,T,2,2,at_home,other,course,mother,3,1,no,no,no,no,yes,no,no,4,2,2,1,2,3,0,no,3,14,12,13,1.5,FALSE +GP,M,16,U,GT3,T,3,3,other,services,course,father,1,2,no,yes,no,yes,yes,yes,yes,4,5,5,4,4,5,1,yes,2,10,11,6,4,TRUE +GP,M,17,R,LE3,T,2,1,at_home,other,course,mother,2,1,no,no,yes,yes,no,yes,yes,3,3,2,2,2,5,2,no,4,8,7,4,2,FALSE +GP,M,17,R,LE3,T,1,1,other,services,course,mother,4,2,no,no,yes,yes,no,no,yes,5,3,5,1,5,5,2,no,0,6,8,8,3,TRUE +GP,M,16,U,GT3,T,2,2,other,other,course,father,1,2,no,no,no,yes,no,yes,no,4,3,5,2,4,4,0,no,2,10,10,10,3,TRUE +GP,F,16,U,GT3,T,4,2,health,services,home,father,1,2,no,no,no,yes,yes,yes,yes,4,2,3,1,1,3,0,yes,0,16,16,17,1,FALSE +GP,F,16,U,GT3,T,2,2,other,other,home,mother,1,2,no,yes,no,no,yes,yes,no,5,1,5,1,1,4,0,yes,0,9,10,6,1,FALSE +GP,F,16,U,GT3,T,4,4,health,health,reputation,mother,1,2,no,yes,no,yes,yes,yes,yes,4,4,2,1,1,3,0,yes,0,15,15,16,1,FALSE +GP,M,16,U,GT3,T,3,4,other,other,course,father,3,1,no,yes,yes,no,yes,yes,no,3,4,5,2,4,2,2,no,2,8,7,5,3,TRUE +GP,M,16,U,GT3,T,1,0,other,other,reputation,mother,2,2,no,yes,yes,yes,yes,yes,yes,4,3,2,1,1,3,0,yes,1,14,16,17,1,FALSE +GP,M,17,U,LE3,T,4,4,teacher,other,reputation,mother,1,2,no,yes,yes,yes,yes,yes,no,4,4,4,1,3,5,0,yes,0,12,10,10,2,FALSE +GP,F,16,U,GT3,T,1,3,at_home,services,home,mother,1,2,no,no,yes,no,yes,yes,yes,4,3,5,1,1,3,2,no,0,11,10,6,1,FALSE +GP,F,16,U,LE3,T,3,3,other,other,reputation,mother,2,2,no,yes,yes,yes,yes,yes,no,4,4,5,1,1,4,0,yes,2,12,12,12,1,FALSE +GP,M,17,U,LE3,T,4,3,teacher,other,course,mother,2,2,no,no,yes,yes,yes,yes,no,4,4,4,4,4,4,0,yes,2,10,10,10,4,TRUE +GP,F,16,U,GT3,T,2,2,services,other,reputation,mother,2,2,no,no,yes,no,yes,yes,no,3,4,4,1,4,5,0,yes,1,13,12,12,2.5,TRUE +GP,M,17,U,GT3,T,3,3,other,other,reputation,father,1,2,no,no,yes,no,yes,yes,no,4,3,4,1,4,4,0,no,4,8,7,8,2.5,TRUE +GP,M,16,R,GT3,T,4,2,teacher,services,other,mother,1,1,no,yes,yes,yes,yes,yes,yes,4,3,3,3,4,3,0,no,9,10,8,10,3.5,TRUE 
+GP,M,17,U,GT3,T,4,3,other,other,course,mother,1,2,no,yes,yes,yes,yes,yes,yes,5,2,3,1,1,2,0,no,4,10,10,12,1,FALSE +GP,M,16,U,GT3,T,4,3,teacher,other,home,mother,1,2,no,yes,yes,yes,yes,yes,no,3,4,3,2,3,3,0,yes,7,10,9,10,2.5,TRUE +GP,M,16,U,GT3,T,3,3,services,other,home,mother,1,2,no,no,yes,yes,yes,yes,yes,4,2,3,1,2,3,0,yes,1,12,12,12,1.5,FALSE +GP,F,17,U,GT3,T,2,4,services,services,reputation,father,1,2,no,yes,yes,yes,yes,no,no,5,4,2,2,3,5,0,no,0,16,18,17,2.5,TRUE +GP,F,17,U,LE3,T,3,3,other,other,reputation,mother,1,2,no,yes,yes,yes,yes,yes,yes,5,3,3,2,3,1,0,no,44,12,11,11,2.5,TRUE +GP,F,16,U,GT3,T,3,2,other,other,reputation,mother,1,2,no,yes,no,yes,yes,yes,no,1,2,2,1,2,1,0,yes,11,13,14,14,1.5,FALSE +GP,M,17,U,GT3,T,3,3,services,services,other,mother,1,2,no,yes,yes,yes,yes,yes,yes,4,3,4,2,3,4,0,no,9,12,12,12,2.5,TRUE +GP,M,16,U,GT3,T,1,2,services,services,other,mother,1,1,no,yes,yes,yes,yes,yes,yes,3,3,3,1,2,3,0,yes,1,10,10,11,1.5,FALSE +GP,M,16,U,LE3,T,2,1,other,other,course,mother,1,2,no,no,yes,yes,yes,yes,yes,4,2,3,1,2,5,0,yes,0,14,14,16,1.5,FALSE +GP,F,17,U,GT3,A,3,3,health,other,reputation,mother,1,2,no,yes,no,no,yes,yes,yes,3,3,3,1,3,3,0,no,8,10,10,12,2,FALSE +GP,M,17,R,GT3,T,1,2,at_home,other,home,mother,1,2,no,no,no,yes,yes,no,no,3,1,3,1,5,3,0,no,5,8,9,10,3,TRUE +GP,F,16,U,GT3,T,2,3,services,services,course,mother,1,2,no,no,no,yes,yes,yes,no,4,3,3,1,1,2,0,no,8,12,12,13,1,FALSE +GP,F,17,U,GT3,T,1,1,at_home,services,course,mother,1,2,no,no,yes,yes,yes,yes,no,5,3,3,1,1,3,0,no,0,10,10,10,1,FALSE +GP,M,17,U,GT3,T,1,2,at_home,services,other,other,2,2,no,no,yes,no,yes,yes,no,4,4,4,4,5,5,0,yes,14,8,10,10,4.5,TRUE +GP,M,16,R,GT3,T,3,3,services,services,reputation,mother,1,1,no,yes,yes,yes,yes,yes,no,4,3,2,3,4,5,0,no,4,10,10,10,3.5,TRUE +GP,M,16,U,GT3,T,2,3,other,other,home,father,2,1,no,no,no,yes,yes,yes,no,5,3,3,1,1,3,0,no,0,13,13,13,1,FALSE +GP,F,17,U,LE3,T,2,4,services,services,course,father,1,2,no,no,yes,yes,yes,yes,yes,4,3,2,1,1,5,0,no,4,14,15,16,1,FALSE 
+GP,M,17,U,GT3,T,4,4,services,teacher,home,mother,1,1,no,no,no,yes,yes,yes,no,5,2,3,1,2,5,0,no,4,15,14,15,1.5,FALSE +GP,M,16,R,LE3,T,3,3,teacher,other,home,father,3,1,no,yes,yes,yes,yes,yes,no,3,3,4,3,5,3,0,yes,12,10,10,11,4,TRUE +GP,F,17,U,GT3,T,4,4,services,teacher,home,mother,2,1,no,yes,no,yes,yes,yes,no,4,2,4,2,3,2,1,no,27,16,16,17,2.5,TRUE +GP,F,16,U,LE3,T,4,4,teacher,teacher,reputation,mother,1,2,no,yes,no,yes,yes,yes,no,4,5,2,1,2,3,0,yes,0,10,10,10,1.5,FALSE +GP,F,16,U,GT3,T,4,3,health,other,home,mother,1,2,no,yes,yes,yes,yes,yes,no,4,3,5,1,5,2,0,no,2,15,15,16,3,TRUE +GP,F,16,U,GT3,T,2,3,other,other,reputation,mother,1,2,yes,yes,yes,yes,yes,no,no,4,4,3,1,3,4,0,yes,5,10,11,11,2,FALSE +GP,F,17,U,GT3,T,1,1,other,other,course,mother,1,2,no,yes,no,no,yes,no,no,4,4,4,1,3,1,0,yes,2,12,12,12,2,FALSE +GP,F,17,R,GT3,T,2,2,other,other,reputation,mother,1,1,no,yes,no,yes,yes,yes,no,5,3,2,1,2,3,0,no,20,10,10,10,1.5,FALSE +GP,F,16,R,GT3,T,2,2,services,services,reputation,mother,2,4,no,yes,yes,no,yes,yes,no,5,3,5,1,1,5,0,yes,6,12,12,12,1,FALSE +GP,F,17,U,GT3,T,3,4,at_home,services,home,mother,1,3,no,yes,no,yes,yes,yes,yes,4,4,3,3,4,5,1,yes,21,9,9,8,3.5,TRUE +GP,F,16,U,GT3,A,3,1,services,other,course,mother,1,2,no,yes,no,yes,yes,yes,no,2,3,3,2,2,4,2,yes,4,9,9,10,2,FALSE +GP,F,16,U,GT3,T,4,3,teacher,other,other,mother,1,2,no,no,yes,yes,yes,yes,yes,1,3,2,1,1,1,0,yes,7,12,14,14,1,FALSE +GP,F,16,U,GT3,T,1,1,at_home,other,home,mother,2,1,no,yes,no,yes,yes,no,no,4,3,2,1,4,5,0,yes,4,10,11,12,2.5,TRUE +GP,F,17,R,GT3,T,4,3,teacher,other,reputation,mother,2,3,no,yes,yes,yes,yes,yes,yes,4,4,2,1,1,4,0,yes,3,9,10,10,1,FALSE +GP,F,19,U,GT3,T,3,3,other,other,reputation,other,1,4,no,yes,yes,yes,yes,yes,no,4,3,3,1,2,3,0,yes,7,10,10,10,1.5,FALSE +GP,M,17,U,LE3,T,4,4,services,other,home,mother,1,2,no,yes,no,yes,yes,yes,yes,5,3,5,4,5,3,0,yes,14,12,12,12,4.5,TRUE +GP,F,16,U,GT3,A,2,2,other,other,reputation,mother,1,2,yes,yes,no,yes,yes,yes,no,3,3,4,1,1,4,0,yes,0,12,13,14,1,FALSE 
+GP,M,18,U,GT3,T,2,2,services,other,home,mother,1,2,no,yes,yes,yes,yes,yes,no,4,4,4,2,4,5,0,yes,12,9,9,10,3,TRUE +GP,F,17,R,LE3,T,4,4,services,other,other,mother,1,1,no,yes,no,yes,yes,no,no,5,2,1,1,2,3,0,yes,9,10,10,10,1.5,FALSE +GP,F,17,U,LE3,T,3,2,other,other,reputation,mother,2,2,no,no,no,yes,yes,yes,no,4,4,4,1,3,1,0,yes,2,14,16,15,2,FALSE +GP,F,17,U,GT3,T,4,3,other,other,reputation,mother,1,2,no,no,no,yes,yes,yes,yes,3,4,5,2,4,1,1,yes,19,8,8,7,3,TRUE +GP,M,18,U,LE3,T,3,3,services,health,home,father,1,2,no,yes,no,yes,yes,yes,no,3,2,4,2,4,4,0,yes,12,8,8,9,3,TRUE +GP,F,17,U,GT3,T,2,3,at_home,other,home,father,2,1,no,yes,no,yes,yes,no,no,3,3,3,1,4,3,0,yes,4,10,10,10,2.5,TRUE +GP,F,17,U,GT3,T,2,2,at_home,at_home,course,mother,1,3,no,yes,yes,yes,yes,yes,no,4,3,3,1,1,4,0,yes,2,10,11,12,1,FALSE +GP,F,17,R,GT3,T,2,1,at_home,services,reputation,mother,2,2,no,yes,yes,yes,yes,yes,no,4,2,5,1,2,5,0,no,1,8,8,8,1.5,FALSE +GP,F,17,U,GT3,T,1,1,at_home,other,reputation,mother,1,3,no,yes,yes,yes,yes,no,yes,4,3,4,1,1,5,0,no,6,9,8,6,1,FALSE +GP,F,16,U,GT3,T,2,3,services,teacher,other,mother,1,2,yes,no,no,yes,yes,yes,no,2,3,1,1,1,3,0,no,1,14,14,16,1,FALSE +GP,M,18,U,GT3,T,2,2,other,other,home,mother,2,2,no,yes,no,yes,yes,yes,no,3,3,3,5,5,4,2,yes,4,11,11,12,5,TRUE +GP,F,16,U,GT3,T,4,4,teacher,services,home,mother,1,3,no,yes,yes,no,yes,yes,no,5,3,2,1,1,5,0,no,2,14,14,15,1,FALSE +GP,F,18,R,GT3,T,3,1,other,other,reputation,mother,1,2,no,no,yes,yes,yes,yes,yes,5,3,3,1,1,4,0,no,10,8,8,8,1,FALSE +GP,F,17,U,GT3,T,3,2,other,other,course,mother,1,2,no,no,yes,no,yes,yes,no,5,3,4,1,3,3,0,no,6,16,16,16,2,FALSE +GP,M,17,U,LE3,T,2,3,services,services,reputation,father,1,2,no,yes,no,no,yes,yes,no,5,3,3,1,3,3,0,yes,1,11,11,12,2,FALSE +GP,M,18,U,LE3,T,2,1,at_home,other,course,mother,4,2,yes,yes,yes,yes,yes,yes,yes,4,3,2,4,5,3,0,yes,8,10,9,10,4.5,TRUE +GP,F,17,U,GT3,A,2,1,other,other,course,mother,2,3,no,no,yes,yes,yes,yes,yes,3,2,3,1,2,3,0,no,5,14,12,14,1.5,FALSE 
+GP,F,17,U,LE3,T,4,3,health,other,reputation,father,1,2,no,no,yes,yes,yes,yes,yes,3,2,3,1,2,3,0,no,7,14,12,13,1.5,FALSE +GP,M,17,R,GT3,T,2,2,other,other,course,father,2,2,no,yes,yes,yes,yes,yes,no,4,5,2,1,1,1,0,yes,2,12,12,12,1,FALSE +GP,M,17,U,GT3,T,4,4,teacher,teacher,reputation,mother,1,2,yes,yes,yes,yes,yes,yes,yes,4,5,5,1,3,2,0,no,7,12,11,11,2,FALSE +GP,M,16,U,GT3,T,4,4,health,other,reputation,father,1,2,no,yes,yes,yes,yes,yes,no,4,2,4,2,4,1,0,yes,1,14,13,14,3,TRUE +GP,M,16,U,LE3,T,1,1,other,other,home,mother,2,2,no,yes,no,yes,yes,yes,no,3,4,2,1,1,5,0,yes,10,9,8,8,1,FALSE +GP,M,16,U,GT3,T,3,2,at_home,other,reputation,mother,2,3,no,no,yes,yes,yes,yes,yes,5,3,3,1,3,2,0,no,5,12,10,11,2,FALSE +GP,M,17,U,LE3,T,2,2,other,other,home,father,1,2,no,no,yes,no,yes,yes,yes,4,4,2,5,5,4,0,yes,2,15,14,14,5,TRUE +GP,F,16,U,GT3,T,2,1,other,other,home,mother,1,1,no,no,no,yes,yes,yes,yes,4,5,2,1,1,5,0,no,12,11,11,11,1,FALSE +GP,F,17,R,GT3,T,2,1,at_home,services,course,mother,3,2,no,no,yes,yes,yes,no,no,2,1,1,1,1,3,0,no,2,13,12,12,1,FALSE +GP,M,18,U,GT3,T,2,2,other,services,reputation,father,1,2,no,no,no,yes,no,yes,no,5,5,4,3,5,2,0,no,8,8,7,4,4,TRUE +GP,M,17,U,LE3,T,4,3,health,other,course,mother,2,2,no,no,yes,yes,yes,yes,yes,2,5,5,1,4,5,0,no,11,14,14,14,2.5,TRUE +GP,M,17,R,LE3,A,4,4,teacher,other,course,mother,2,2,no,yes,no,yes,yes,yes,no,3,3,3,2,3,4,0,yes,1,11,12,12,2.5,TRUE +GP,M,16,U,LE3,T,4,3,teacher,other,course,mother,1,1,no,no,yes,no,yes,yes,no,5,4,5,1,1,3,0,no,4,10,7,8,1,FALSE +GP,M,16,U,GT3,T,4,4,services,services,course,mother,1,1,no,no,yes,yes,yes,yes,no,5,3,2,1,2,5,0,no,2,14,14,14,1.5,FALSE +GP,F,18,U,GT3,T,2,1,other,other,course,other,2,3,no,yes,no,no,yes,yes,yes,4,4,4,1,1,3,0,yes,5,10,5,6,1,FALSE +GP,M,16,U,GT3,T,2,1,other,other,course,mother,3,1,no,no,no,yes,yes,yes,no,4,3,3,1,1,4,0,no,6,16,17,17,1,FALSE +GP,M,17,U,GT3,T,2,3,other,other,course,father,2,1,no,no,no,yes,yes,yes,no,5,2,2,1,1,2,0,no,3,10,12,13,1,FALSE 
+GP,M,22,U,GT3,T,3,1,services,services,other,mother,1,1,no,no,no,no,no,yes,yes,5,4,5,5,5,1,3,no,14,6,8,6,5,TRUE +GP,M,18,R,LE3,T,3,3,other,services,course,mother,1,2,no,yes,no,yes,yes,yes,yes,4,3,3,1,3,5,0,no,8,6,7,8,2,FALSE +GP,M,16,U,GT3,T,0,2,other,other,other,mother,1,1,no,no,no,no,yes,yes,no,4,3,2,2,4,5,0,yes,0,12,14,13,3,TRUE +GP,M,18,U,GT3,T,3,2,services,other,course,mother,2,1,no,no,no,yes,no,yes,no,4,4,5,2,4,5,0,no,4,6,8,8,3,TRUE +GP,M,16,U,GT3,T,3,3,at_home,other,reputation,other,3,2,yes,yes,no,no,yes,yes,no,5,3,3,1,3,2,0,no,5,8,10,10,2,FALSE +GP,M,18,U,GT3,T,2,1,services,services,other,mother,1,1,no,no,no,no,no,yes,no,3,2,5,2,5,5,2,no,4,6,8,7,3.5,TRUE +GP,M,16,R,GT3,T,2,1,other,other,course,mother,2,1,no,no,yes,no,yes,no,no,3,3,2,1,3,3,0,no,1,11,11,10,2,FALSE +GP,M,17,R,GT3,T,2,1,other,other,course,mother,1,1,no,no,no,no,yes,yes,no,4,4,2,2,4,5,0,no,0,10,12,12,3,TRUE +GP,M,17,U,LE3,T,1,1,health,other,course,mother,2,1,no,yes,yes,yes,yes,yes,no,4,4,4,1,2,5,1,no,1,8,10,9,1.5,FALSE +GP,F,17,U,LE3,T,4,2,teacher,services,reputation,mother,1,4,no,yes,yes,yes,yes,yes,no,4,2,3,1,1,4,0,yes,4,14,14,15,1,FALSE +GP,M,19,U,LE3,A,4,3,services,at_home,reputation,mother,1,2,no,yes,no,yes,yes,yes,no,4,3,1,1,1,1,0,no,8,11,12,12,1,FALSE +GP,M,18,U,GT3,T,2,1,other,other,home,mother,1,2,no,no,yes,yes,yes,yes,no,5,2,4,1,2,4,0,no,5,16,15,15,1.5,FALSE +GP,F,17,U,LE3,T,2,2,services,services,course,father,1,4,no,no,yes,yes,yes,yes,yes,3,4,1,1,1,2,0,yes,1,10,10,6,1,FALSE +GP,F,18,U,GT3,T,4,3,services,other,home,father,1,2,no,yes,no,yes,yes,yes,yes,3,1,2,1,3,2,0,yes,12,16,16,16,2,FALSE +GP,M,18,U,GT3,T,4,3,teacher,other,course,mother,1,2,no,yes,no,no,yes,yes,no,4,3,2,1,1,3,0,yes,2,9,9,10,1,FALSE +GP,M,18,R,GT3,T,3,2,other,other,course,mother,1,3,no,no,yes,no,yes,no,no,5,3,2,1,1,3,0,no,2,12,12,12,1,FALSE +GP,F,17,U,GT3,T,3,3,other,other,home,mother,1,3,no,no,yes,no,yes,no,no,3,2,3,1,1,4,0,no,3,12,10,11,1,FALSE 
+GP,F,18,U,GT3,T,2,2,at_home,services,home,mother,1,3,no,yes,yes,yes,yes,yes,yes,4,3,3,1,1,3,0,yes,0,10,11,6,1,FALSE +GP,M,18,R,LE3,A,3,4,other,other,reputation,mother,2,2,no,yes,yes,yes,yes,yes,no,4,2,5,3,4,1,0,yes,10,16,16,16,3.5,TRUE +GP,M,17,U,GT3,T,3,1,services,other,other,mother,1,2,no,no,yes,yes,yes,yes,yes,5,4,4,3,4,5,0,yes,1,10,10,12,3.5,TRUE +GP,F,18,R,GT3,T,4,4,teacher,other,reputation,mother,2,2,no,no,yes,yes,yes,yes,no,4,3,4,2,2,4,0,yes,8,11,10,12,2,FALSE +GP,M,18,U,GT3,T,4,2,health,other,reputation,father,1,2,no,yes,yes,yes,yes,yes,yes,5,4,5,1,3,5,0,yes,7,10,10,12,2,FALSE +GP,F,18,R,GT3,T,2,1,other,other,reputation,mother,2,2,no,yes,no,yes,no,yes,yes,4,3,5,1,2,3,0,no,6,7,4,5,1.5,FALSE +GP,F,19,U,GT3,T,3,3,other,services,home,other,1,2,no,yes,yes,yes,yes,yes,no,4,3,5,3,3,5,1,yes,16,10,10,10,3,TRUE +GP,F,18,U,GT3,T,2,3,other,services,reputation,father,1,4,no,yes,yes,yes,yes,yes,yes,4,5,5,1,3,2,0,yes,7,16,15,15,2,FALSE +GP,F,18,U,LE3,T,1,1,other,other,home,mother,2,2,no,yes,no,no,yes,no,no,4,4,3,1,1,3,0,yes,2,12,12,12,1,FALSE +GP,M,17,R,GT3,T,1,2,at_home,at_home,home,mother,1,2,no,yes,yes,no,yes,no,yes,3,5,2,2,2,1,0,yes,2,16,16,16,2,FALSE +GP,F,17,U,GT3,T,2,4,at_home,health,reputation,mother,2,2,no,yes,no,yes,yes,yes,yes,4,3,3,1,1,1,0,yes,4,12,13,13,1,FALSE +GP,F,17,U,LE3,T,2,2,services,other,course,mother,2,2,yes,yes,no,yes,yes,yes,yes,4,4,4,2,3,5,0,yes,6,12,12,12,2.5,TRUE +GP,F,18,R,GT3,A,3,2,other,services,home,mother,2,2,no,no,no,no,no,yes,yes,4,1,1,1,1,5,0,no,45,11,9,10,1,FALSE +GP,M,18,U,GT3,T,4,4,teacher,services,home,mother,2,1,no,no,yes,yes,yes,yes,no,3,2,4,1,4,3,0,yes,14,10,10,10,2.5,TRUE +GP,F,18,U,GT3,T,4,4,health,health,reputation,father,1,2,yes,yes,yes,yes,yes,yes,yes,2,4,4,1,1,4,1,no,8,12,10,10,1,FALSE +GP,M,18,U,LE3,T,4,3,teacher,services,course,mother,2,1,no,no,yes,yes,yes,yes,no,4,2,3,1,2,1,0,yes,4,10,10,10,1.5,FALSE +GP,M,17,U,LE3,A,4,1,services,other,home,mother,2,1,no,no,yes,yes,yes,yes,yes,4,5,4,2,4,5,0,yes,26,10,10,9,3,TRUE 
+GP,M,17,U,LE3,A,3,2,teacher,services,home,mother,1,1,no,no,no,yes,yes,yes,no,4,4,4,3,4,3,0,no,18,12,11,12,3.5,TRUE +GP,F,18,R,LE3,T,1,1,at_home,other,reputation,mother,2,4,no,yes,yes,yes,yes,no,no,5,2,2,1,1,3,0,yes,2,14,14,15,1,FALSE +GP,F,18,U,GT3,T,1,1,other,other,home,mother,2,2,yes,no,yes,yes,yes,yes,no,5,4,4,1,1,4,0,no,2,10,11,12,1,FALSE +GP,F,17,U,GT3,T,2,2,other,other,course,mother,1,2,no,yes,no,no,yes,yes,no,5,4,5,1,2,5,0,no,8,11,10,12,1.5,FALSE +GP,M,17,U,GT3,T,1,1,other,other,reputation,father,1,2,no,no,no,no,yes,yes,no,4,3,3,1,2,4,0,yes,1,12,11,12,1.5,FALSE +GP,F,18,U,GT3,T,2,2,at_home,at_home,other,mother,1,3,no,yes,no,yes,yes,yes,no,4,3,3,1,2,2,0,yes,2,18,18,18,1.5,FALSE +GP,F,17,U,GT3,T,1,1,services,teacher,reputation,mother,1,3,no,yes,no,yes,yes,yes,no,4,3,3,1,1,3,0,yes,3,13,12,13,1,FALSE +GP,M,18,U,GT3,T,2,1,services,services,reputation,mother,1,3,no,no,yes,yes,yes,yes,no,4,2,4,1,3,2,0,yes,3,14,14,14,2,FALSE +GP,M,18,U,LE3,A,4,4,teacher,teacher,reputation,mother,1,2,no,yes,yes,yes,yes,yes,no,5,4,3,1,1,2,0,yes,4,16,15,16,1,FALSE +GP,M,18,U,GT3,T,4,2,teacher,other,home,mother,1,2,no,yes,yes,yes,yes,yes,yes,4,3,2,1,4,5,0,yes,6,14,14,14,2.5,TRUE +GP,F,17,U,GT3,T,4,3,health,services,reputation,mother,1,3,no,yes,no,yes,yes,yes,no,4,2,2,1,2,3,0,yes,0,16,16,16,1.5,FALSE +GP,F,17,R,LE3,T,3,1,services,other,reputation,mother,2,4,no,yes,no,yes,yes,no,no,3,1,2,1,1,3,0,yes,3,18,18,18,1,FALSE +GP,M,18,R,LE3,T,3,2,services,other,reputation,mother,2,3,no,yes,yes,yes,yes,yes,no,5,4,2,1,1,4,0,yes,4,14,14,14,1,FALSE +GP,M,17,U,GT3,T,3,3,health,other,home,mother,1,1,no,yes,no,yes,yes,yes,no,4,4,3,1,3,5,0,yes,2,14,14,13,2,FALSE +GP,F,19,U,GT3,T,4,4,health,other,reputation,other,2,2,no,yes,yes,yes,yes,yes,no,2,3,4,2,3,2,0,yes,1,12,11,6,2.5,TRUE +GP,F,18,U,LE3,T,4,3,other,other,home,other,2,2,no,yes,no,yes,yes,yes,yes,4,4,5,1,2,2,0,yes,5,12,11,11,1.5,FALSE +GP,F,18,U,GT3,T,4,3,other,other,reputation,father,1,4,no,yes,no,yes,yes,yes,no,4,3,3,1,1,3,0,yes,0,15,15,16,1,FALSE 
+GP,M,18,U,LE3,T,4,4,teacher,teacher,home,mother,1,1,no,yes,no,yes,yes,yes,yes,1,4,2,2,2,1,0,yes,2,17,16,16,2,FALSE +GP,F,18,U,LE3,A,4,4,health,other,home,mother,1,2,no,yes,no,yes,yes,yes,yes,4,2,4,1,1,4,0,no,7,13,12,13,1,FALSE +GP,M,17,U,LE3,T,4,4,other,teacher,home,father,2,1,no,no,no,yes,yes,yes,no,4,1,1,2,2,5,0,yes,0,12,12,12,2,FALSE +GP,F,17,U,GT3,T,4,2,other,other,reputation,mother,2,3,no,yes,no,yes,yes,yes,no,4,3,3,1,1,3,0,yes,0,16,14,15,1,FALSE +GP,F,17,U,GT3,T,3,2,health,health,reputation,father,1,4,no,yes,yes,no,yes,yes,no,5,2,2,1,2,5,0,yes,0,18,18,18,1.5,FALSE +GP,M,20,U,GT3,A,3,2,services,other,course,other,1,1,no,no,yes,yes,yes,no,no,5,5,3,1,1,5,1,no,0,16,16,16,1,FALSE +GP,M,19,R,GT3,T,3,3,other,services,reputation,father,1,2,no,no,yes,yes,yes,no,yes,4,5,3,1,2,5,0,no,0,12,11,12,1.5,FALSE +GP,F,18,U,GT3,T,2,1,services,other,course,mother,2,2,no,yes,yes,yes,yes,yes,no,5,3,3,1,2,1,0,yes,1,10,10,8,1.5,FALSE +GP,F,18,U,GT3,T,4,3,other,other,course,mother,1,3,no,yes,yes,yes,yes,yes,yes,4,3,4,1,1,5,0,yes,6,12,12,13,1,FALSE +GP,F,17,R,GT3,T,3,4,at_home,services,course,father,1,3,no,yes,yes,no,yes,yes,no,4,3,4,2,5,5,0,yes,1,13,13,14,3.5,TRUE +GP,F,18,U,GT3,T,4,4,teacher,other,course,mother,1,2,no,yes,no,yes,yes,yes,no,4,4,4,3,3,5,0,yes,1,12,11,12,3,TRUE +GP,F,17,U,GT3,A,4,3,services,services,course,mother,1,2,no,yes,no,yes,yes,yes,yes,5,2,2,1,2,5,0,yes,18,14,14,15,1.5,FALSE +GP,F,17,U,GT3,T,2,2,other,other,course,mother,1,2,no,yes,no,yes,yes,no,yes,4,2,2,1,1,3,0,no,8,12,11,11,1,FALSE +GP,F,17,R,LE3,T,2,2,services,services,course,mother,1,3,no,yes,yes,yes,yes,yes,no,3,3,2,2,2,3,0,yes,2,11,11,10,2,FALSE +GP,F,17,U,GT3,T,3,1,services,services,course,father,1,3,no,yes,no,no,yes,yes,no,3,4,3,2,3,5,0,no,0,14,16,16,2.5,TRUE +GP,F,17,U,LE3,T,0,2,at_home,at_home,home,father,2,3,no,no,no,yes,yes,yes,no,3,3,3,2,3,2,0,no,0,15,14,15,2.5,TRUE +GP,M,18,U,GT3,T,4,4,other,other,course,mother,1,3,no,no,yes,yes,yes,yes,no,4,3,3,2,2,3,0,no,2,11,13,12,2,FALSE 
+GP,M,17,U,GT3,T,3,3,other,services,reputation,mother,1,1,no,no,yes,no,yes,yes,no,4,3,5,3,5,5,0,no,2,16,16,16,4,TRUE +GP,M,17,R,GT3,T,2,2,services,other,course,mother,4,1,no,yes,no,yes,yes,yes,no,4,4,5,5,5,4,0,no,5,11,10,10,5,TRUE +GP,F,17,U,GT3,T,4,4,teacher,services,course,mother,1,3,no,yes,yes,yes,yes,yes,no,5,4,4,1,3,4,0,yes,4,12,10,11,2,FALSE +GP,F,17,U,GT3,T,4,4,teacher,teacher,course,mother,2,3,no,yes,no,no,yes,yes,yes,4,3,3,1,2,4,0,yes,4,14,14,14,1.5,FALSE +GP,M,18,U,LE3,T,2,2,other,other,course,mother,1,4,no,yes,yes,yes,yes,yes,no,4,5,5,2,4,5,0,no,1,10,10,10,3,TRUE +GP,F,17,R,GT3,T,2,4,at_home,other,course,father,1,3,no,yes,no,yes,yes,yes,yes,4,4,3,1,1,5,0,no,4,14,14,14,1,FALSE +GP,F,18,U,GT3,T,3,3,services,services,home,mother,1,2,no,no,yes,yes,yes,yes,no,5,3,4,1,1,4,0,no,4,8,6,6,1,FALSE +GP,F,18,U,LE3,T,2,2,other,other,home,other,1,2,no,no,yes,no,yes,yes,yes,4,3,3,1,1,2,0,no,0,9,8,6,1,FALSE +GP,F,18,R,GT3,T,2,2,at_home,other,course,mother,2,4,no,no,yes,yes,yes,no,no,4,4,4,1,1,4,0,no,3,12,11,7,1,FALSE +GP,F,17,U,GT3,T,3,4,services,other,course,mother,1,3,no,no,no,yes,yes,yes,no,4,4,5,1,3,5,0,no,12,14,14,14,2,FALSE +GP,F,17,U,GT3,T,3,2,other,other,home,mother,1,2,no,yes,no,yes,yes,yes,yes,4,3,2,2,3,2,0,yes,0,10,10,8,2.5,TRUE +GP,F,18,U,LE3,T,3,3,services,services,home,mother,1,4,no,yes,no,yes,yes,yes,no,5,3,3,1,1,1,0,no,6,15,14,16,1,FALSE +GP,F,17,R,GT3,A,3,2,other,other,home,mother,1,2,no,yes,no,yes,yes,yes,no,4,3,3,2,3,2,0,yes,2,12,12,13,2.5,TRUE +GP,M,18,U,GT3,T,4,4,teacher,services,home,father,1,2,no,yes,yes,yes,yes,yes,no,4,3,3,2,2,2,0,no,0,11,11,6,2,FALSE +GP,M,18,U,LE3,T,3,4,services,other,home,mother,1,2,no,no,yes,yes,yes,yes,yes,4,3,3,1,3,5,0,no,8,16,16,16,2,FALSE +GP,F,17,U,GT3,A,2,2,at_home,at_home,home,father,1,2,no,yes,no,yes,yes,yes,yes,3,3,1,1,2,4,0,no,9,10,10,7,1.5,FALSE +GP,F,18,U,GT3,T,2,3,at_home,other,course,mother,1,3,no,yes,no,yes,yes,yes,no,4,3,3,1,2,3,0,no,2,11,11,12,1.5,FALSE 
+GP,F,18,U,GT3,T,3,2,other,services,other,mother,1,3,no,no,no,yes,yes,yes,yes,5,4,3,2,3,1,0,no,6,14,14,16,2.5,TRUE +GP,M,18,R,GT3,T,4,3,teacher,services,course,mother,1,3,no,no,no,yes,yes,yes,yes,5,3,2,1,2,4,0,no,6,16,14,16,1.5,FALSE +GP,M,18,U,GT3,T,4,3,teacher,other,course,mother,1,3,no,yes,no,yes,yes,yes,yes,5,4,5,2,3,5,0,yes,0,12,12,12,2.5,TRUE +GP,F,17,U,GT3,T,4,3,health,other,reputation,mother,1,3,no,yes,yes,yes,yes,yes,yes,4,4,3,1,3,4,0,yes,0,12,14,14,2,FALSE +MS,M,18,R,GT3,T,3,2,other,other,course,mother,2,1,no,yes,no,no,yes,yes,no,2,5,5,5,5,5,0,no,9,10,12,12,5,TRUE +MS,M,19,R,GT3,T,1,1,other,services,home,other,3,2,no,no,no,yes,yes,yes,no,5,4,4,3,3,2,2,no,8,9,8,10,3,TRUE +MS,M,17,U,GT3,T,3,3,health,other,course,mother,2,2,no,yes,no,yes,yes,yes,no,4,5,4,2,3,3,0,yes,3,10,11,12,2.5,TRUE +MS,M,18,U,LE3,T,1,3,at_home,services,course,mother,1,1,no,no,no,yes,no,yes,yes,4,3,3,2,3,3,0,no,4,8,8,8,2.5,TRUE +MS,M,19,R,GT3,T,1,1,other,other,home,other,3,1,no,yes,no,yes,yes,yes,no,4,4,4,3,3,5,1,no,4,8,8,9,3,TRUE +MS,M,17,R,GT3,T,4,3,services,other,home,mother,2,2,no,yes,yes,no,yes,yes,yes,4,5,5,1,3,2,0,yes,4,12,11,11,2,FALSE +MS,F,18,U,GT3,T,3,3,services,services,course,father,1,2,no,yes,no,yes,yes,no,yes,5,3,4,1,1,5,0,no,0,10,10,10,1,FALSE +MS,F,17,R,GT3,T,4,4,teacher,services,other,father,2,2,no,yes,yes,yes,yes,yes,no,4,3,3,1,2,5,0,yes,3,12,12,12,1.5,FALSE +MS,F,17,U,LE3,A,3,2,services,other,reputation,mother,2,2,no,no,no,yes,yes,no,yes,1,2,3,1,2,5,0,no,1,14,13,13,1.5,FALSE +MS,M,18,U,LE3,T,1,1,other,services,home,father,2,1,no,no,no,no,yes,yes,yes,3,3,2,1,2,3,0,no,3,12,12,12,1.5,FALSE +MS,F,18,U,LE3,T,1,1,at_home,services,course,father,2,3,no,no,no,yes,yes,yes,no,5,3,2,1,1,4,0,no,0,18,16,17,1,FALSE +MS,F,18,U,GT3,T,3,3,services,services,other,mother,2,2,no,yes,no,yes,yes,yes,yes,4,3,2,1,3,3,0,no,3,12,12,12,2,FALSE +MS,F,17,U,LE3,T,4,4,at_home,at_home,course,mother,1,2,no,yes,yes,yes,yes,yes,yes,2,3,4,1,1,1,0,yes,2,16,14,15,1,FALSE 
+MS,F,17,R,GT3,T,1,2,other,services,course,father,2,2,no,no,no,no,yes,no,no,3,2,2,1,2,3,0,no,0,12,12,12,1.5,FALSE +MS,M,18,R,GT3,T,1,3,at_home,other,course,mother,2,2,no,yes,no,yes,yes,no,no,3,3,4,2,4,3,0,yes,2,9,10,10,3,TRUE +MS,M,18,U,LE3,T,4,4,teacher,services,other,mother,2,3,no,no,no,yes,yes,yes,yes,4,2,2,2,2,5,0,yes,0,14,14,14,2,FALSE +MS,F,17,R,GT3,T,1,1,other,services,reputation,mother,3,1,no,yes,no,yes,yes,yes,yes,5,2,1,1,2,1,1,yes,0,8,7,4,1.5,FALSE +MS,F,18,U,GT3,T,2,3,at_home,services,course,father,2,1,no,yes,no,yes,yes,yes,yes,5,2,3,1,2,4,0,yes,0,10,10,10,1.5,FALSE +MS,F,18,R,GT3,T,4,4,other,teacher,other,father,3,2,no,yes,no,no,yes,yes,yes,3,2,2,4,2,5,0,yes,5,10,8,6,3,TRUE +MS,M,18,R,LE3,T,1,2,at_home,services,other,father,3,1,no,yes,yes,yes,no,yes,yes,4,3,3,2,3,3,0,yes,3,12,11,11,2.5,TRUE +MS,F,17,U,GT3,T,2,2,other,at_home,home,mother,1,3,no,no,yes,yes,yes,no,yes,3,4,3,1,1,3,0,no,8,12,11,12,1,FALSE +MS,F,17,R,GT3,T,1,2,other,other,course,mother,1,1,no,no,yes,yes,yes,yes,no,3,5,5,1,3,1,0,no,9,6,6,7,2,FALSE +MS,F,18,R,LE3,T,4,4,other,other,reputation,mother,2,3,no,no,no,yes,yes,yes,no,5,4,4,1,1,1,0,no,0,17,18,18,1,FALSE +MS,F,18,R,GT3,T,1,1,other,other,home,mother,4,3,no,no,no,yes,yes,yes,no,4,3,2,1,2,4,0,no,3,9,10,11,1.5,FALSE +MS,F,18,R,LE3,T,4,4,teacher,services,course,mother,1,2,no,no,yes,yes,yes,yes,no,5,4,3,3,4,2,0,yes,2,10,12,12,3.5,TRUE +MS,F,18,U,GT3,T,3,3,other,other,home,mother,1,2,no,no,no,yes,yes,yes,yes,4,1,3,1,2,1,0,yes,0,16,16,16,1.5,FALSE +MS,F,17,R,GT3,T,3,1,at_home,other,reputation,mother,1,2,no,yes,yes,no,yes,yes,no,4,5,4,2,3,1,0,yes,14,9,10,10,2.5,TRUE +MS,M,18,U,GT3,T,4,4,teacher,teacher,home,father,1,2,no,no,yes,no,yes,yes,no,3,2,4,1,4,2,0,yes,4,16,16,16,2.5,TRUE +MS,M,18,R,GT3,T,2,1,other,other,other,mother,2,1,no,no,yes,no,yes,yes,yes,4,4,3,1,3,5,0,no,2,7,6,4,2,FALSE +MS,M,17,U,GT3,T,2,3,other,services,home,father,2,2,no,no,yes,yes,yes,yes,no,4,4,3,1,1,3,0,no,3,12,13,13,1,FALSE 
+MS,M,19,R,GT3,T,1,1,other,services,other,mother,2,1,no,no,no,yes,yes,no,no,4,3,2,1,3,5,1,no,0,6,6,0,2,FALSE +MS,M,18,R,GT3,T,4,2,other,other,home,father,2,1,no,no,no,yes,yes,no,no,5,4,3,4,3,3,1,yes,7,6,6,2,3.5,TRUE +MS,F,18,R,GT3,T,2,2,at_home,other,other,mother,2,3,no,no,no,yes,yes,no,no,5,3,3,1,3,4,0,yes,1,12,13,12,2,FALSE +MS,F,18,R,GT3,T,4,4,teacher,at_home,reputation,mother,3,1,no,yes,yes,yes,yes,yes,yes,4,4,3,2,2,5,0,yes,6,6,7,8,2,FALSE +MS,F,19,R,GT3,T,2,3,services,other,course,mother,1,3,no,no,yes,no,yes,yes,no,5,4,2,1,2,5,1,no,2,8,8,5,1.5,FALSE +MS,F,18,U,LE3,T,3,1,teacher,services,course,mother,1,2,no,yes,no,yes,yes,yes,no,4,3,4,1,1,1,0,yes,2,11,12,12,1,FALSE +MS,F,18,U,GT3,T,1,1,other,other,course,mother,2,2,no,no,yes,yes,yes,no,no,1,1,1,1,1,5,0,no,3,8,8,4,1,FALSE +MS,M,17,U,LE3,T,3,1,services,services,course,mother,2,1,no,no,no,no,yes,yes,no,2,4,5,3,4,2,0,no,4,12,13,13,3.5,TRUE +MS,M,18,R,LE3,T,3,2,services,other,course,mother,3,1,no,no,no,no,yes,yes,no,4,4,1,3,4,5,0,no,2,10,12,10,3.5,TRUE diff --git a/chapter1.Rmd b/chapter1.Rmd index 439a9f99b..52dab0886 100644 --- a/chapter1.Rmd +++ b/chapter1.Rmd @@ -1,13 +1,27 @@ +# IODS Course Project + +SUBAM KATHET + +This course was recommended by a friend who is in the masters program in data science at the University of Helsinki. Data management, mining, data analytic and machine learning among many other within the same sphere are the next generation skill set everyone is recommended to acquire and here I am. I am a bit nervous, very exited and mostly curious to take a deep dive into the world of data science. # About the project -*Write a short description about the course and add a link to your GitHub repository here. This is an R Markdown (.Rmd) file so you should use R Markdown syntax.* +Here is the link to my github webpage. + +https://iamsubam.github.io/IODS-project/ + +And here is the link to my course diary. + +https://github.com/iamsubam/IODS-project + +# Week 1: Start me up !! 
The book and material + +I have only had some time to browse through the R for Health Data Science. Coming from a background of experimental epidemiology, I was drawn immediately by linear and logistic regression because this is something I often rely on in my work. I looked into survival analysis briefly because of my interest and found it quite interesting. Although I need to practice a lot before I can get my hands around the analysis. I think the book gives a great over view of essential statistical analysis required on a fundamental level. Some knowledge of statistics can be of great advantage as R platform is already designed with a steep learning curve. ```{r} # This is a so-called "R chunk" where you can write R code. date() - +# Trying to check if the chunk works or not. It is usually a struggle especially when R version is outdated and the packages doesn't work. Then re installing and updating the packages and coming back to work it out is a big struggle. ``` -The text continues here. diff --git a/chapter1.html b/chapter1.html new file mode 100644 index 000000000..5b132fe64 --- /dev/null +++ b/chapter1.html @@ -0,0 +1,443 @@ + + + + + + + + + + + + + +chapter1.knit + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + + + + +
+

IODS Course Project

+

SUBAM KATHET

+

This course was recommended by a friend who is in the masters program +in data science at the University of Helsinki. Data management, mining, +data analytic and machine learning among many other within the same +sphere are the next generation skill set everyone is recommended to +acquire and here I am. I am a bit nervous, very exited and mostly +curious to take a deep dive into the world of data science.

+
+
+

About the project

+

Here is the link to my github webpage.

+

https://iamsubam.github.io/IODS-project/

+

And here is the link to my course diary.

+

https://github.com/iamsubam/IODS-project

+
+
+

Week 1: Start me up !! The book and material

+

I have only had some time to browse through the R for Health Data +Science. Coming from a background of experimental epidemiology, I was +drawn immediately by linear and logistic regression because this is +something I often rely on in my work. I looked into survival analysis +briefly because of my interest and found it quite interesting. Although +I need to practice a lot before I can get my hands around the analysis. +I think the book gives a great over view of essential statistical +analysis required on a fundamental level. Some knowledge of statistics +can be of great advantage as R platform is already designed with a steep +learning curve.

+
# This is a so-called "R chunk" where you can write R code.
+
+date()
+
## [1] "Mon Nov 14 15:29:30 2022"
+
# Trying to check if the chunk works or not. It is usually a struggle especially when R version is outdated and the packages doesn't work. Then re installing and updating the packages and coming back to work it out is a big struggle. 
+
+ + + + +
+ + + + + + + + + + + + + + + diff --git a/chapter2.Rmd b/chapter2.Rmd index 7211545ff..4dfc6f8b4 100644 --- a/chapter2.Rmd +++ b/chapter2.Rmd @@ -1,13 +1,604 @@ -# Insert chapter 2 title here +# Week 2: Regression and model validation -*Describe the work you have done this week and summarize your learning.* +This set consists of a few numbered exercises. +Go to each exercise in turn and do as follows: -- Describe your work and results clearly. -- Assume the reader has an introductory course level understanding of writing and reading R code as well as statistical methods. -- Assume the reader has no previous knowledge of your data or the more advanced methods you are using. +1. Read the brief description of the exercise. +2. Run the (possible) pre-exercise-code chunk. +3. Follow the instructions to fix the R code! + +## 2.0 Installing the packages + +One or more extra packages (in addition to `tidyverse`) will be needed below. + +```{r} +# Select (with mouse or arrow keys) the install.packages("...") and +# run it (by Ctrl+Enter / Cmd+Enter): + +library(dplyr) +library(tidyverse) +library(GGally) +library(ggplot2) +library(gapminder) +library(finalfit) +library(broom) + + +``` + + +## 2.1 Reading data from the web + +The first step of data analysis with R is reading data into R. This is done with a function. Which function and function arguments to use to do this, depends on the original format of the data. + +Conveniently in R, the same functions for reading data can usually be used whether the data is saved locally on your computer or somewhere else behind a web URL. + +After the correct function has been identified and data read into R, the data will usually be in R `data.frame` format. Te dimensions of a data frame are ($n$,$d$), where $n$ is the number of rows (the observations) and $d$ the number of columns (the variables). 
+ +**The purpose of this course is to expose you to some basic and more advanced tasks of programming and data analysis with R.** + +```{r, echo=FALSE} +# Pre-exercise-code (Run this code chunk first! Do NOT edit it.) + +# Click the green arrow ("Run Current Chunk") in the upper-right corner of this chunk. This will initialize the R objects needed in the exercise. Then move to Instructions of the exercise to start working. + +# (No pre-exercise code in this exercise! Just go on!) + +``` + +### Instructions +- Read the `lrn14` data frame to memory with `read.table()`. There is information related to the data [here](http://www.helsinki.fi/~kvehkala/JYTmooc/JYTOPKYS3-meta.txt) +- Use `dim()` on the data frame to look at the dimensions of the data. How many rows and colums does the data have? +- Look at the structure of the data with `str()`. + +Hint: +- For both functions you can pass a data frame as the first (unnamed) argument. + +### R code +```{r} +# This is a code chunk in RStudio editor. +# Work with the exercise in this chunk, step-by-step. Fix the R code! + +# read the data into memory +lrn14 <- read.table("http://www.helsinki.fi/~kvehkala/JYTmooc/JYTOPKYS3-meta.txt", sep="\t", header=TRUE) + +# Look at the dimensions of the data + +# Look at the structure of the data +#use .txt file to import data set for better description. +# Preliminary results available at http://www.slideshare.net/kimmovehkalahti/the-relationship-between-learning-approaches-and-students-achievements-in-an-introductory-statistics-course-in-finland +#Total respondents n=183, total question n=60, so 184 rows including heading and 60 columns +#The code as respective column heading represents a question related to the survey and number. Each SN is a respondents and the answers to each question are given in a Lickert scale (0-5). 
+
+dim(lrn14)
+str(lrn14)
+```
+
+
+## 2.2 Scaling variables
+
+The next step is [wrangling the data](https://en.wikipedia.org/wiki/Data_wrangling) into a format that is easy to analyze. We will wrangle our data for the next few exercises.
+
+A neat thing about R is that many operations are *vectorized*. It means that a single operation can affect all elements of a vector. This is often convenient.
+
+The column `Attitude` in `lrn14` is a sum of 10 questions related to students' attitude towards statistics, each measured on the [Likert scale](https://en.wikipedia.org/wiki/Likert_scale) (1-5). Here we'll scale the combination variable back to the 1-5 scale.
+
+```{r, echo=FALSE}
+# Pre-exercise-code (Run this code chunk first! Do NOT edit it.)
+
+# Click the green arrow ("Run Current Chunk") in the upper-right corner of this chunk. This will initialize the R objects needed in the exercise. Then move to Instructions of the exercise to start working.
+
+# read the data into memory
+lrn14 <- read.table("http://www.helsinki.fi/~kvehkala/JYTmooc/JYTOPKYS3-data.txt", sep="\t", header=TRUE)
+
+```
+
+### Instructions
+- Execute the example codes to see how vectorized division works
+- Use vector division to create a new column `attitude` in the `lrn14` data frame, where each observation of `Attitude` is scaled back to the original scale of the questions, by dividing it with the number of questions.
+
+Hint:
+- Assign 'Attitude divided by 10' to the new column 'attitude'.
+
+### R code
+```{r}
+lrn14$attitude <- lrn14$Attitude / 10
+```
+
+
+## 2.3 Combining variables
+
+Our data includes many questions that can be thought to measure the same *dimension*. You can read more about the data and the variables [here](http://www.helsinki.fi/~kvehkala/JYTmooc/JYTOPKYS3-meta.txt). Here we'll combine multiple questions into combination variables. 
Useful functions for summation with data frames in R are
+
+function | description
+------------- | ----------
+`colSums(df)` | returns a sum of each column in `df`
+`rowSums(df)` | returns a sum of each row in `df`
+`colMeans(df)`| returns the mean of each column in `df`
+`rowMeans(df)`| returns the mean of each row in `df`
+
+We'll combine the use of `rowMeans()` with the `select()` function from the **dplyr** library to average the answers of selected questions. See how it is done from the example codes.
+
+```{r, echo=FALSE}
+# Pre-exercise-code (Run this code chunk first! Do NOT edit it.)
+
+# Click the green arrow ("Run Current Chunk") in the upper-right corner of this chunk. This will initialize the R objects needed in the exercise. Then move to Instructions of the exercise to start working.
+
+# read the data into memory
+lrn14 <- read.table("http://www.helsinki.fi/~kvehkala/JYTmooc/JYTOPKYS3-data.txt", sep="\t", header=TRUE)
+lrn14$attitude <- lrn14$Attitude / 10
+```
+
+### Instructions
+- Access the **dplyr** library
+- Execute the example codes to create the combination variables 'deep' and 'surf' as columns in `lrn14`
+- Select the columns related to strategic learning from `lrn14`
+- Create the combination variable 'stra' as a column in `lrn14`
+
+Hints:
+- Columns related to strategic learning are in the object `strategic_questions`. Use it for selecting the correct columns.
+- Use the function `rowMeans()` identically to the examples
+
+### R code
+```{r}
+# Work with the exercise in this chunk, step-by-step. Fix the R code! 
+# lrn14 is available + +# Access the dplyr library +library(dplyr) + +# questions related to deep, surface and strategic learning +deep_questions <- c("D03", "D11", "D19", "D27", "D07", "D14", "D22", "D30","D06", "D15", "D23", "D31") +surface_questions <- c("SU02","SU10","SU18","SU26", "SU05","SU13","SU21","SU29","SU08","SU16","SU24","SU32") +strategic_questions <- c("ST01","ST09","ST17","ST25","ST04","ST12","ST20","ST28") + + +# select the columns related to deep learning +deep_columns <- select(lrn14, one_of(deep_questions)) +# and create column 'deep' by averaging +lrn14$deep <- rowMeans(deep_columns) + +# select the columns related to surface learning +surface_columns <- select(lrn14, one_of(surface_questions)) +# and create column 'surf' by averaging +lrn14$surf <- rowMeans(surface_columns) + +# select the columns related to strategic learning +strategic_columns <- select(lrn14, one_of(strategic_questions)) +# and create column 'stra' by averaging +lrn14$stra <- rowMeans(strategic_columns) + +``` + + +## 2.4 Selecting columns + +Often it is convenient to work with only a certain column or a subset of columns of a bigger data frame. There are many ways to select columns of data frame in R and you saw one of them in the previous exercise: `select()` from **dplyr***. + +**dplyr** is a popular library for *data wrangling*. There are also convenient [data wrangling cheatsheets by RStudio](https://www.rstudio.com/resources/cheatsheets/) to help you get started (dplyr, tidyr etc.) + +```{r, echo=FALSE} +# Pre-exercise-code (Run this code chunk first! Do NOT edit it.) + +# Click the green arrow ("Run Current Chunk") in the upper-right corner of this chunk. This will initialize the R objects needed in the exercise. Then move to Instructions of the exercise to start working. 
+ +lrn14 <- read.table("http://www.helsinki.fi/~kvehkala/JYTmooc/JYTOPKYS3-data.txt", sep="\t", header=TRUE) +lrn14$attitude <- lrn14$Attitude / 10 +deep_questions <- c("D03", "D11", "D19", "D27", "D07", "D14", "D22", "D30","D06", "D15", "D23", "D31") +lrn14$deep <- rowMeans(lrn14[, deep_questions]) +surface_questions <- c("SU02","SU10","SU18","SU26", "SU05","SU13","SU21","SU29","SU08","SU16","SU24","SU32") +lrn14$surf <- rowMeans(lrn14[, surface_questions]) +strategic_questions <- c("ST01","ST09","ST17","ST25","ST04","ST12","ST20","ST28") +lrn14$stra <- rowMeans(lrn14[, strategic_questions]) +``` + + +### Instructions +- Access the **dplyr** library +- Create object `keep_columns` +- Use `select()` (possibly together with `one_of()`) to create a new data frame `learning2014` with the columns named in `keep_columns`. +- Look at the structure of the new dataset + +Hint: +- See the previous exercise or the data wrangling cheatsheet for help on how to select columns + +### R code +```{r} +# Work with the exercise in this chunk, step-by-step. Fix the R code! + +# lrn14 is available + +# access the dplyr library +library(dplyr) + +# choose a handful of columns to keep +keep_columns <- c("gender","Age","attitude", "deep", "stra", "surf", "Points") + +# select the 'keep_columns' to create a new dataset +learning2014 <- select(lrn14,all_of(keep_columns)) + +# see the structure of the new dataset + +print(learning2014) +``` +## 2.5 Modifying column names + +Sometimes you want to rename your column. You could do this by creating copies of the columns with new names, but you can also directly get and set the column names of a data frame, using the function `colnames()`. + +The **dplyr** library has a `rename()` function, which can also be used. Remember [the cheatsheets](https://www.rstudio.com/resources/cheatsheets/). 
+ +### Instructions +- Print out the column names of `learning2014` +- Change the name of the second column to 'age' +- Change the name of 'Points' to 'points' +- Print out the column names again to see the changes + +Hint: +- You can use `colnames()` similarily to the example. Which index matches the column 'Points'? + +### R code + +```{r} +print(names(learning2014)) +colnames(learning2014)[2] <- "age" +learning2014 <- rename(learning2014, points = Points) +``` ```{r} -date() +print(dim(learning2014)) #check the dimension now (must have 166 rown and 7) ``` -Here we go again... +## 2.6 Excluding observations + +Often your data includes outliers or other observations which you wish to remove before further analysis. Or perhaps you simply wish to work with some subset of your data. + +In the **learning2014** data the variable 'points' denotes the students exam points in a statistics course exam. If the student did not attend an exam, the value of 'points' will be zero. We will remove these observations from the data. + +### R code +```{r, echo=FALSE} +learning2014 <- learning2014[learning2014$points > 0,] +``` + +### Instructions +- Access the **dplyr** library +- As an example, create object `male_students` by selecting the male students from `learning2014` +- Override `learning2014` and select rows where the 'points' variable is greater than zero. +- If you do not remember how logical comparison works in R, see the 'Logical comparison' exercise from the course 'R Short and Sweet'. + +Hint: +- The "greater than" logical operator is `>` + +```{r} +dim(lrn14) +dim(learning2014) + +#Export csv file +setwd("~/Documents/GitHub/IODS-project") +write_csv(learning2014, 'learning2014.csv') + +``` + +## 2.7 Visualizations with ggplot2 + +[**ggplot2**](http://ggplot2.org/) is a popular library for creating stunning graphics with R. It has some advantages over the basic plotting system in R, mainly consistent use of function arguments and flexible plot alteration. 
ggplot2 is an implementation of Leland Wilkinson's *Grammar of Graphics* — a general scheme for data visualization. + +In ggplot2, plots may be created via the convenience function `qplot()` where arguments and defaults are meant to be similar to base R's `plot()` function. More complex plotting capacity is available via `ggplot()`, which exposes the user to more explicit elements of the grammar. (from [wikipedia](https://en.wikipedia.org/wiki/Ggplot2)) + +RStudio has a [cheatsheet](https://www.rstudio.com/resources/cheatsheets/) for data visualization with ggplot2. + +```{r, echo=FALSE} +# Pre-exercise-code (Run this code chunk first! Do NOT edit it.) + +# Click the green arrow ("Run Current Chunk") in the upper-right corner of this chunk. This will initialize the R objects needed in the exercise. Then move to Instructions of the exercise to start working. + +learning2014 <- read.table("https://raw.githubusercontent.com/KimmoVehkalahti/Helsinki-Open-Data-Science/master/datasets/learning2014.txt", + sep = ",", header = T) +``` + +### Instructions + +- Access the **ggplot2** library +- Initialize the plot with data and aesthetic mappings +- Adjust the plot initialization: Add an aesthetic element to the plot by defining `col = gender` inside `aes()`. +- Define the visualization type (points) +- Draw the plot to see how it looks at this point +- *Add* a regression line to the plot +- *Add* the title "Student's attitude versus exam points" with `ggtitle("")` to the plot with regression line +- Draw the plot again to see the changes + +Hints: +- Use `+` to add the title to the plot +- The plot with regression line is saved in the object `p3` +- You can draw the plot by typing the object name where the plot is saved + +### R code +```{r} +# Work with the exercise in this chunk, step-by-step. Fix the R code! 
+
+# learning2014 is available
+
+# Access the ggplot2 library
+library(ggplot2)
+
+# initialize plot with data and aesthetic mapping
+p1 <- ggplot(learning2014, aes(x = attitude, y = points))
+
+# define the visualization type (points)
+p2 <- p1 + geom_point()
+
+# draw the plot
+p2
+
+# add a regression line
+p3 <- p2 + geom_smooth(method = "lm")
+
+# draw the plot
+p3
+
+#Lets try and overview summary
+p <- ggpairs(learning2014, mapping = aes(col = gender, alpha = 0.3), lower = list(combo = wrap("facethist", bins = 20)))
+ # draw the plot!
+p
+
+```
+Fitting a regression line in a scatter plot between points and attitude doesn't provide a strong linear relationship. Would be interesting to work on this particular model as exercise to predict the relationship between these two variables.
+
+## 2.8 Exploring a data frame
+
+Often the most interesting features of your data are the relationships between the variables. If there are only a handful of variables saved as columns in a data frame, it is possible to visualize all of these relationships neatly in a single plot.
+
+Base R offers a fast plotting function `pairs()`, which draws all possible scatter plots from the columns of a data frame, resulting in a scatter plot matrix. Libraries **GGally** and **ggplot2** together offer a slow but more detailed look at the variables, their distributions and relationships.
+
+```{r, echo=FALSE}
+# Pre-exercise-code (Run this code chunk first! Do NOT edit it.)
+
+# Click the green arrow ("Run Current Chunk") in the upper-right corner of this chunk. This will initialize the R objects needed in the exercise. Then move to Instructions of the exercise to start working. 
+ +learning2014 <- read.table("https://raw.githubusercontent.com/KimmoVehkalahti/Helsinki-Open-Data-Science/master/datasets/learning2014.txt", + sep = ",", header = T) +``` + +### Instructions + +- Draw a scatter matrix of the variables in learning2014 (other than gender) +- Adjust the code: Add the argument `col` to the `pairs()` function, defining the colour with the 'gender' variable in learning2014. +- Draw the plot again to see the changes. +- Access the **ggpot2** and **GGally** libraries and create the plot `p` with `ggpairs()`. +- Draw the plot. Note that the function is a bit slow. +- Adjust the argument `mapping` of `ggpairs()` by defining `col = gender` inside `aes()`. +- Draw the plot again. +- Adjust the code a little more: add another aesthetic element `alpha = 0.3` inside `aes()`. +- See the difference between the plots? + +Hints: +- You can use `$` to access a column of a data frame. +- Remember to separate function arguments with a comma +- You can draw the plot `p` by simply typing it's name: just like printing R objects. + +### R code +```{r} +# Work with the exercise in this chunk, step-by-step. Fix the R code! +# learning2014 is available + +# draw a scatter plot matrix of the variables in learning2014. +# [-1] excludes the first column (gender) +pairs(learning2014[-1]) + +# access the GGally and ggplot2 libraries +library(GGally) +library(ggplot2) + +# create a more advanced plot matrix with ggpairs() +p <- ggpairs(learning2014, mapping = aes(), lower = list(combo = wrap("facethist", bins = 20))) + + + + +``` + + +## 2.9 Simple regression + +[Regression analysis](https://en.wikipedia.org/wiki/Regression_analysis) with R is easy once you have your data in a neat data frame. You can simply use the `lm()` function to fit a linear model. The first argument of `lm()` is a `formula`, which defines the target variable and the explanatory variable(s). 
+ +The formula should be `y ~ x`, where `y` is the target (or outcome) variable and `x` the explanatory variable (predictor). The second argument of `lm()` is `data`, which should be a data frame where `y` and `x` are columns. + +The output of `lm()` is a linear model object, which can be saved for later use. The generic function `summary()` can be used to print out a summary of the model. + +```{r, echo=FALSE} +# Pre-exercise-code (Run this code chunk first! Do NOT edit it.) + +# Click the green arrow ("Run Current Chunk") in the upper-right corner of this chunk. This will initialize the R objects needed in the exercise. Then move to Instructions of the exercise to start working. + +learning2014 <- read.table("https://raw.githubusercontent.com/KimmoVehkalahti/Helsinki-Open-Data-Science/master/datasets/learning2014.txt", + sep = ",", header = T) +``` + +### Instructions + +- Create a scatter plot of 'points' versus 'attitude'. +- Fit a regression model where 'points' is the target and 'attitude' is the explanatory variable +- Print out the summary of the linear model object + +Hints: +- Replace `1` with the name of the explanatory variable in the formula inside `lm()` +- Use `summary()` on the model object to print out a summary + +### R code +```{r} +# Work with the exercise in this chunk, step-by-step. Fix the R code! +# learning2014 is available + +# a scatter plot of points versus attitude +library(ggplot2) +qplot(attitude, points, data = learning2014) + geom_smooth(method = "lm") + +# fit a linear model +my_model <- lm(points ~ 1, data = learning2014) + +# print out a summary of the model +summary(my_model) + +``` + + +## 2.10 Multiple regression + +When there are more than one explanatory variables in the linear model, it is called multiple regression. In R, it is easy to include more than one explanatory variables in your linear model. 
This is done by simply defining more explanatory variables with the `formula` argument of `lm()`, as below + +``` +y ~ x1 + x2 + .. +``` +Here `y` is again the target variable and `x1, x2, ..` are the explanatory variables. + +```{r, echo=FALSE} +# Pre-exercise-code (Run this code chunk first! Do NOT edit it.) + +# Click the green arrow ("Run Current Chunk") in the upper-right corner of this chunk. This will initialize the R objects needed in the exercise. Then move to Instructions of the exercise to start working. + +learning2014 <- read.table("https://raw.githubusercontent.com/KimmoVehkalahti/Helsinki-Open-Data-Science/master/datasets/learning2014.txt", + sep = ",", header = T) +``` + +### Instructions + +- Draw a plot matrix of the learning2014 data with `ggpairs()` +- Fit a regression model where `points` is the target variable and both `attitude` and `stra` are the explanatory variables. +- Print out a summary of the model. +- Adjust the code: Add one more explanatory variable to the model. Based on the plot matrix, choose the variable with the third highest (absolute) correlation with the target variable and use that as the third variable. +- Print out a summary of the new model. + +Hint: +- The variable with the third highest absolute correlation with `points` is `surf`. + +### R code +```{r} +# Work with the exercise in this chunk, step-by-step. Fix the R code! +# learning2014 is available + +library(GGally) +library(ggplot2) +# create an plot matrix with ggpairs() +ggpairs(learning2014, lower = list(combo = wrap("facethist", bins = 20))) + +# create a regression model with multiple explanatory variables +my_model2 <- lm(points ~ attitude + stra, data = learning2014) + +# print out a summary of the model +summary(my_model2) + +``` + + +## 2.11 Graphical model validation + +R makes it easy to graphically explore the validity of your model assumptions. 
If you give a linear model object as the first argument to the `plot()` function, the function automatically assumes you want diagnostic plots and will produce them. You can check the help page of plotting an lm object by typing `?plot.lm` or `help(plot.lm)` to the R console. + +In the plot function you can then use the argument `which` to choose which plots you want. `which` must be an integer vector corresponding to the following list of plots: + +which | graphic +----- | -------- +1 | Residuals vs Fitted values +2 | Normal QQ-plot +3 | Standardized residuals vs Fitted values +4 | Cook's distances +5 | Residuals vs Leverage +6 | Cook's distance vs Leverage + +
+We will focus on plots 1, 2 and 5: Residuals vs Fitted values, Normal QQ-plot and Residuals vs Leverage. + +```{r, echo=FALSE} +# Pre-exercise-code (Run this code chunk first! Do NOT edit it.) + +# Click the green arrow ("Run Current Chunk") in the upper-right corner of this chunk. This will initialize the R objects needed in the exercise. Then move to Instructions of the exercise to start working. + +learning2014 <- read.table("https://raw.githubusercontent.com/KimmoVehkalahti/Helsinki-Open-Data-Science/master/datasets/learning2014.txt", + sep = ",", header = T) +``` + +### Instructions + +- Create the linear model object `my_model2` +- Produce the following diagnostic plots using the `plot()` function: Residuals vs Fitted values, Normal QQ-plot and Residuals vs Leverage using the argument `which`. +- Before the call to the `plot()` function, add the following: `par(mfrow = c(2,2))`. This will place the following 4 graphics to the same plot. Execute the code again to see the effect. + +Hint: +- You can combine integers to an integer vector with `c()`. For example: `c(1,2,3)`. + +### R code +```{r} +# Work with the exercise in this chunk, step-by-step. Fix the R code! +# learning2014 is available + +# create a regression model with multiple explanatory variables +my_model2 <- lm(points ~ attitude + stra, data = learning2014) + +# draw diagnostic plots using the plot() function. Choose the plots 1, 2 and 5 +plot(my_model2, which = 1) + +plot(my_model2, which = 2) + +plot(my_model2, which = 3) + +plot(my_model2, which = 4) + +plot(my_model2, which = 5) + +plot(my_model2, which = 6) + +``` + +## 2.12 Making predictions + +Okay, so let's assume that we have a linear model which seems to fit our standards. What can we do with it? + +The model quantifies the relationship between the explanatory variable(s) and the dependent variable. The model can also be used for predicting the dependent variable based on new observations of the explanatory variable(s). 
+ +In R, predicting can be done using the `predict()` function. (see `?predict`). The first argument of predict is a model object and the argument `newdata` (a data.frame) can be used to make predictions based on new observations. One or more columns of `newdata` should have the same name as the explanatory variables in the model object. + +```{r, echo=FALSE} +# Pre-exercise-code (Run this code chunk first! Do NOT edit it.) + +# Click the green arrow ("Run Current Chunk") in the upper-right corner of this chunk. This will initialize the R objects needed in the exercise. Then move to Instructions of the exercise to start working. + +learning2014 <- read.table("https://raw.githubusercontent.com/KimmoVehkalahti/Helsinki-Open-Data-Science/master/datasets/learning2014.txt", + sep = ",", header = T) +``` + +### Instructions + +- Create object `m` and print out a summary of the model +- Create object `new_attitudes` +- Adjust the code: Create a new data frame with a column named 'attitude' holding the new attitudes defined in `new_attitudes` +- Print out the new data frame +- `predict()` the new student's exam points based on their attitudes, using the `newdata` argument + +Hints: +- Type `attitude = new_attitudes` inside the `data.frame()` function. +- Give the `new_data` data.frame as the `newdata` argument for `predict()` + +### R code +```{r} +# Work with the exercise in this chunk, step-by-step. Fix the R code! 
+# learning2014 is available + +# Create model object m +m <- lm(points ~ attitude, data = learning2014) + +# print out a summary of the model +summary(m) + +# New observations +new_attitudes <- c("Mia" = 3.8, "Mike"= 4.4, "Riikka" = 2.2, "Pekka" = 2.9) +new_data <- data.frame(attitude = new_attitudes) + +# Print out the new data +summary(new_data) + +# Predict the new students exam points based on attitude +predict(m, newdata = new_data) + + +``` + + diff --git a/chapter2.html b/chapter2.html new file mode 100644 index 000000000..f93b8884a --- /dev/null +++ b/chapter2.html @@ -0,0 +1,1317 @@ + + + + + + + + + + + + + +chapter2.knit + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + + + + +
+

Week 2: Regression and model validation

+

This set consists of a few numbered exercises. Go to each exercise in +turn and do as follows:

+
    +
  1. Read the brief description of the exercise.
  2. +
  3. Run the (possible) pre-exercise-code chunk.
  4. +
  5. Follow the instructions to fix the R code!
  6. +
+
+

2.0 Installing the packages

+

One or more extra packages (in addition to tidyverse) +will be needed below.

+
# Select (with mouse or arrow keys) the install.packages("...") and
+# run it (by Ctrl+Enter / Cmd+Enter):
+
+library(dplyr)
+
## 
+## Attaching package: 'dplyr'
+
## The following objects are masked from 'package:stats':
+## 
+##     filter, lag
+
## The following objects are masked from 'package:base':
+## 
+##     intersect, setdiff, setequal, union
+
library(tidyverse)
+
## ── Attaching packages
+## ───────────────────────────────────────
+## tidyverse 1.3.2 ──
+
## ✔ ggplot2 3.3.6     ✔ purrr   0.3.4
+## ✔ tibble  3.1.8     ✔ stringr 1.4.1
+## ✔ tidyr   1.2.0     ✔ forcats 0.5.2
+## ✔ readr   2.1.2     
+## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
+## ✖ dplyr::filter() masks stats::filter()
+## ✖ dplyr::lag()    masks stats::lag()
+
library(GGally)
+
## Registered S3 method overwritten by 'GGally':
+##   method from   
+##   +.gg   ggplot2
+
library(ggplot2)
+library(gapminder) 
+library(finalfit)
+library(broom)
+
+
+

2.1 Reading data from the web

+

The first step of data analysis with R is reading data into R. This +is done with a function. Which function and function arguments to use to +do this, depends on the original format of the data.

+

Conveniently in R, the same functions for reading data can usually be +used whether the data is saved locally on your computer or somewhere +else behind a web URL.

+

After the correct function has been identified and data read into R, +the data will usually be in R data.frame format. Te +dimensions of a data frame are (\(n\),\(d\)), where \(n\) is the number of rows (the +observations) and \(d\) the number of +columns (the variables).

+

The purpose of this course is to expose you to some basic and +more advanced tasks of programming and data analysis with +R.

+
+

Instructions

+
    +
  • Read the lrn14 data frame to memory with +read.table(). There is information related to the data here
  • +
  • Use dim() on the data frame to look at the dimensions +of the data. How many rows and colums does the data have?
  • +
  • Look at the structure of the data with str().
  • +
+

Hint: - For both functions you can pass a data frame as the first +(unnamed) argument.

+
+
+

R code

+
# This is a code chunk in RStudio editor.
+# Work with the exercise in this chunk, step-by-step. Fix the R code!
+
+# read the data into memory
+lrn14 <- read.table("http://www.helsinki.fi/~kvehkala/JYTmooc/JYTOPKYS3-meta.txt", sep="\t", header=TRUE)
+
## Warning in scan(file = file, what = what, sep = sep, quote = quote, dec = dec, :
+## EOF within quoted string
+
# Look at the dimensions of the data
+
+# Look at the structure of the data
+#use .txt file to import data set for better description.
+# Preliminary results available at http://www.slideshare.net/kimmovehkalahti/the-relationship-between-learning-approaches-and-students-achievements-in-an-introductory-statistics-course-in-finland
+#Total respondents n=183, total question n=60, so 184 rows including heading and 60 columns
+#The code as respective column heading represents a question related to the survey and number. Each SN is a respondents and the answers to each question are given in a Lickert scale (0-5).
+
+dim(lrn14)
+
## [1] 28  1
+
str(lrn14)
+
## 'data.frame':    28 obs. of  1 variable:
+##  $ Variable.descriptions.and.other.meta.data.of.JYTOPKYS3..syksy.2016.: chr  "Kimmo Vehkalahti: ASSIST 2014 - Phase 3 (end of Part 2), N=183" "Course: Johdatus yhteiskuntatilastotieteeseen, syksy 2014" "(Introduction to Social Statistics, fall 2014 - in Finnish)," "international survey of Approaches to Learning, made possible" ...
+
+
+
+

2.2 Scaling variables

+

The next step is wrangling the +data into a format that is easy to analyze. We will wrangle our data +for the next few exercises.

+

A neat thing about R is that may operations are vectorized. +It means that a single operation can affect all elements of a vector. +This is often convenient.

+

The column Attitude in lrn14 is a sum of 10 +questions related to students attitude towards statistics, each measured +on the Likert +scale (1-5). Here we’ll scale the combination variable back to the +1-5 scale.

+
+

Instructions

+
    +
  • Execute the example codes to see how vectorized division works
  • +
  • Use vector division to create a new column attitude in +the lrn14 data frame, where each observation of +Attitude is scaled back to the original scale of the +questions, by dividing it with the number of questions.
  • +
+

Hint: - Assign ‘Attitude divided by 10’ to the new column +’attitude.

+
+
+

R code

+
lrn14$attitude <- lrn14$Attitude / 10
+
+
+
+

2.3 Combining variables

+

Our data includes many questions that can be thought to measure the +same dimension. You can read more about the data and the +variables here. +Here we’ll combine multiple questions into combination variables. Useful +functions for summation with data frames in R are

+ + + + + + + + + + + + + + + + + + + + + + + + + +
functiondescription
colSums(df)returns a sum of each column in df
rowSums(df)returns a sum of each row in df
colMeans(df)returns the mean of each column in df
rowMeans(df)return the mean of each row in df
+

We’ll combine the use of rowMeans()with the +select() function from the dplyr library +to average the answers of selected questions. See how it is done from +the example codes.

+
+

Instructions

+
    +
  • Access the dplyr library
  • +
  • Execute the example codes to create the combination variables ‘deep’ +and ‘surf’ as columns in lrn14
  • +
  • Select the columns related to strategic learning from +lrn14
  • +
  • Create the combination variable ‘stra’ as a column in +lrn14
  • +
+

Hints: - Columns related to strategic learning are in the object +strategic_questions. Use it for selecting the correct +columns. - Use the function rowMeans() identically to the +examples

+
+
+

R code

+
# Work with the exercise in this chunk, step-by-step. Fix the R code!
+# lrn14 is available
+
+# Access the dplyr library
+library(dplyr)
+
+# questions related to deep, surface and strategic learning
+deep_questions <- c("D03", "D11", "D19", "D27", "D07", "D14", "D22", "D30","D06",  "D15", "D23", "D31")
+surface_questions <- c("SU02","SU10","SU18","SU26", "SU05","SU13","SU21","SU29","SU08","SU16","SU24","SU32")
+strategic_questions <- c("ST01","ST09","ST17","ST25","ST04","ST12","ST20","ST28")
+
+
+# select the columns related to deep learning 
+deep_columns <- select(lrn14, one_of(deep_questions))
+# and create column 'deep' by averaging
+lrn14$deep <- rowMeans(deep_columns)
+
+# select the columns related to surface learning 
+surface_columns <- select(lrn14, one_of(surface_questions))
+# and create column 'surf' by averaging
+lrn14$surf <- rowMeans(surface_columns)
+
+# select the columns related to strategic learning 
+strategic_columns <- select(lrn14, one_of(strategic_questions))
+# and create column 'stra' by averaging
+lrn14$stra <- rowMeans(strategic_columns)
+
+
+
+

2.4 Selecting columns

+

Often it is convenient to work with only a certain column or a subset +of columns of a bigger data frame. There are many ways to select columns +of data frame in R and you saw one of them in the previous exercise: +select() from dplyr*.

+

dplyr is a popular library for data +wrangling. There are also convenient data wrangling +cheatsheets by RStudio to help you get started (dplyr, tidyr +etc.)

+
+

Instructions

+
    +
  • Access the dplyr library
  • +
  • Create object keep_columns
  • +
  • Use select() (possibly together with +one_of()) to create a new data frame +learning2014 with the columns named in +keep_columns.
  • +
  • Look at the structure of the new dataset
  • +
+

Hint: - See the previous exercise or the data wrangling cheatsheet +for help on how to select columns

+
+
+

R code

+
# Work with the exercise in this chunk, step-by-step. Fix the R code!
+
+# lrn14 is available
+
+# access the dplyr library
+library(dplyr)
+
+# choose a handful of columns to keep
+keep_columns <- c("gender","Age","attitude", "deep", "stra", "surf", "Points")
+
+# select the 'keep_columns' to create a new dataset
+learning2014 <- select(lrn14,all_of(keep_columns))
+
+# see the structure of the new dataset
+
+print(learning2014)
+
##     gender Age attitude     deep  stra     surf Points
+## 1        F  53      3.7 3.583333 3.375 2.583333     25
+## 2        M  55      3.1 2.916667 2.750 3.166667     12
+## 3        F  49      2.5 3.500000 3.625 2.250000     24
+## 4        M  53      3.5 3.500000 3.125 2.250000     10
+## 5        M  49      3.7 3.666667 3.625 2.833333     22
+## 6        F  38      3.8 4.750000 3.625 2.416667     21
+## 7        M  50      3.5 3.833333 2.250 1.916667     21
+## 8        F  37      2.9 3.250000 4.000 2.833333     31
+## 9        M  37      3.8 4.333333 4.250 2.166667     24
+## 10       F  42      2.1 4.000000 3.500 3.000000     26
+## 11       M  37      3.9 3.583333 3.625 2.666667     31
+## 12       F  34      3.8 3.833333 4.750 2.416667     31
+## 13       F  34      2.4 4.250000 3.625 2.250000     23
+## 14       F  34      3.0 3.333333 3.500 2.750000     25
+## 15       M  35      2.6 4.166667 1.750 2.333333     21
+## 16       F  46      2.5 3.083333 3.125 2.666667      0
+## 17       F  33      4.1 3.666667 3.875 2.333333     31
+## 18       F  32      2.6 4.083333 1.375 2.916667     20
+## 19       F  44      2.6 3.500000 3.250 2.500000     22
+## 20       M  29      1.7 4.083333 3.000 3.750000      9
+## 21       F  30      2.7 4.000000 3.750 2.750000     24
+## 22       M  27      3.9 3.916667 2.625 2.333333     28
+## 23       M  29      3.4 4.000000 2.375 2.416667     30
+## 24       F  31      2.7 4.000000 3.625 3.000000     24
+## 25       F  37      2.3 3.666667 2.750 2.416667      9
+## 26       F  26      3.7 3.666667 1.750 2.833333     26
+## 27       F  26      4.4 4.416667 3.250 3.166667     32
+## 28       M  30      4.1 3.916667 4.000 3.000000     32
+## 29       F  37      2.4 3.833333 2.125 2.166667      0
+## 30       F  33      3.7 3.750000 3.625 2.000000     33
+## 31       F  33      2.5 3.250000 2.875 3.500000     29
+## 32       M  28      3.0 3.583333 3.000 3.750000     30
+## 33       M  26      3.4 4.916667 1.625 2.500000     19
+## 34       F  27      3.2 3.583333 3.250 2.083333     23
+## 35       F  25      2.0 2.916667 3.500 2.416667     19
+## 36       F  31      2.4 3.666667 3.000 2.583333     12
+## 37       M  20      4.2 4.500000 3.250 1.583333     10
+## 38       F  39      1.6 4.083333 1.875 2.833333     11
+## 39       M  38      3.1 3.833333 4.375 1.833333     20
+## 40       M  24      3.8 3.250000 3.625 2.416667     26
+## 41       M  26      3.8 2.333333 2.500 3.250000     31
+## 42       M  25      3.3 3.333333 1.250 3.416667     20
+## 43       F  30      1.7 4.083333 4.000 3.416667     23
+## 44       F  25      2.5 2.916667 3.000 3.166667     12
+## 45       M  30      3.2 3.333333 2.500 3.500000     24
+## 46       F  48      3.5 3.833333 4.875 2.666667     17
+## 47       F  24      3.2 3.666667 5.000 2.416667     29
+## 48       F  40      4.2 4.666667 4.375 3.583333     23
+## 49       M  25      3.1 3.750000 3.250 2.083333     28
+## 50       F  23      3.9 3.416667 4.000 3.750000     31
+## 51       F  25      1.9 4.166667 3.125 2.916667     23
+## 52       F  23      2.1 2.916667 2.500 2.916667     25
+## 53       M  27      2.5 4.166667 3.125 2.416667     18
+## 54       M  25      3.2 3.583333 3.250 3.000000     19
+## 55       M  23      3.2 2.833333 2.125 3.416667     22
+## 56       F  23      2.6 4.000000 2.750 2.916667     25
+## 57       F  23      2.3 2.916667 2.375 3.250000     21
+## 58       F  45      3.8 3.000000 3.125 3.250000      9
+## 59       F  22      2.8 4.083333 4.000 2.333333     28
+## 60       F  23      3.3 2.916667 4.000 3.250000     25
+## 61       M  21      4.8 3.500000 2.250 2.500000     29
+## 62       M  21      4.0 4.333333 3.250 1.750000     33
+## 63       F  21      4.0 4.250000 3.625 2.250000     33
+## 64       F  21      4.7 3.416667 3.625 2.083333     25
+## 65       F  26      2.3 3.083333 2.500 2.833333     18
+## 66       F  25      3.1 4.583333 1.875 2.833333     22
+## 67       F  26      2.7 3.416667 2.000 2.416667     17
+## 68       M  21      4.1 3.416667 1.875 2.250000     25
+## 69       F  22      3.4 3.500000 2.625 2.333333      0
+## 70       F  23      3.4 3.416667 4.000 2.833333     28
+## 71       F  22      2.5 3.583333 2.875 2.250000     22
+## 72       F  22      2.1 1.583333 3.875 1.833333     26
+## 73       F  22      1.4 3.333333 2.500 2.916667     11
+## 74       F  23      1.9 4.333333 2.750 2.916667     29
+## 75       M  22      3.7 4.416667 4.500 2.083333     22
+## 76       M  25      4.1 4.583333 3.500 2.666667      0
+## 77       M  23      3.2 4.833333 3.375 2.333333     21
+## 78       M  24      2.8 3.083333 2.625 2.416667     28
+## 79       F  22      4.1 3.000000 4.125 2.750000     33
+## 80       F  23      2.5 4.083333 2.625 3.250000     16
+## 81       M  22      2.8 4.083333 2.250 1.750000     31
+## 82       M  20      3.8 3.750000 2.750 2.583333     22
+## 83       M  22      3.1 3.083333 3.000 3.333333     31
+## 84       M  21      3.5 4.750000 1.625 2.833333     23
+## 85       F  22      3.6 4.250000 1.875 2.500000     26
+## 86       F  23      2.6 4.166667 3.375 2.416667     12
+## 87       M  21      4.4 4.416667 3.750 2.416667     26
+## 88       M  22      4.5 3.833333 2.125 2.583333     31
+## 89       M  29      3.2 3.333333 2.375 3.000000     19
+## 90       F  26      2.0 3.416667 1.750 2.333333      0
+## 91       F  29      3.9 3.166667 2.750 2.000000     30
+## 92       F  21      2.5 3.166667 3.125 3.416667     12
+## 93       M  28      3.3 3.833333 3.500 2.833333     17
+## 94       M  23      3.5 4.166667 1.625 3.416667      0
+## 95       F  21      3.3 4.250000 2.625 2.250000     18
+## 96       F  30      3.0 3.833333 3.375 2.750000     19
+## 97       F  21      2.9 3.666667 2.250 3.916667     21
+## 98       M  23      3.3 3.833333 3.000 2.333333     24
+## 99       F  21      3.3 3.833333 4.000 2.750000     28
+## 100      F  21      3.5 3.833333 3.500 2.750000     17
+## 101      F  20      3.6 3.666667 2.625 2.916667     18
+## 102      M  21      4.2 4.166667 2.875 2.666667      0
+## 103      M  22      3.7 4.333333 2.500 2.083333     17
+## 104      F  35      2.8 4.416667 3.250 3.583333      0
+## 105      M  21      4.2 3.750000 3.750 3.666667     23
+## 106      M  22      2.2 2.666667 2.000 3.416667      0
+## 107      M  21      3.2 4.166667 3.625 2.833333     26
+## 108      F  20      5.0 4.000000 4.125 3.416667     28
+## 109      M  22      4.7 4.000000 4.375 1.583333     31
+## 110      F  20      3.6 4.583333 2.625 2.916667     27
+## 111      F  20      3.6 3.666667 4.000 3.000000     25
+## 112      M  24      2.9 3.666667 2.750 2.916667     23
+## 113      F  20      3.5 3.833333 2.750 2.666667     21
+## 114      F  19      4.0 2.583333 1.375 3.000000     27
+## 115      F  21      3.5 3.500000 2.250 2.750000     28
+## 116      F  21      3.2 3.083333 3.625 3.083333     23
+## 117      F  22      2.6 4.250000 3.750 2.500000     21
+## 118      F  25      2.0 3.166667 4.000 2.333333     25
+## 119      F  21      2.7 3.083333 3.125 3.000000     11
+## 120      F  22      3.2 4.166667 3.250 3.000000     19
+## 121      F  25      3.3 2.250000 2.125 4.000000     24
+## 122      F  20      3.9 3.333333 2.875 3.250000     28
+## 123      M  24      3.3 3.083333 1.500 3.500000     21
+## 124      F  20      3.0 2.750000 2.500 3.500000     24
+## 125      M  21      3.7 3.250000 3.250 3.833333     24
+## 126      F  26      1.4 3.750000 3.250 3.083333      0
+## 127      M  28      3.0 4.750000 2.500 2.500000      0
+## 128      F  20      2.5 4.000000 3.625 2.916667     20
+## 129      F  20      2.9 3.583333 3.875 2.166667     19
+## 130      M  31      3.9 4.083333 3.875 1.666667     30
+## 131      F  20      3.6 4.250000 2.375 2.083333     22
+## 132      F  22      2.9 3.416667 3.000 2.833333     16
+## 133      F  22      2.1 3.083333 3.375 3.416667     16
+## 134      M  21      3.1 3.500000 2.750 3.333333     19
+## 135      F  20      2.4 3.916667 3.125 3.500000      0
+## 136      M  22      4.0 3.666667 4.500 2.583333     30
+## 137      F  21      3.1 4.250000 2.625 2.833333     23
+## 138      F  21      2.3 4.250000 2.750 3.333333     19
+## 139      F  21      2.8 3.833333 3.250 3.000000     18
+## 140      F  21      3.7 4.416667 4.125 2.583333     28
+## 141      F  20      2.6 3.500000 3.375 2.416667     21
+## 142      F  21      2.4 3.583333 2.750 3.583333     19
+## 143      F  25      3.0 3.666667 4.125 2.083333     27
+## 144      F  27      2.9 4.250000 3.000 2.750000      0
+## 145      F  20      3.2 3.750000 4.125 3.916667      0
+## 146      M  21      2.8 2.083333 3.250 4.333333     24
+## 147      F  24      2.9 4.250000 2.875 2.666667     21
+## 148      F  20      2.4 3.583333 2.875 3.000000     20
+## 149      M  21      3.1 4.000000 2.375 2.666667     28
+## 150      F  20      1.9 3.333333 3.875 2.166667     12
+## 151      F  20      2.0 3.500000 2.125 2.666667     21
+## 152      F  18      3.8 3.166667 4.000 2.250000     28
+## 153      F  21      3.4 3.583333 3.250 2.666667     31
+## 154      F  19      3.7 3.416667 2.625 3.333333     18
+## 155      F  21      2.9 4.250000 2.750 3.500000     25
+## 156      F  20      2.3 3.250000 4.000 2.750000     19
+## 157      M  21      4.1 4.416667 3.000 2.000000     21
+## 158      F  20      2.7 3.250000 3.375 2.833333     16
+## 159      F  21      3.5 3.916667 3.875 3.500000      7
+## 160      F  20      3.4 3.583333 3.250 2.500000     21
+## 161      F  18      3.2 4.500000 3.375 3.166667     17
+## 162      M  22      3.3 3.583333 4.125 3.083333     22
+## 163      F  22      3.3 3.666667 3.500 2.916667     18
+## 164      M  24      3.5 2.583333 2.000 3.166667     25
+## 165      F  19      3.2 4.166667 3.625 2.500000     24
+## 166      F  20      3.1 3.250000 3.375 3.833333     23
+## 167      F  21      2.4 3.583333 2.250 2.833333      0
+## 168      F  20      2.8 4.333333 2.125 2.250000     23
+## 169      F  17      1.7 3.916667 4.625 3.416667     26
+## 170      M  19      1.9 2.666667 2.500 3.750000     12
+## 171      F  23      3.2 3.583333 2.000 1.750000      0
+## 172      F  20      3.5 3.083333 2.875 3.000000     32
+## 173      F  20      2.4 3.750000 2.750 2.583333     22
+## 174      F  25      3.8 4.083333 3.375 2.750000      0
+## 175      F  20      2.1 4.166667 4.000 3.333333     20
+## 176      F  20      2.9 4.166667 2.375 2.833333     21
+## 177      F  19      1.9 3.250000 3.875 3.000000     23
+## 178      F  19      2.0 4.083333 3.375 2.833333     20
+## 179      F  22      4.2 2.916667 1.750 3.166667     28
+## 180      M  35      4.1 3.833333 3.000 2.750000     31
+## 181      F  18      3.7 3.166667 2.625 3.416667     18
+## 182      F  19      3.6 3.416667 2.625 3.000000     30
+## 183      M  21      1.8 4.083333 3.375 2.666667     19
+
+
+
+

2.5 Modifying column names

+

Sometimes you want to rename your column. You could do this by +creating copies of the columns with new names, but you can also directly +get and set the column names of a data frame, using the function +colnames().

+

The dplyr library has a rename() +function, which can also be used. Remember the +cheatsheets.

+
+

Instructions

+
    +
  • Print out the column names of learning2014
  • +
  • Change the name of the second column to ‘age’
  • +
  • Change the name of ‘Points’ to ‘points’
  • +
  • Print out the column names again to see the changes
  • +
+

Hint: - You can use colnames() similarly to the +example. Which index matches the column ‘Points’?

+
+
+

R code

+
print(names(learning2014))
+
## [1] "gender"   "Age"      "attitude" "deep"     "stra"     "surf"     "Points"
+
colnames(learning2014)[2] <- "age"
+learning2014 <- rename(learning2014, points = Points)
+
print(dim(learning2014)) #check the dimension now (must have 166 rows and 7)
+
## [1] 183   7
+
+
+
+

2.6 Excluding observations

+

Often your data includes outliers or other observations which you +wish to remove before further analysis. Or perhaps you simply wish to +work with some subset of your data.

+

In the learning2014 data the variable ‘points’ +denotes the students’ exam points in a statistics course exam. If the +student did not attend an exam, the value of ‘points’ will be zero. We +will remove these observations from the data.

+
+

R code

+
+
+

Instructions

+
    +
  • Access the dplyr library
  • +
  • As an example, create object male_students by selecting +the male students from learning2014
  • +
  • Override learning2014 and select rows where the +‘points’ variable is greater than zero.
  • +
  • If you do not remember how logical comparison works in R, see the +‘Logical comparison’ exercise from the course ‘R Short and Sweet’.
  • +
+

Hint: - The “greater than” logical operator is >

+
dim(lrn14)
+
## [1] 183  64
+
dim(learning2014)
+
## [1] 166   7
+
#Export csv file
+setwd("~/Documents/GitHub/IODS-project")
+write_csv(learning2014, 'learning2014.csv')
+
+
+
+

2.7 Visualizations with ggplot2

+

ggplot2 is a +popular library for creating stunning graphics with R. It has some +advantages over the basic plotting system in R, mainly consistent use of +function arguments and flexible plot alteration. ggplot2 is an +implementation of Leland Wilkinson’s Grammar of Graphics — a +general scheme for data visualization.

+

In ggplot2, plots may be created via the convenience function +qplot() where arguments and defaults are meant to be +similar to base R’s plot() function. More complex plotting +capacity is available via ggplot(), which exposes the user +to more explicit elements of the grammar. (from wikipedia)

+

RStudio has a cheatsheet for +data visualization with ggplot2.

+
+

Instructions

+
    +
  • Access the ggplot2 library
  • +
  • Initialize the plot with data and aesthetic mappings
  • +
  • Adjust the plot initialization: Add an aesthetic element to the plot +by defining col = gender inside aes().
  • +
  • Define the visualization type (points)
  • +
  • Draw the plot to see how it looks at this point
  • +
  • Add a regression line to the plot
  • +
  • Add the title “Student’s attitude versus exam points” with +ggtitle("<insert title here>") to the plot with +regression line
  • +
  • Draw the plot again to see the changes
  • +
+

Hints: - Use + to add the title to the plot - The plot +with regression line is saved in the object p3 - You can +draw the plot by typing the object name where the plot is saved

+
+
+

R code

+
# Work with the exercise in this chunk, step-by-step. Fix the R code!
+# learning2014 is available
+
+# Access the ggplot2 library
+library(ggplot2)
+
+# initialize plot with data and aesthetic mapping
+p1 <- ggplot(learning2014, aes(x = attitude, y = points))
+
+# define the visualization type (points)
+p2 <- p1 + geom_point()
+
+# draw the plot
+p2
+

+
# add a regression line
+p3 <- p2 + geom_smooth(method = "lm")
+
+# draw the plot
+p3
+
## `geom_smooth()` using formula 'y ~ x'
+

+
#Lets try and overview summary
+p <- ggpairs(learning2014, mapping = aes(col = gender, alpha = 0.3), lower = list(combo = wrap("facethist", bins = 20)))
+ # draw the plot!
+p
+

+
+
+
+

2.8 Exploring a data frame

+

Often the most interesting feature of your data are the relationships +between the variables. If there are only a handful of variables saved as +columns in a data frame, it is possible to visualize all of these +relationships neatly in a single plot.

+

Base R offers a fast plotting function pairs(), which +draws all possible scatter plots from the columns of a data frame, +resulting in a scatter plot matrix. Libraries GGally +and ggplot2 together offer a slow but more detailed +look at the variables, their distributions and relationships.

+
+

Instructions

+
    +
  • Draw a scatter matrix of the variables in learning2014 (other than +gender)
  • +
  • Adjust the code: Add the argument col to the +pairs() function, defining the colour with the ‘gender’ +variable in learning2014.
  • +
  • Draw the plot again to see the changes.
  • +
  • Access the ggplot2 and GGally +libraries and create the plot p with +ggpairs().
  • +
  • Draw the plot. Note that the function is a bit slow.
  • +
  • Adjust the argument mapping of ggpairs() +by defining col = gender inside aes().
  • +
  • Draw the plot again.
  • +
  • Adjust the code a little more: add another aesthetic element +alpha = 0.3 inside aes().
  • +
  • See the difference between the plots?
  • +
+

Hints: - You can use $ to access a column of a data +frame. - Remember to separate function arguments with a comma - You can +draw the plot p by simply typing its name: just like +printing R objects.

+
+
+

R code

+
# Work with the exercise in this chunk, step-by-step. Fix the R code!
+# learning2014 is available
+
+# draw a scatter plot matrix of the variables in learning2014.
+# [-1] excludes the first column (gender)
+pairs(learning2014[-1])
+

+
# access the GGally and ggplot2 libraries
+library(GGally)
+library(ggplot2)
+
+# create a more advanced plot matrix with ggpairs()
+p <- ggpairs(learning2014, mapping = aes(), lower = list(combo = wrap("facethist", bins = 20)))
+
+
+
+

2.9 Simple regression

+

Regression +analysis with R is easy once you have your data in a neat data +frame. You can simply use the lm() function to fit a linear +model. The first argument of lm() is a +formula, which defines the target variable and the +explanatory variable(s).

+

The formula should be y ~ x, where y is the +target (or outcome) variable and x the explanatory variable +(predictor). The second argument of lm() is +data, which should be a data frame where y and +x are columns.

+

The output of lm() is a linear model object, which can +be saved for later use. The generic function summary() can +be used to print out a summary of the model.

+
+

Instructions

+
    +
  • Create a scatter plot of ‘points’ versus ‘attitude’.
  • +
  • Fit a regression model where ‘points’ is the target and ‘attitude’ +is the explanatory variable
  • +
  • Print out the summary of the linear model object
  • +
+

Hints: - Replace 1 with the name of the explanatory +variable in the formula inside lm() - Use +summary() on the model object to print out a summary

+
+
+

R code

+
# Work with the exercise in this chunk, step-by-step. Fix the R code!
+# learning2014 is available
+
+# a scatter plot of points versus attitude
+library(ggplot2)
+qplot(attitude, points, data = learning2014) + geom_smooth(method = "lm")
+
## `geom_smooth()` using formula 'y ~ x'
+

+
# fit a linear model
+my_model <- lm(points ~ 1, data = learning2014)
+
+# print out a summary of the model
+summary(my_model)
+
## 
+## Call:
+## lm(formula = points ~ 1, data = learning2014)
+## 
+## Residuals:
+##      Min       1Q   Median       3Q      Max 
+## -15.7169  -3.7169   0.2831   5.0331  10.2831 
+## 
+## Coefficients:
+##             Estimate Std. Error t value Pr(>|t|)    
+## (Intercept)  22.7169     0.4575   49.65   <2e-16 ***
+## ---
+## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
+## 
+## Residual standard error: 5.895 on 165 degrees of freedom
+
+
+
+

2.10 Multiple regression

+

When there are more than one explanatory variables in the linear +model, it is called multiple regression. In R, it is easy to include +more than one explanatory variables in your linear model. This is done +by simply defining more explanatory variables with the +formula argument of lm(), as below

+
y ~ x1 + x2 + ..
+

Here y is again the target variable and +x1, x2, .. are the explanatory variables.

+
+

Instructions

+
    +
  • Draw a plot matrix of the learning2014 data with +ggpairs()
  • +
  • Fit a regression model where points is the target +variable and both attitude and stra are the +explanatory variables.
  • +
  • Print out a summary of the model.
  • +
  • Adjust the code: Add one more explanatory variable to the model. +Based on the plot matrix, choose the variable with the third highest +(absolute) correlation with the target variable and use that as the +third variable.
  • +
  • Print out a summary of the new model.
  • +
+

Hint: - The variable with the third highest absolute correlation with +points is surf.

+
+
+

R code

+
# Work with the exercise in this chunk, step-by-step. Fix the R code!
+# learning2014 is available
+
+library(GGally)
+library(ggplot2)
+# create an plot matrix with ggpairs()
+ggpairs(learning2014, lower = list(combo = wrap("facethist", bins = 20)))
+

+
# create a regression model with multiple explanatory variables
+my_model2 <- lm(points ~ attitude + stra, data = learning2014)
+
+# print out a summary of the model
+summary(my_model2)
+
## 
+## Call:
+## lm(formula = points ~ attitude + stra, data = learning2014)
+## 
+## Residuals:
+##      Min       1Q   Median       3Q      Max 
+## -17.6436  -3.3113   0.5575   3.7928  10.9295 
+## 
+## Coefficients:
+##             Estimate Std. Error t value Pr(>|t|)    
+## (Intercept)   8.9729     2.3959   3.745  0.00025 ***
+## attitude      3.4658     0.5652   6.132 6.31e-09 ***
+## stra          0.9137     0.5345   1.709  0.08927 .  
+## ---
+## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
+## 
+## Residual standard error: 5.289 on 163 degrees of freedom
+## Multiple R-squared:  0.2048, Adjusted R-squared:  0.1951 
+## F-statistic: 20.99 on 2 and 163 DF,  p-value: 7.734e-09
+
+
+
+

2.11 Graphical model validation

+

R makes it easy to graphically explore the validity of your model +assumptions. If you give a linear model object as the first argument to +the plot() function, the function automatically assumes you +want diagnostic plots and will produce them. You can check the help page +of plotting an lm object by typing ?plot.lm or +help(plot.lm) to the R console.

+

In the plot function you can then use the argument which +to choose which plots you want. which must be an integer +vector corresponding to the following list of plots:

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
whichgraphic
1Residuals vs Fitted values
2Normal QQ-plot
3Standardized residuals vs Fitted values
4Cook’s distances
5Residuals vs Leverage
6Cook’s distance vs Leverage
+


We will focus on plots 1, 2 and 5: Residuals vs Fitted values, +Normal QQ-plot and Residuals vs Leverage.

+
+

Instructions

+
    +
  • Create the linear model object my_model2
  • +
  • Produce the following diagnostic plots using the plot() +function: Residuals vs Fitted values, Normal QQ-plot and Residuals vs +Leverage using the argument which.
  • +
  • Before the call to the plot() function, add the +following: par(mfrow = c(2,2)). This will place the +following 4 graphics to the same plot. Execute the code again to see the +effect.
  • +
+

Hint: - You can combine integers to an integer vector with +c(). For example: c(1,2,3).

+
+
+

R code

+
# Work with the exercise in this chunk, step-by-step. Fix the R code!
+# learning2014 is available
+
+# create a regression model with multiple explanatory variables
+my_model2 <- lm(points ~ attitude + stra, data = learning2014)
+
+# draw diagnostic plots using the plot() function. Choose the plots 1, 2 and 5
+plot(my_model2, which = 1)
+

+
plot(my_model2, which = 2)
+

+
plot(my_model2, which = 3)
+

+
plot(my_model2, which = 4)
+

+
plot(my_model2, which = 5)
+

+
plot(my_model2, which = 6)
+

+
+
+
+

2.12 Making predictions

+

Okay, so let’s assume that we have a linear model which seems to fit +our standards. What can we do with it?

+

The model quantifies the relationship between the explanatory +variable(s) and the dependent variable. The model can also be used for +predicting the dependent variable based on new observations of the +explanatory variable(s).

+

In R, predicting can be done using the predict() +function. (see ?predict). The first argument of predict is +a model object and the argument newdata (a data.frame) can +be used to make predictions based on new observations. One or more +columns of newdata should have the same name as the +explanatory variables in the model object.

+
+

Instructions

+
    +
  • Create object m and print out a summary of the +model
  • +
  • Create object new_attitudes
  • +
  • Adjust the code: Create a new data frame with a column named +‘attitude’ holding the new attitudes defined in +new_attitudes
  • +
  • Print out the new data frame
  • +
  • predict() the new student’s exam points based on their +attitudes, using the newdata argument
  • +
+

Hints: - Type attitude = new_attitudes inside the +data.frame() function. - Give the new_data +data.frame as the newdata argument for +predict()

+
+
+

R code

+
# Work with the exercise in this chunk, step-by-step. Fix the R code!
+# learning2014 is available
+
+# Create model object m
+m <- lm(points ~ attitude, data = learning2014)
+
+# print out a summary of the model
+summary(m)
+
## 
+## Call:
+## lm(formula = points ~ attitude, data = learning2014)
+## 
+## Residuals:
+##      Min       1Q   Median       3Q      Max 
+## -16.9763  -3.2119   0.4339   4.1534  10.6645 
+## 
+## Coefficients:
+##             Estimate Std. Error t value Pr(>|t|)    
+## (Intercept)  11.6372     1.8303   6.358 1.95e-09 ***
+## attitude      3.5255     0.5674   6.214 4.12e-09 ***
+## ---
+## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
+## 
+## Residual standard error: 5.32 on 164 degrees of freedom
+## Multiple R-squared:  0.1906, Adjusted R-squared:  0.1856 
+## F-statistic: 38.61 on 1 and 164 DF,  p-value: 4.119e-09
+
# New observations
+new_attitudes <- c("Mia" = 3.8, "Mike"= 4.4, "Riikka" = 2.2, "Pekka" = 2.9)
+new_data <- data.frame(attitude = new_attitudes)
+
+# Print out the new data
+summary(new_data)
+
##     attitude    
+##  Min.   :2.200  
+##  1st Qu.:2.725  
+##  Median :3.350  
+##  Mean   :3.325  
+##  3rd Qu.:3.950  
+##  Max.   :4.400
+
# Predict the new students exam points based on attitude
+predict(m, newdata = new_data)
+
##      Mia     Mike   Riikka    Pekka 
+## 25.03390 27.14918 19.39317 21.86099
+

Awesome work!

+
+
+
+ + + + +
+ + + + + + + + + + + + + + + diff --git a/chapter3.Rmd b/chapter3.Rmd new file mode 100644 index 000000000..3a03be17e --- /dev/null +++ b/chapter3.Rmd @@ -0,0 +1,103 @@ +# Week 3: Logistic Regression + +## 3.0 Loading packages + +```{r} +library("boot") +library("readr") +``` + +## 3.1 Data wrangling + +Data wrangling completed using the UCI Machine Learning Repository (https://archive.ics.uci.edu/ml/datasets.html). csv files downloaded from the repository and final data set "alc.csv" has been exported into the project folder. Rcodes in the learning diary has been updated. + +## 3.1 Creating new R Markdown file + +New Rmd file has been created with tittle "chapter3" and now saved in the project folder. + +## 3.2 Importing the data set and exploration + +```{r} +library(tidyverse) +stu_alc2014 <- read_csv("alc.csv" , show_col_types= FALSE) +spec(stu_alc2014) + +#looking at the data +dim(stu_alc2014) +colnames(stu_alc2014) +glimpse(stu_alc2014) +``` +## 3.3 Choosing the variables + +```{r} +stu_alc2014_2 <- select(stu_alc2014, absences, G3, age, freetime, high_use) +str(stu_alc2014_2) +colnames(stu_alc2014_2) +``` +In above data set we have 370 observations and 35 variables. These data set belong to a survey from questionnaire in two secondary school from Portugal in which different variables, demographics and socio-economic features measures students association with alcohol consumption. Here I choose 4 interesting variable that I think has greater influence in the level of alcohol consumption among these school kids. +My rationale behind choosing these variables is that certain age groups have higher access to alcohol, ages above 16 lets say can access alcohol easily than ages below 16 so I wish to see the relation here. Also free time activities and amount of free time influences alcohol consumption. Likewise final grade and absences can directly correlate with higher use. So I wish to test my model with these variables. 
+ +## 3.4 Exploring and plotting the choosen variable + +```{r} +#Let's explore the choose variables using box plots + +##Let's see for absences +g1 <- ggplot(stu_alc2014, aes(x = high_use, y = absences)) +g1 + geom_boxplot() + ylab("Absences") + +##Let's see for G3(Final Grade) +g1 <- ggplot(stu_alc2014, aes(x = high_use, y = G3)) +g1 + geom_boxplot() + ylab("Final Grade") + +##Let's see for age +g1 <- ggplot(stu_alc2014, aes(x = high_use, y = age)) +g1 + geom_boxplot() + ylab("Age") + +##And for freetime +g1 <- ggplot(stu_alc2014, aes(x = high_use, y = freetime)) +g1 + geom_boxplot() + ylab("Freetime") + +``` +General overview from the plot infers some association between high alcohol use and absence and also age (holds true). It would be interesting to fit this kind of model to see the relationship. Final grade and alcohol consumption shows some association but there appears to be some difference in their mean for true and false.Free time doesn't seem to have much effect on alcohol consumption. + + +## 3.4 Logistic regression + +```{r} +##Lets call this model-1 (m1) which explores 4 variable +m1 <- glm(high_use ~ absences + G3 + age + freetime, data = stu_alc2014_2, family = "binomial") +summary(m1) +``` +This seems to be an interesting outcome. Fitted model seem to match my above hypothesis based on box plot and distribution for some variable, absences for instant is the most significant and free time is second most significant among other variable. Absence has the highest significance (p = 0.000249) and free time is significant with p = 0.001511. Final grade and age however doesn't have the same lever of significance as other two. Final grade has p = 0.05 which can be considered significant result but comparatively, two other variable stands out the most. 
+ +## 3.5 Power of the model + +```{r} +coef(m1) +OR <- coef(m1) %>% exp +CI <- confint(m1) %>% exp + +#Print OR and CI +cbind(OR, CI) +``` +Coefficient and Confidence Interval: Looking at the coefficient, final grade has negative value suggesting an opposite association between higher alcohol consumption which makes sense because higher grade can result in lower alcohol consumption. absences, age and free time shows positive association with free time being the most cause of higher alcohol consumption followed by age and absences. This is also supported by the odds ratio and confidence interval for each tested variable. +The above hypothesis therefore holds true for most variable. It is quite obviously important to explore other variable to see effect of an additional variable on an outcome through multivariate analysis. + +## 3.6 Cross validation (Bonus) + +```{r} + + +``` + + +## 3.7 Cross validation of different model (Super-Bonus) + +```{r} + +``` + + + + diff --git a/chapter3.html b/chapter3.html new file mode 100644 index 000000000..8d7067021 --- /dev/null +++ b/chapter3.html @@ -0,0 +1,656 @@ + + + + + + + + + + + + + +chapter3.knit + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + + + + +
+

Week 3: Logistic Regression

+
+

3.0 Loading packages

+
library("boot")
+library("readr")
+
+
+

3.1 Data wrangling

+

Data wrangling completed using the UCI Machine Learning Repository +(https://archive.ics.uci.edu/ml/datasets.html). csv files +downloaded from the repository and final data set “alc.csv” has been +exported into the project folder. Rcodes in the learning diary has been +updated.

+
+
+

3.1 Creating new R Markdown file

+

New Rmd file has been created with title “chapter3” and now saved in +the project folder.

+
+
+

3.2 Importing the data set and exploration

+
library(tidyverse)
+
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
+## ✔ ggplot2 3.3.6     ✔ dplyr   1.0.9
+## ✔ tibble  3.1.8     ✔ stringr 1.4.1
+## ✔ tidyr   1.2.0     ✔ forcats 0.5.2
+## ✔ purrr   0.3.4     
+## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
+## ✖ dplyr::filter() masks stats::filter()
+## ✖ dplyr::lag()    masks stats::lag()
+
stu_alc2014 <- read_csv("alc.csv" , show_col_types= FALSE)
+spec(stu_alc2014)
+
## cols(
+##   school = col_character(),
+##   sex = col_character(),
+##   age = col_double(),
+##   address = col_character(),
+##   famsize = col_character(),
+##   Pstatus = col_character(),
+##   Medu = col_double(),
+##   Fedu = col_double(),
+##   Mjob = col_character(),
+##   Fjob = col_character(),
+##   reason = col_character(),
+##   guardian = col_character(),
+##   traveltime = col_double(),
+##   studytime = col_double(),
+##   schoolsup = col_character(),
+##   famsup = col_character(),
+##   activities = col_character(),
+##   nursery = col_character(),
+##   higher = col_character(),
+##   internet = col_character(),
+##   romantic = col_character(),
+##   famrel = col_double(),
+##   freetime = col_double(),
+##   goout = col_double(),
+##   Dalc = col_double(),
+##   Walc = col_double(),
+##   health = col_double(),
+##   failures = col_double(),
+##   paid = col_character(),
+##   absences = col_double(),
+##   G1 = col_double(),
+##   G2 = col_double(),
+##   G3 = col_double(),
+##   alc_use = col_double(),
+##   high_use = col_logical()
+## )
+
#looking at the data
+dim(stu_alc2014)
+
## [1] 370  35
+
colnames(stu_alc2014)
+
##  [1] "school"     "sex"        "age"        "address"    "famsize"   
+##  [6] "Pstatus"    "Medu"       "Fedu"       "Mjob"       "Fjob"      
+## [11] "reason"     "guardian"   "traveltime" "studytime"  "schoolsup" 
+## [16] "famsup"     "activities" "nursery"    "higher"     "internet"  
+## [21] "romantic"   "famrel"     "freetime"   "goout"      "Dalc"      
+## [26] "Walc"       "health"     "failures"   "paid"       "absences"  
+## [31] "G1"         "G2"         "G3"         "alc_use"    "high_use"
+
glimpse(stu_alc2014)
+
## Rows: 370
+## Columns: 35
+## $ school     <chr> "GP", "GP", "GP", "GP", "GP", "GP", "GP", "GP", "GP", "GP",…
+## $ sex        <chr> "F", "F", "F", "F", "F", "M", "M", "F", "M", "M", "F", "F",…
+## $ age        <dbl> 18, 17, 15, 15, 16, 16, 16, 17, 15, 15, 15, 15, 15, 15, 15,…
+## $ address    <chr> "U", "U", "U", "U", "U", "U", "U", "U", "U", "U", "U", "U",…
+## $ famsize    <chr> "GT3", "GT3", "LE3", "GT3", "GT3", "LE3", "LE3", "GT3", "LE…
+## $ Pstatus    <chr> "A", "T", "T", "T", "T", "T", "T", "A", "A", "T", "T", "T",…
+## $ Medu       <dbl> 4, 1, 1, 4, 3, 4, 2, 4, 3, 3, 4, 2, 4, 4, 2, 4, 4, 3, 3, 4,…
+## $ Fedu       <dbl> 4, 1, 1, 2, 3, 3, 2, 4, 2, 4, 4, 1, 4, 3, 2, 4, 4, 3, 2, 3,…
+## $ Mjob       <chr> "at_home", "at_home", "at_home", "health", "other", "servic…
+## $ Fjob       <chr> "teacher", "other", "other", "services", "other", "other", …
+## $ reason     <chr> "course", "course", "other", "home", "home", "reputation", …
+## $ guardian   <chr> "mother", "father", "mother", "mother", "father", "mother",…
+## $ traveltime <dbl> 2, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 3, 1, 2, 1, 1, 1, 3, 1, 1,…
+## $ studytime  <dbl> 2, 2, 2, 3, 2, 2, 2, 2, 2, 2, 2, 3, 1, 2, 3, 1, 3, 2, 1, 1,…
+## $ schoolsup  <chr> "yes", "no", "yes", "no", "no", "no", "no", "yes", "no", "n…
+## $ famsup     <chr> "no", "yes", "no", "yes", "yes", "yes", "no", "yes", "yes",…
+## $ activities <chr> "no", "no", "no", "yes", "no", "yes", "no", "no", "no", "ye…
+## $ nursery    <chr> "yes", "no", "yes", "yes", "yes", "yes", "yes", "yes", "yes…
+## $ higher     <chr> "yes", "yes", "yes", "yes", "yes", "yes", "yes", "yes", "ye…
+## $ internet   <chr> "no", "yes", "yes", "yes", "no", "yes", "yes", "no", "yes",…
+## $ romantic   <chr> "no", "no", "no", "yes", "no", "no", "no", "no", "no", "no"…
+## $ famrel     <dbl> 4, 5, 4, 3, 4, 5, 4, 4, 4, 5, 3, 5, 4, 5, 4, 4, 3, 5, 5, 3,…
+## $ freetime   <dbl> 3, 3, 3, 2, 3, 4, 4, 1, 2, 5, 3, 2, 3, 4, 5, 4, 2, 3, 5, 1,…
+## $ goout      <dbl> 4, 3, 2, 2, 2, 2, 4, 4, 2, 1, 3, 2, 3, 3, 2, 4, 3, 2, 5, 3,…
+## $ Dalc       <dbl> 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1,…
+## $ Walc       <dbl> 1, 1, 3, 1, 2, 2, 1, 1, 1, 1, 2, 1, 3, 2, 1, 2, 2, 1, 4, 3,…
+## $ health     <dbl> 3, 3, 3, 5, 5, 5, 3, 1, 1, 5, 2, 4, 5, 3, 3, 2, 2, 4, 5, 5,…
+## $ failures   <dbl> 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0,…
+## $ paid       <chr> "no", "no", "yes", "yes", "yes", "yes", "no", "no", "yes", …
+## $ absences   <dbl> 5, 3, 8, 1, 2, 8, 0, 4, 0, 0, 1, 2, 1, 1, 0, 5, 8, 3, 9, 5,…
+## $ G1         <dbl> 2, 7, 10, 14, 8, 14, 12, 8, 16, 13, 12, 10, 13, 11, 14, 16,…
+## $ G2         <dbl> 8, 8, 10, 14, 12, 14, 12, 9, 17, 14, 11, 12, 14, 11, 15, 16…
+## $ G3         <dbl> 8, 8, 11, 14, 12, 14, 12, 10, 18, 14, 12, 12, 13, 12, 16, 1…
+## $ alc_use    <dbl> 1.0, 1.0, 2.5, 1.0, 1.5, 1.5, 1.0, 1.0, 1.0, 1.0, 1.5, 1.0,…
+## $ high_use   <lgl> FALSE, FALSE, TRUE, FALSE, FALSE, FALSE, FALSE, FALSE, FALS…
+
+
+

3.3 Choosing the variables

+
stu_alc2014_2 <- select(stu_alc2014, absences, G3, age, freetime, high_use)
+str(stu_alc2014_2)
+
## tibble [370 × 5] (S3: tbl_df/tbl/data.frame)
+##  $ absences: num [1:370] 5 3 8 1 2 8 0 4 0 0 ...
+##  $ G3      : num [1:370] 8 8 11 14 12 14 12 10 18 14 ...
+##  $ age     : num [1:370] 18 17 15 15 16 16 16 17 15 15 ...
+##  $ freetime: num [1:370] 3 3 3 2 3 4 4 1 2 5 ...
+##  $ high_use: logi [1:370] FALSE FALSE TRUE FALSE FALSE FALSE ...
+
colnames(stu_alc2014_2)
+
## [1] "absences" "G3"       "age"      "freetime" "high_use"
+

In the above data set we have 370 observations and 35 variables. This data set comes from a questionnaire survey in two secondary schools in Portugal, in which different variables — demographic and socio-economic features — measure students’ association with alcohol consumption. Here I chose 4 interesting variables that I think have the greatest influence on the level of alcohol consumption among these school kids. My rationale behind choosing these variables is that certain age groups have easier access to alcohol — ages above 16, let’s say, can access alcohol more easily than ages below 16 — so I wish to see the relation here. Also free-time activities and the amount of free time influence alcohol consumption. Likewise final grade and absences can directly correlate with higher use. So I wish to test my model with these variables.

+
+
+

3.4 Exploring and plotting the choosen variable

+
#Let's explore the chosen variables using box plots
+
+##Let's see for absences
+g1 <- ggplot(stu_alc2014, aes(x = high_use, y = absences))
+g1 + geom_boxplot() + ylab("Absences")
+

+
##Let's see for G3(Final Grade)
+g1 <- ggplot(stu_alc2014, aes(x = high_use, y = G3))
+g1 + geom_boxplot() + ylab("Final Grade")
+

+
##Let's see for age
+g1 <- ggplot(stu_alc2014, aes(x = high_use, y = age))
+g1 + geom_boxplot() + ylab("Age")
+

+
##And for freetime
+g1 <- ggplot(stu_alc2014, aes(x = high_use, y = freetime))
+g1 + geom_boxplot() + ylab("Freetime")
+

+General overview from the plot suggests some association between high alcohol use and absences, and also age (holds true). It would be interesting to fit this kind of model to see the relationship. Final grade and alcohol consumption show some association; there appears to be some difference in their means for TRUE and FALSE. Free time doesn’t seem to have much effect on alcohol consumption.

+
+
+

3.4 Logistic regression

+
##Lets call this model-1 (m1) which explores 4 variable
+m1 <- glm(high_use ~ absences + G3 + age + freetime, data = stu_alc2014_2, family = "binomial")
+summary(m1)
+
## 
+## Call:
+## glm(formula = high_use ~ absences + G3 + age + freetime, family = "binomial", 
+##     data = stu_alc2014_2)
+## 
+## Deviance Residuals: 
+##     Min       1Q   Median       3Q      Max  
+## -2.0110  -0.8181  -0.6584   1.1345   2.0343  
+## 
+## Coefficients:
+##             Estimate Std. Error z value Pr(>|z|)    
+## (Intercept) -3.80128    1.87360  -2.029 0.042472 *  
+## absences     0.08428    0.02301   3.663 0.000249 ***
+## G3          -0.06623    0.03666  -1.807 0.070838 .  
+## age          0.11958    0.10233   1.169 0.242592    
+## freetime     0.39844    0.12559   3.172 0.001511 ** 
+## ---
+## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
+## 
+## (Dispersion parameter for binomial family taken to be 1)
+## 
+##     Null deviance: 452.04  on 369  degrees of freedom
+## Residual deviance: 417.94  on 365  degrees of freedom
+## AIC: 427.94
+## 
+## Number of Fisher Scoring iterations: 4
+

This seems to be an interesting outcome. The fitted model seems to match my above hypothesis based on the box plots and distributions for some variables; absences, for instance, is the most significant and free time is the second most significant among the variables. Absences has the highest significance (p = 0.000249) and free time is significant with p = 0.001511. Final grade and age, however, don’t have the same level of significance as the other two. Final grade has p ≈ 0.07, which is only marginally significant, so comparatively the two other variables stand out the most.

+
+
+

3.5 Power of the model

+
coef(m1)
+
## (Intercept)    absences          G3         age    freetime 
+## -3.80128143  0.08428256 -0.06622629  0.11957546  0.39844165
+
OR <- coef(m1) %>% exp
+CI <- confint(m1) %>% exp
+
## Waiting for profiling to be done...
+
#Print OR and CI 
+cbind(OR, CI)
+
##                     OR        2.5 %    97.5 %
+## (Intercept) 0.02234212 0.0005401794 0.8536065
+## absences    1.08793626 1.0421786444 1.1408573
+## G3          0.93591905 0.8705458921 1.0055377
+## age         1.12701829 0.9228580718 1.3797569
+## freetime    1.48950173 1.1689712226 1.9147718
+

Coefficient and Confidence Interval: Looking at the coefficient, +final grade has negative value suggesting an opposite association +between higher alcohol consumption which makes sense because higher +grade can result in lower alcohol consumption. absences, age and free +time shows positive association with free time being the most cause of +higher alcohol consumption followed by age and absences. This is also +supported by the odds ratio and confidence interval for each tested +variable.
+The above hypothesis therefore holds true for most variables. It is quite obviously important to explore other variables to see the effect of an additional variable on an outcome through multivariate analysis.

+
+
+

3.6 Cross validation (Bonus)

+
+
+

3.7 Cross validation of different model (Super-Bonus)

+
+
+ + + + +
+ + + + + + + + + + + + + + + diff --git a/chapter4.Rmd b/chapter4.Rmd new file mode 100644 index 000000000..230bc746b --- /dev/null +++ b/chapter4.Rmd @@ -0,0 +1,318 @@ +# Week 4: Clustering and Classification + +The topics of this chapter - clustering and classification - are handy and visual tools of exploring statistical data. Clustering means that some points (or observations) of the data are in some sense closer to each other than some other points. In other words, the data points do not comprise a homogeneous sample, but instead, it is somehow clustered. + +In general, the clustering methods try to find these clusters (or groups) from the data. One of the most typical clustering methods is called k-means clustering. Also hierarchical clustering methods quite popular, giving tree-like dendrograms as their main output. + +As such, clusters are easy to find, but what might be the "right" number of clusters? It is not always clear. And how to give these clusters names and interpretations? + +Based on a successful clustering, we may try to classify new observations to these clusters and hence validate the results of clustering. Another way is to use various forms of discriminant analysis, which operates with the (now) known clusters, asking: "what makes the difference(s) between these groups (clusters)?" + +In the connection of these methods, we also discuss the topic of distance (or dissimilarity or similarity) measures. There are lots of other measures than just the ordinary Euclidean distance, although it is one of the most important ones. Several discrete and even binary measures exist and are widely used for different purposes in various disciplines. 
+ +## 4.0 Packages for clustering and classification + +```{r} +library(tidyverse) +library(GGally) +library(dplyr) +library(ggplot2) +library(MASS) +library(corrplot) +``` + +## 4.1 loading data and Exploring the data + +```{r} +#Loading the Boston data from the MASS package +data("Boston") + +# Exploring the structure and the dimensions of the data set + +str(Boston) +dim(Boston) +``` +The data set "Boston" from MASS library presents geographical, demographic, structural, economic and cultural description of different suburbs in Boston and their implication on housing values within different suburbs. Data set contains 14 variables (factors) and 506 observation and most variables are numerical. + +The variables in the data set which influences the housing prices are: + +crim = per capita crime rate by town. +zn = proportion of residential land zoned for lots over 25,000 sq.ft. +indus = proportion of non-retail business acres per town. +chas = Charles River dummy variable (= 1 if tract bounds river; 0 otherwise). +nox = nitrogen oxides concentration (parts per 10 million). +rm = average number of rooms per dwelling. +age = proportion of owner-occupied units built prior to 1940. +dis = weighted mean of distances to five Boston employment centres. +rad = index of accessibility to radial highways. +tax = full-value property-tax rate per $10,000. +ptratio = pupil-teacher ratio by town. +black = 1000(Bk - 0.63)^2 where BkBk is the proportion of blacks by town. +lstat = lower status of the population (percent). +medv = median value of owner-occupied homes in $1000s. + +Data is sourced from +Harrison, D. and Rubinfeld, D.L. (1978) Hedonic prices and the demand for clean air. J. Environ. Economics and Management 5, 81–102. +Belsley D.A., Kuh, E. and Welsch, R.E. (1980) Regression Diagnostics. Identifying Influential Data and Sources of Collinearity. New York: Wiley. 
+ +For more information about the "Boston" data set follow the link https://stat.ethz.ch/R-manual/R-devel/library/MASS/html/Boston.html + +## 4.2 Graphical Overview of the data set + +```{r} +#overview +summary(Boston) + +#Plotting +long_Boston <- pivot_longer(Boston, cols=everything(), names_to = 'variable', values_to = 'value') +p1 <- ggplot(data=long_Boston) +p1 + geom_histogram(mapping= aes(x=value)) + facet_wrap(~variable, scales="free") +``` + + + +```{r} +#Relationship between the variables +cor_matrix <- cor(Boston) +cor_matrix +corrplot(cor_matrix, method="circle") + +#Let's try a mixed plot with correlation values (closer the values to 1, stronger the correlation) +corrplot.mixed(cor_matrix, lower = 'number', upper = 'circle',tl.pos ="lt", tl.col='black', tl.cex=0.6, number.cex = 0.5) +``` +In the correlation matrix, Color blue represents positive and red represents the negative correlation. The values closer to +1 and -1 implies a stronger positive and negative correlation respectively. The threshold is represented by different shades of blue and black and decimal points which is indexed on the right side of the plot. + +Based on the plot we can see: + +Positive correlation between some variables for example: + +- Proportional of non-retail business acres per town (indus) and nitrogen oxides concentration in ppm (nox) +- Index of accessibility to radial highways (rad) and full-value property-tax rate per $10,000 (tax) + +Negative correlation between some variables for example: + +- Proportional of non-retail business acres per town (indus) and weighted mean of distances to five Boston employment centers (dis) +- Nitrogen oxides concentration in ppm (nox) and weighted mean of distances to five Boston employment centers (dis) + +## 4.3 Standardizing the dataset + +Lets beging the scaling exercise to standardize the data set. 
Since the Boston data contains only numerical values, so we can use the function `scale()` to standardize the whole dataset as mentioned in the exercise 4 instruction. + +We also have the following equation from exercise which basically gives us the idea on scaling i.e., subtract the column means from corresponding means and dividing with the standard deviation. + +$$scaled(x) = \frac{x - mean(x)}{ sd(x)}$$ + +```{r} +#Saving the scaled data to the object +boston_scaled <- scale(Boston) + +#Lets see +summary(boston_scaled) + +# class of the boston_scaled object +class(boston_scaled) + +# change the object to data frame +boston_scaled <- as.data.frame(boston_scaled) +``` +Scaling the data set with various scales makes the analyses easier. Scale() transforms the data into a matrix and array so for further analysis we can change it back to the data frame + +Let's scale further by creating a quantile vector of crim and print it + +```{r} +bins <- quantile(boston_scaled$crim) +bins + +# create a categorical variable 'crime' +crime <- cut(boston_scaled$crim, breaks = bins, include.lowest = TRUE) + +# look at the table of the new factor crime +table(crime) + +# remove original crim from the dataset +boston_scaled <- dplyr::select(boston_scaled, -crim) + +# add the new categorical value to scaled data +boston_scaled <- data.frame(boston_scaled, crime) + +# let's see the new data set now !! 
+summary(boston_scaled) +``` + +Now as mention in the exercise set "Divide and conquer", lets divide train and test sets: training -> 80% and testing -> 20% + +```{r} + +# number of rows in the Boston data set +n <- nrow(boston_scaled) +n + +# choose randomly 80% of the rows +ind <- sample(n, size = n * 0.8) + +# create train set +train <- boston_scaled[ind,] + +# create test set +test <- boston_scaled[-ind,] + +# save the correct classes from test data +correct_classes <- test$crime + +# remove the crime variable from test data +test <- dplyr::select(test, -crime) + +``` + +## 4.4 Fitting the linear discriminant analysis + +Let's move on fitting LDA on the training set. + +Our target variable: crime(categorical) + +```{r} + +# linear discriminant analysis +# Other variables are designated (.) +lda.fit <- lda(crime ~ ., data = train) + +# print the lda.fit object +lda.fit +``` + +There are three discriminant function LD(proportion of trace) as follow: + + LD1 LD2 LD3 +0.9489 0.0362 0.0150 + +```{r} +# the function for lda biplot arrows +lda.arrows <- function(x, myscale = 1, arrow_heads = 0.1, color = "red", tex = 0.75, choices = c(1,2)){ + heads <- coef(x) + arrows(x0 = 0, y0 = 0, + x1 = myscale * heads[,choices[1]], + y1 = myscale * heads[,choices[2]], col=color, length = arrow_heads) + text(myscale * heads[,choices], labels = row.names(heads), + cex = tex, col=color, pos=3) +} + +# target classes as numeric +classes <- as.numeric(train$crime) + +# plot the lda results +plot(lda.fit, dimen = 2, col = classes, pch = classes) +lda.arrows(lda.fit, myscale = 1) +``` + +## 4.5 Predicting the LDA model + +```{r} +library(MASS) + +ind <- sample(nrow(boston_scaled), size = nrow(boston_scaled) * 0.8) + +train <- boston_scaled[ind,] + +test <- boston_scaled[-ind,] + +correct_classes <- test$crime + +test <- dplyr::select(test, -crime) + +lda.fit = lda(crime ~ ., data=train) + +# predict classes with test data +lda.pred <- predict(lda.fit, newdata = test) + +# cross 
tabulate the results +table(correct = correct_classes, predicted = lda.pred$class) + +``` + +## 4.6 Model Optimization and K means clustering + +Let's work with clustering now +Let's calculate also the distances between observations to assess the similarity between data points. As instructed we calculated two distance matrix euclidean and manhattan. + +```{r} +#start by reloading the Boston data set +data("Boston") + +boston_scaled <- scale(Boston) +summary(boston_scaled) + +#class of Boston_scaled object +class(boston_scaled) + +# change the object to data frame for futher analysis +boston_scaled <- as.data.frame(boston_scaled) + +``` +```{r} +# Calculating distances between the observation +# euclidean distance matrix +dist_eu <- dist(boston_scaled, method = "euclidean") + +# look at the summary of the distances +summary(dist_eu) + +# Manhattan distance matrix +dist_man <- dist(boston_scaled, method = "manhattan") + +# look at the summary of the distances +summary(dist_man) +``` + +Lets do K-means clustering (first with 4 clusters and then 3) + +```{r} + +set.seed(123) #function set.seed() is used here to deal with the random assigning of the initial cluster centers when conducting k-means clustering + +# k-means clustering +km <- kmeans(boston_scaled, centers = 4) + +# plotting the scaled Boston data set with clusters +pairs(boston_scaled, col = km$cluster) +``` +With 4 clusters, it appears that there is some overlapping between clusters. lets try 3 clusters and check +```{r} +# k-means clustering +km <- kmeans(boston_scaled, centers = 3) + +# plotting the scaled Boston data set with clusters +pairs(boston_scaled, col = km$cluster) +``` +with 3 clusters it is even worse !! 
+We can try to determine the optimal number of cluster for our model and see how it works + +## 4.7 K-means: determine the k + +```{r} + +#Investigating the optimal number of clusters +set.seed(123) #setseed function again + +# determine the number of clusters +k_max <- 10 + +# calculate the total within sum of squares +twcss <- sapply(1:k_max, function(k){kmeans(boston_scaled, k)$tot.withinss}) + +# visualize the results +qplot(x = 1:k_max, y = twcss, geom = 'line', ylab = "TWCSS", xlab = "Number of cluster") +``` + +From the plot it looks like the optimal number of clusters for our model is 2-3 as there is a sharp drop in TWCSS values. We can try with 2 because 2 seems more optimal as by 2.5 the drop is significant and we can see the substantial change. + +```{r} +# k-means clustering +km <- kmeans(boston_scaled, centers = 2) + +# plot the Boston data set with clusters predicted +pairs(boston_scaled, col = km$cluster) +``` +```{r} +pairs(boston_scaled[6:10], col = km$cluster) +``` +With 2 clusters, the model seem better, there is good separation for example rad and tax. rm and age seems ok as well. diff --git a/chapter4.html b/chapter4.html new file mode 100644 index 000000000..0a2b89433 --- /dev/null +++ b/chapter4.html @@ -0,0 +1,958 @@ + + + + + + + + + + + + + +chapter4.knit + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + + + + +
+

Week 4: Clustering and Classification

+

The topics of this chapter - clustering and classification - are +handy and visual tools of exploring statistical data. Clustering means +that some points (or observations) of the data are in some sense closer +to each other than some other points. In other words, the data points do +not comprise a homogeneous sample, but instead, it is somehow +clustered.

+

In general, the clustering methods try to find these clusters (or groups) from the data. One of the most typical clustering methods is called k-means clustering. Also hierarchical clustering methods are quite popular, giving tree-like dendrograms as their main output.

+

As such, clusters are easy to find, but what might be the “right” +number of clusters? It is not always clear. And how to give these +clusters names and interpretations?

+

Based on a successful clustering, we may try to classify new +observations to these clusters and hence validate the results of +clustering. Another way is to use various forms of discriminant +analysis, which operates with the (now) known clusters, asking: “what +makes the difference(s) between these groups (clusters)?”

+

In the connection of these methods, we also discuss the topic of +distance (or dissimilarity or similarity) measures. There are lots of +other measures than just the ordinary Euclidean distance, although it is +one of the most important ones. Several discrete and even binary +measures exist and are widely used for different purposes in various +disciplines.

+
+

4.0 Packages for clustering and classification

+
library(tidyverse)
+
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
+## ✔ ggplot2 3.3.6     ✔ purrr   0.3.4
+## ✔ tibble  3.1.8     ✔ dplyr   1.0.9
+## ✔ tidyr   1.2.0     ✔ stringr 1.4.1
+## ✔ readr   2.1.2     ✔ forcats 0.5.2
+## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
+## ✖ dplyr::filter() masks stats::filter()
+## ✖ dplyr::lag()    masks stats::lag()
+
library(GGally)
+
## Registered S3 method overwritten by 'GGally':
+##   method from   
+##   +.gg   ggplot2
+
library(dplyr)
+library(ggplot2)
+library(MASS)
+
## 
+## Attaching package: 'MASS'
+## 
+## The following object is masked from 'package:dplyr':
+## 
+##     select
+
library(corrplot)
+
## corrplot 0.92 loaded
+
+
+

4.1 loading data and Exploring the data

+
#Loading the Boston data from the MASS package
+data("Boston")
+
+# Exploring the structure and the dimensions of the data set
+
+str(Boston)
+
## 'data.frame':    506 obs. of  14 variables:
+##  $ crim   : num  0.00632 0.02731 0.02729 0.03237 0.06905 ...
+##  $ zn     : num  18 0 0 0 0 0 12.5 12.5 12.5 12.5 ...
+##  $ indus  : num  2.31 7.07 7.07 2.18 2.18 2.18 7.87 7.87 7.87 7.87 ...
+##  $ chas   : int  0 0 0 0 0 0 0 0 0 0 ...
+##  $ nox    : num  0.538 0.469 0.469 0.458 0.458 0.458 0.524 0.524 0.524 0.524 ...
+##  $ rm     : num  6.58 6.42 7.18 7 7.15 ...
+##  $ age    : num  65.2 78.9 61.1 45.8 54.2 58.7 66.6 96.1 100 85.9 ...
+##  $ dis    : num  4.09 4.97 4.97 6.06 6.06 ...
+##  $ rad    : int  1 2 2 3 3 3 5 5 5 5 ...
+##  $ tax    : num  296 242 242 222 222 222 311 311 311 311 ...
+##  $ ptratio: num  15.3 17.8 17.8 18.7 18.7 18.7 15.2 15.2 15.2 15.2 ...
+##  $ black  : num  397 397 393 395 397 ...
+##  $ lstat  : num  4.98 9.14 4.03 2.94 5.33 ...
+##  $ medv   : num  24 21.6 34.7 33.4 36.2 28.7 22.9 27.1 16.5 18.9 ...
+
dim(Boston)
+
## [1] 506  14
+

The data set “Boston” from MASS library presents geographical, +demographic, structural, economic and cultural description of different +suburbs in Boston and their implication on housing values within +different suburbs. Data set contains 14 variables (factors) and 506 +observation and most variables are numerical.

+

The variables in the data set which influences the housing prices +are:

+

crim = per capita crime rate by town. zn = proportion of residential +land zoned for lots over 25,000 sq.ft. indus = proportion of non-retail +business acres per town. chas = Charles River dummy variable (= 1 if +tract bounds river; 0 otherwise). nox = nitrogen oxides concentration +(parts per 10 million). rm = average number of rooms per dwelling. age = +proportion of owner-occupied units built prior to 1940. dis = weighted +mean of distances to five Boston employment centres. rad = index of +accessibility to radial highways. tax = full-value property-tax rate per +$10,000. ptratio = pupil-teacher ratio by town. black = 1000(Bk - +0.63)^2 where BkBk is the proportion of blacks by town. lstat = lower +status of the population (percent). medv = median value of +owner-occupied homes in $1000s.

+

Data is sourced from Harrison, D. and Rubinfeld, D.L. (1978) Hedonic +prices and the demand for clean air. J. Environ. Economics and +Management 5, 81–102. Belsley D.A., Kuh, E. and Welsch, R.E. (1980) +Regression Diagnostics. Identifying Influential Data and Sources of +Collinearity. New York: Wiley.

+

For more information about the “Boston” data set follow the link https://stat.ethz.ch/R-manual/R-devel/library/MASS/html/Boston.html

+
+
+

4.2 Graphical Overview of the data set

+
#overview
+summary(Boston)
+
##       crim                zn             indus            chas        
+##  Min.   : 0.00632   Min.   :  0.00   Min.   : 0.46   Min.   :0.00000  
+##  1st Qu.: 0.08205   1st Qu.:  0.00   1st Qu.: 5.19   1st Qu.:0.00000  
+##  Median : 0.25651   Median :  0.00   Median : 9.69   Median :0.00000  
+##  Mean   : 3.61352   Mean   : 11.36   Mean   :11.14   Mean   :0.06917  
+##  3rd Qu.: 3.67708   3rd Qu.: 12.50   3rd Qu.:18.10   3rd Qu.:0.00000  
+##  Max.   :88.97620   Max.   :100.00   Max.   :27.74   Max.   :1.00000  
+##       nox               rm             age              dis        
+##  Min.   :0.3850   Min.   :3.561   Min.   :  2.90   Min.   : 1.130  
+##  1st Qu.:0.4490   1st Qu.:5.886   1st Qu.: 45.02   1st Qu.: 2.100  
+##  Median :0.5380   Median :6.208   Median : 77.50   Median : 3.207  
+##  Mean   :0.5547   Mean   :6.285   Mean   : 68.57   Mean   : 3.795  
+##  3rd Qu.:0.6240   3rd Qu.:6.623   3rd Qu.: 94.08   3rd Qu.: 5.188  
+##  Max.   :0.8710   Max.   :8.780   Max.   :100.00   Max.   :12.127  
+##       rad              tax           ptratio          black       
+##  Min.   : 1.000   Min.   :187.0   Min.   :12.60   Min.   :  0.32  
+##  1st Qu.: 4.000   1st Qu.:279.0   1st Qu.:17.40   1st Qu.:375.38  
+##  Median : 5.000   Median :330.0   Median :19.05   Median :391.44  
+##  Mean   : 9.549   Mean   :408.2   Mean   :18.46   Mean   :356.67  
+##  3rd Qu.:24.000   3rd Qu.:666.0   3rd Qu.:20.20   3rd Qu.:396.23  
+##  Max.   :24.000   Max.   :711.0   Max.   :22.00   Max.   :396.90  
+##      lstat            medv      
+##  Min.   : 1.73   Min.   : 5.00  
+##  1st Qu.: 6.95   1st Qu.:17.02  
+##  Median :11.36   Median :21.20  
+##  Mean   :12.65   Mean   :22.53  
+##  3rd Qu.:16.95   3rd Qu.:25.00  
+##  Max.   :37.97   Max.   :50.00
+
#Plotting 
+long_Boston <- pivot_longer(Boston, cols=everything(), names_to = 'variable', values_to = 'value')
+p1 <- ggplot(data=long_Boston)
+p1 + geom_histogram(mapping= aes(x=value)) + facet_wrap(~variable, scales="free")
+
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
+

+
#Relationship between the variables
+cor_matrix <- cor(Boston)
+cor_matrix
+
##                crim          zn       indus         chas         nox
+## crim     1.00000000 -0.20046922  0.40658341 -0.055891582  0.42097171
+## zn      -0.20046922  1.00000000 -0.53382819 -0.042696719 -0.51660371
+## indus    0.40658341 -0.53382819  1.00000000  0.062938027  0.76365145
+## chas    -0.05589158 -0.04269672  0.06293803  1.000000000  0.09120281
+## nox      0.42097171 -0.51660371  0.76365145  0.091202807  1.00000000
+## rm      -0.21924670  0.31199059 -0.39167585  0.091251225 -0.30218819
+## age      0.35273425 -0.56953734  0.64477851  0.086517774  0.73147010
+## dis     -0.37967009  0.66440822 -0.70802699 -0.099175780 -0.76923011
+## rad      0.62550515 -0.31194783  0.59512927 -0.007368241  0.61144056
+## tax      0.58276431 -0.31456332  0.72076018 -0.035586518  0.66802320
+## ptratio  0.28994558 -0.39167855  0.38324756 -0.121515174  0.18893268
+## black   -0.38506394  0.17552032 -0.35697654  0.048788485 -0.38005064
+## lstat    0.45562148 -0.41299457  0.60379972 -0.053929298  0.59087892
+## medv    -0.38830461  0.36044534 -0.48372516  0.175260177 -0.42732077
+##                  rm         age         dis          rad         tax    ptratio
+## crim    -0.21924670  0.35273425 -0.37967009  0.625505145  0.58276431  0.2899456
+## zn       0.31199059 -0.56953734  0.66440822 -0.311947826 -0.31456332 -0.3916785
+## indus   -0.39167585  0.64477851 -0.70802699  0.595129275  0.72076018  0.3832476
+## chas     0.09125123  0.08651777 -0.09917578 -0.007368241 -0.03558652 -0.1215152
+## nox     -0.30218819  0.73147010 -0.76923011  0.611440563  0.66802320  0.1889327
+## rm       1.00000000 -0.24026493  0.20524621 -0.209846668 -0.29204783 -0.3555015
+## age     -0.24026493  1.00000000 -0.74788054  0.456022452  0.50645559  0.2615150
+## dis      0.20524621 -0.74788054  1.00000000 -0.494587930 -0.53443158 -0.2324705
+## rad     -0.20984667  0.45602245 -0.49458793  1.000000000  0.91022819  0.4647412
+## tax     -0.29204783  0.50645559 -0.53443158  0.910228189  1.00000000  0.4608530
+## ptratio -0.35550149  0.26151501 -0.23247054  0.464741179  0.46085304  1.0000000
+## black    0.12806864 -0.27353398  0.29151167 -0.444412816 -0.44180801 -0.1773833
+## lstat   -0.61380827  0.60233853 -0.49699583  0.488676335  0.54399341  0.3740443
+## medv     0.69535995 -0.37695457  0.24992873 -0.381626231 -0.46853593 -0.5077867
+##               black      lstat       medv
+## crim    -0.38506394  0.4556215 -0.3883046
+## zn       0.17552032 -0.4129946  0.3604453
+## indus   -0.35697654  0.6037997 -0.4837252
+## chas     0.04878848 -0.0539293  0.1752602
+## nox     -0.38005064  0.5908789 -0.4273208
+## rm       0.12806864 -0.6138083  0.6953599
+## age     -0.27353398  0.6023385 -0.3769546
+## dis      0.29151167 -0.4969958  0.2499287
+## rad     -0.44441282  0.4886763 -0.3816262
+## tax     -0.44180801  0.5439934 -0.4685359
+## ptratio -0.17738330  0.3740443 -0.5077867
+## black    1.00000000 -0.3660869  0.3334608
+## lstat   -0.36608690  1.0000000 -0.7376627
+## medv     0.33346082 -0.7376627  1.0000000
+
corrplot(cor_matrix, method="circle")
+

+
#Let's try a mixed plot with correlation values (the closer a value is to 1, the stronger the correlation) 
+corrplot.mixed(cor_matrix, lower = 'number', upper = 'circle',tl.pos ="lt", tl.col='black', tl.cex=0.6, number.cex = 0.5)
+

+In the correlation matrix, the color blue represents positive and red represents negative correlation. Values closer to +1 and -1 imply a stronger positive or negative correlation, respectively. The strength is indicated by the shade of the color and by the decimal scale indexed on the right side of the plot.

+

Based on the plot we can see:

+

Positive correlation between some variables for example:

+
    +
  • Proportion of non-retail business acres per town (indus) and nitrogen oxides concentration in parts per 10 million (nox)
  • +
  • Index of accessibility to radial highways (rad) and full-value +property-tax rate per $10,000 (tax)
  • +
+

Negative correlation between some variables for example:

+
    +
  • Proportion of non-retail business acres per town (indus) and weighted mean of distances to five Boston employment centers (dis)
  • +
  • Nitrogen oxides concentration in ppm (nox) and weighted mean of +distances to five Boston employment centers (dis)
  • +
+
+
+

4.3 Standardizing the dataset

+

Let's begin the scaling exercise to standardize the data set. Since the Boston data contains only numerical values, we can use the function scale() to standardize the whole dataset as mentioned in the exercise 4 instructions.

+

We also have the following equation from the exercise, which gives us the idea of scaling: subtract the column means from the corresponding columns and divide by the standard deviation.

+

\[scaled(x) = \frac{x - mean(x)}{ +sd(x)}\]

+
#Saving the scaled data to the object 
+boston_scaled <- scale(Boston)
+
+#Lets see
+summary(boston_scaled)
+
##       crim                 zn               indus              chas        
+##  Min.   :-0.419367   Min.   :-0.48724   Min.   :-1.5563   Min.   :-0.2723  
+##  1st Qu.:-0.410563   1st Qu.:-0.48724   1st Qu.:-0.8668   1st Qu.:-0.2723  
+##  Median :-0.390280   Median :-0.48724   Median :-0.2109   Median :-0.2723  
+##  Mean   : 0.000000   Mean   : 0.00000   Mean   : 0.0000   Mean   : 0.0000  
+##  3rd Qu.: 0.007389   3rd Qu.: 0.04872   3rd Qu.: 1.0150   3rd Qu.:-0.2723  
+##  Max.   : 9.924110   Max.   : 3.80047   Max.   : 2.4202   Max.   : 3.6648  
+##       nox                rm               age               dis         
+##  Min.   :-1.4644   Min.   :-3.8764   Min.   :-2.3331   Min.   :-1.2658  
+##  1st Qu.:-0.9121   1st Qu.:-0.5681   1st Qu.:-0.8366   1st Qu.:-0.8049  
+##  Median :-0.1441   Median :-0.1084   Median : 0.3171   Median :-0.2790  
+##  Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000  
+##  3rd Qu.: 0.5981   3rd Qu.: 0.4823   3rd Qu.: 0.9059   3rd Qu.: 0.6617  
+##  Max.   : 2.7296   Max.   : 3.5515   Max.   : 1.1164   Max.   : 3.9566  
+##       rad               tax             ptratio            black        
+##  Min.   :-0.9819   Min.   :-1.3127   Min.   :-2.7047   Min.   :-3.9033  
+##  1st Qu.:-0.6373   1st Qu.:-0.7668   1st Qu.:-0.4876   1st Qu.: 0.2049  
+##  Median :-0.5225   Median :-0.4642   Median : 0.2746   Median : 0.3808  
+##  Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000  
+##  3rd Qu.: 1.6596   3rd Qu.: 1.5294   3rd Qu.: 0.8058   3rd Qu.: 0.4332  
+##  Max.   : 1.6596   Max.   : 1.7964   Max.   : 1.6372   Max.   : 0.4406  
+##      lstat              medv        
+##  Min.   :-1.5296   Min.   :-1.9063  
+##  1st Qu.:-0.7986   1st Qu.:-0.5989  
+##  Median :-0.1811   Median :-0.1449  
+##  Mean   : 0.0000   Mean   : 0.0000  
+##  3rd Qu.: 0.6024   3rd Qu.: 0.2683  
+##  Max.   : 3.5453   Max.   : 2.9865
+
# class of the boston_scaled object
+class(boston_scaled)
+
## [1] "matrix" "array"
+
# change the object to data frame
+boston_scaled <- as.data.frame(boston_scaled)
+

Standardizing the data set makes the analyses easier. scale() transforms the data into a matrix (array), so for further analysis we change it back to a data frame.

+

Let’s scale further by creating a quantile vector of crim and print +it

+
bins <- quantile(boston_scaled$crim)
+bins
+
##           0%          25%          50%          75%         100% 
+## -0.419366929 -0.410563278 -0.390280295  0.007389247  9.924109610
+
# create a categorical variable 'crime'
+crime <- cut(boston_scaled$crim, breaks = bins, include.lowest = TRUE)
+
+# look at the table of the new factor crime
+table(crime)
+
## crime
+## [-0.419,-0.411]  (-0.411,-0.39] (-0.39,0.00739]  (0.00739,9.92] 
+##             127             126             126             127
+
# remove original crim from the dataset
+boston_scaled <- dplyr::select(boston_scaled, -crim)
+
+# add the new categorical value to scaled data
+boston_scaled <- data.frame(boston_scaled, crime)
+
+# let's see the new data set now !! 
+summary(boston_scaled)
+
##        zn               indus              chas              nox         
+##  Min.   :-0.48724   Min.   :-1.5563   Min.   :-0.2723   Min.   :-1.4644  
+##  1st Qu.:-0.48724   1st Qu.:-0.8668   1st Qu.:-0.2723   1st Qu.:-0.9121  
+##  Median :-0.48724   Median :-0.2109   Median :-0.2723   Median :-0.1441  
+##  Mean   : 0.00000   Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000  
+##  3rd Qu.: 0.04872   3rd Qu.: 1.0150   3rd Qu.:-0.2723   3rd Qu.: 0.5981  
+##  Max.   : 3.80047   Max.   : 2.4202   Max.   : 3.6648   Max.   : 2.7296  
+##        rm               age               dis               rad         
+##  Min.   :-3.8764   Min.   :-2.3331   Min.   :-1.2658   Min.   :-0.9819  
+##  1st Qu.:-0.5681   1st Qu.:-0.8366   1st Qu.:-0.8049   1st Qu.:-0.6373  
+##  Median :-0.1084   Median : 0.3171   Median :-0.2790   Median :-0.5225  
+##  Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000  
+##  3rd Qu.: 0.4823   3rd Qu.: 0.9059   3rd Qu.: 0.6617   3rd Qu.: 1.6596  
+##  Max.   : 3.5515   Max.   : 1.1164   Max.   : 3.9566   Max.   : 1.6596  
+##       tax             ptratio            black             lstat        
+##  Min.   :-1.3127   Min.   :-2.7047   Min.   :-3.9033   Min.   :-1.5296  
+##  1st Qu.:-0.7668   1st Qu.:-0.4876   1st Qu.: 0.2049   1st Qu.:-0.7986  
+##  Median :-0.4642   Median : 0.2746   Median : 0.3808   Median :-0.1811  
+##  Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000  
+##  3rd Qu.: 1.5294   3rd Qu.: 0.8058   3rd Qu.: 0.4332   3rd Qu.: 0.6024  
+##  Max.   : 1.7964   Max.   : 1.6372   Max.   : 0.4406   Max.   : 3.5453  
+##       medv                     crime    
+##  Min.   :-1.9063   [-0.419,-0.411]:127  
+##  1st Qu.:-0.5989   (-0.411,-0.39] :126  
+##  Median :-0.1449   (-0.39,0.00739]:126  
+##  Mean   : 0.0000   (0.00739,9.92] :127  
+##  3rd Qu.: 0.2683                        
+##  Max.   : 2.9865
+

Now, as mentioned in the exercise set “Divide and conquer”, let's divide the data into train and test sets: training -> 80% and testing -> 20%

+
# number of rows in the Boston data set 
+n <- nrow(boston_scaled)
+n
+
## [1] 506
+
# choose randomly 80% of the rows
+ind <- sample(n,  size = n * 0.8)
+
+# create train set
+train <- boston_scaled[ind,]
+
+# create test set 
+test <- boston_scaled[-ind,]
+
+# save the correct classes from test data
+correct_classes <- test$crime
+
+# remove the crime variable from test data
+test <- dplyr::select(test, -crime)
+
+
+

4.4 Fitting the linear discriminant analysis

+

Let’s move on fitting LDA on the training set.

+

Our target variable: crime(categorical)

+
# linear discriminant analysis
+# Other variables are designated (.)
+lda.fit <- lda(crime ~ ., data = train)
+
+# print the lda.fit object
+lda.fit
+
## Call:
+## lda(crime ~ ., data = train)
+## 
+## Prior probabilities of groups:
+## [-0.419,-0.411]  (-0.411,-0.39] (-0.39,0.00739]  (0.00739,9.92] 
+##       0.2475248       0.2425743       0.2648515       0.2450495 
+## 
+## Group means:
+##                          zn      indus        chas        nox         rm
+## [-0.419,-0.411]  0.89447554 -0.8958692 -0.15421606 -0.8856586  0.4200376
+## (-0.411,-0.39]  -0.07378209 -0.3153201 -0.03128211 -0.5402615 -0.1894835
+## (-0.39,0.00739] -0.37343545  0.2036824  0.27960087  0.3978276  0.1189162
+## (0.00739,9.92]  -0.48724019  1.0171737 -0.15302300  1.0420418 -0.4325790
+##                        age        dis        rad        tax     ptratio
+## [-0.419,-0.411] -0.8914002  0.9357258 -0.7004968 -0.6996510 -0.45106613
+## (-0.411,-0.39]  -0.2556164  0.3769261 -0.5377192 -0.4763222 -0.07796912
+## (-0.39,0.00739]  0.4301848 -0.3891695 -0.3936844 -0.2751204 -0.33430767
+## (0.00739,9.92]   0.7949744 -0.8417167  1.6375616  1.5136504  0.78011702
+##                       black        lstat        medv
+## [-0.419,-0.411]  0.36979482 -0.750037404  0.49756898
+## (-0.411,-0.39]   0.31566531 -0.065445306 -0.03607504
+## (-0.39,0.00739]  0.09089154  0.007096295  0.19478879
+## (0.00739,9.92]  -0.86757833  0.978459544 -0.71986628
+## 
+## Coefficients of linear discriminants:
+##                 LD1          LD2         LD3
+## zn       0.14031540  0.598096615 -0.92383757
+## indus   -0.02930192 -0.189172614  0.22826688
+## chas    -0.10117097 -0.142233787  0.03149598
+## nox      0.41573233 -0.794904909 -1.25110968
+## rm      -0.11314539 -0.023087098 -0.22031401
+## age      0.33303790 -0.386946830  0.10834464
+## dis     -0.08707448 -0.225670182  0.26125036
+## rad      3.05322455  0.876145452  0.07643080
+## tax     -0.04818628  0.024343114  0.23153680
+## ptratio  0.13124250 -0.007118077 -0.25133969
+## black   -0.14400936 -0.003522677  0.13875654
+## lstat    0.21595687 -0.113343967  0.39891878
+## medv     0.18585476 -0.356563842 -0.20147563
+## 
+## Proportion of trace:
+##    LD1    LD2    LD3 
+## 0.9480 0.0404 0.0116
+

There are three discriminant functions (LD), with proportions of trace as follows:

+

LD1 LD2 LD3 0.9480 0.0404 0.0116

+
# the function for lda biplot arrows
+lda.arrows <- function(x, myscale = 1, arrow_heads = 0.1, color = "red", tex = 0.75, choices = c(1,2)){
+  heads <- coef(x)
+  arrows(x0 = 0, y0 = 0, 
+         x1 = myscale * heads[,choices[1]], 
+         y1 = myscale * heads[,choices[2]], col=color, length = arrow_heads)
+  text(myscale * heads[,choices], labels = row.names(heads), 
+       cex = tex, col=color, pos=3)
+}
+
+# target classes as numeric
+classes <- as.numeric(train$crime)
+
+# plot the lda results
+plot(lda.fit, dimen = 2, col = classes, pch = classes)
+lda.arrows(lda.fit, myscale = 1)
+

+
+
+

4.5 Predicting the LDA model

+
library(MASS)
+
+ind <- sample(nrow(boston_scaled),  size = nrow(boston_scaled) * 0.8)
+
+train <- boston_scaled[ind,]
+
+test <- boston_scaled[-ind,]
+
+correct_classes <- test$crime
+
+test <- dplyr::select(test, -crime)
+
+lda.fit = lda(crime ~ ., data=train)
+
+# predict classes with test data
+lda.pred <- predict(lda.fit, newdata = test)
+
+# cross tabulate the results
+table(correct = correct_classes, predicted = lda.pred$class)
+
##                  predicted
+## correct           [-0.419,-0.411] (-0.411,-0.39] (-0.39,0.00739] (0.00739,9.92]
+##   [-0.419,-0.411]              23              9               1              0
+##   (-0.411,-0.39]                2             17               2              0
+##   (-0.39,0.00739]               1             10              14              3
+##   (0.00739,9.92]                0              0               0             20
+
+
+

4.6 Model Optimization and K means clustering

+

Let’s work with clustering now Let’s calculate also the distances +between observations to assess the similarity between data points. As +instructed we calculated two distance matrix euclidean and +manhattan.

+
#start by reloading the Boston data set
+data("Boston")
+
+boston_scaled <- scale(Boston)
+summary(boston_scaled)
+
##       crim                 zn               indus              chas        
+##  Min.   :-0.419367   Min.   :-0.48724   Min.   :-1.5563   Min.   :-0.2723  
+##  1st Qu.:-0.410563   1st Qu.:-0.48724   1st Qu.:-0.8668   1st Qu.:-0.2723  
+##  Median :-0.390280   Median :-0.48724   Median :-0.2109   Median :-0.2723  
+##  Mean   : 0.000000   Mean   : 0.00000   Mean   : 0.0000   Mean   : 0.0000  
+##  3rd Qu.: 0.007389   3rd Qu.: 0.04872   3rd Qu.: 1.0150   3rd Qu.:-0.2723  
+##  Max.   : 9.924110   Max.   : 3.80047   Max.   : 2.4202   Max.   : 3.6648  
+##       nox                rm               age               dis         
+##  Min.   :-1.4644   Min.   :-3.8764   Min.   :-2.3331   Min.   :-1.2658  
+##  1st Qu.:-0.9121   1st Qu.:-0.5681   1st Qu.:-0.8366   1st Qu.:-0.8049  
+##  Median :-0.1441   Median :-0.1084   Median : 0.3171   Median :-0.2790  
+##  Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000  
+##  3rd Qu.: 0.5981   3rd Qu.: 0.4823   3rd Qu.: 0.9059   3rd Qu.: 0.6617  
+##  Max.   : 2.7296   Max.   : 3.5515   Max.   : 1.1164   Max.   : 3.9566  
+##       rad               tax             ptratio            black        
+##  Min.   :-0.9819   Min.   :-1.3127   Min.   :-2.7047   Min.   :-3.9033  
+##  1st Qu.:-0.6373   1st Qu.:-0.7668   1st Qu.:-0.4876   1st Qu.: 0.2049  
+##  Median :-0.5225   Median :-0.4642   Median : 0.2746   Median : 0.3808  
+##  Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000  
+##  3rd Qu.: 1.6596   3rd Qu.: 1.5294   3rd Qu.: 0.8058   3rd Qu.: 0.4332  
+##  Max.   : 1.6596   Max.   : 1.7964   Max.   : 1.6372   Max.   : 0.4406  
+##      lstat              medv        
+##  Min.   :-1.5296   Min.   :-1.9063  
+##  1st Qu.:-0.7986   1st Qu.:-0.5989  
+##  Median :-0.1811   Median :-0.1449  
+##  Mean   : 0.0000   Mean   : 0.0000  
+##  3rd Qu.: 0.6024   3rd Qu.: 0.2683  
+##  Max.   : 3.5453   Max.   : 2.9865
+
#class of Boston_scaled object
+class(boston_scaled)
+
## [1] "matrix" "array"
+
# change the object to data frame for further analysis
+boston_scaled <- as.data.frame(boston_scaled)
+
# Calculating distances between the observation
+# euclidean distance matrix
+dist_eu <- dist(boston_scaled, method = "euclidean")
+
+# look at the summary of the distances
+summary(dist_eu)
+
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
+##  0.1343  3.4625  4.8241  4.9111  6.1863 14.3970
+
# Manhattan distance matrix
+dist_man <- dist(boston_scaled, method = "manhattan")
+
+# look at the summary of the distances
+summary(dist_man)
+
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
+##  0.2662  8.4832 12.6090 13.5488 17.7568 48.8618
+

Let's do K-means clustering (first with 4 clusters and then 3)

+
set.seed(123) #function set.seed() is used here to deal with the random assigning of the initial cluster centers when conducting k-means clustering
+
+# k-means clustering
+km <- kmeans(boston_scaled, centers = 4)
+
+# plotting the scaled Boston data set with clusters
+pairs(boston_scaled, col = km$cluster)
+

+With 4 clusters, it appears that there is some overlapping between clusters. Let's try 3 clusters and check.

+
# k-means clustering
+km <- kmeans(boston_scaled, centers = 3)
+
+# plotting the scaled Boston data set with clusters
+pairs(boston_scaled, col = km$cluster)
+

+With 3 clusters it is even worse! We can try to determine the optimal number of clusters for our model and see how it works.

+
+
+

4.7 K-means: determine the k

+
#Investigating the optimal number of clusters
+set.seed(123) #setseed function again
+
+# determine the number of clusters
+k_max <- 10
+
+# calculate the total within sum of squares
+twcss <- sapply(1:k_max, function(k){kmeans(boston_scaled, k)$tot.withinss})
+
+# visualize the results
+qplot(x = 1:k_max, y = twcss, geom = 'line', ylab = "TWCSS", xlab = "Number of cluster")
+

+

From the plot it looks like the optimal number of clusters for our model is 2-3, as there is a sharp drop in the TWCSS values. We try 2, because the steepest drop occurs by 2 clusters and the curve levels off afterwards.

+
# k-means clustering
+km <- kmeans(boston_scaled, centers = 2)
+
+# plot the Boston data set with clusters predicted
+pairs(boston_scaled, col = km$cluster)
+

+
pairs(boston_scaled[6:10], col = km$cluster)
+

+With 2 clusters the model seems better: there is good separation, for example between rad and tax; rm and age look reasonable as well.

+
+
+ + + + +
+ + + + + + + + + + + + + + + diff --git a/chapter5.Rmd b/chapter5.Rmd new file mode 100644 index 000000000..229c208b7 --- /dev/null +++ b/chapter5.Rmd @@ -0,0 +1,200 @@ +# Week 5: Dimensionality reduction techniques + +Actually, a fairly large selection of statistical methods can be listed under the title "dimensionality reduction techniques". Most often (nearly always, that is!) the real-world phenomena are multidimensional: they may consist of not just two or three but 5 or 10 or 20 or 50 (or more) dimensions. Of course, we are living only in a three-dimensional (3D) world, so those multiple dimensions may really challenge our imagination. It would be easier to reduce the number of dimensions in one way or another. + +We shall now learn the basics of two data science based ways of reducing the dimensions. The principal method here is principal component analysis (PCA), which reduces any number of measured (continuous) and correlated variables into a few uncorrelated components that collect together as much variance as possible from the original variables. The most important components can be then used for various purposes, e.g., drawing scatterplots and other fancy graphs that would be quite impossible to achieve with the original variables and too many dimensions. + +Multiple correspondence analysis (MCA) and other variations of CA bring us similar possibilities in the world of discrete variables, even nominal scale (classified) variables, by finding a suitable transformation into continuous scales and then reducing the dimensions quite analogously with the PCA. The typical graphs show the original classes of the discrete variables on the same "map", making it possible to reveal connections (correspondences) between different things that would be quite impossible to see from the corresponding cross tables (too many numbers!). 
+ +Briefly stated, these methods help to visualize and understand multidimensional phenomena by reducing their dimensionality that may first feel impossible to handle at all. + +```{r} +date() +``` + +Lets start !! + +## 5.1 Packages required + +```{r} +#load the packages + +library(tidyverse) +library(GGally) +library(dplyr) +library(ggplot2) +library(corrplot) +library(stringr) +library(psych) +library(FactoMineR) +library(tidyr) +``` + +## 5.2 Loading the data set from wrangling exercise + +```{r} +#loading from project folder +human_ <- read.csv('human_.csv', row.names = 1) + +#Alternatively url is given in the instruction page, so we can also use that !! +# human <- read.csv("https://raw.githubusercontent.com/KimmoVehkalahti/Helsinki-Open-Data-Science/master/datasets/human2.txt", row.names = 1) + +#lets check how the data looks +str(human_);dim(human_) +colnames(human_) +``` + +## 5.3 Graphical overview + +```{r} +summary(human_) +``` +```{r} +Plot1 <- p1 <- ggpairs(human_, mapping = aes(alpha=0.5), title="summary plot",lower = list(combo = wrap("facethist", bins = 25))) +Plot1 +``` +```{r} +#Lets see the correlation matrix with corrplot, I have used same method as last week's exercise with some changes +Plot2 <- cor(human_, method='spearman') +Plot2 +corrplot.mixed(Plot2, lower = 'number', upper = 'ellipse',tl.pos ="lt", tl.col='black', tl.cex=0.8, number.cex = 0.7) +``` +In above, correlation plot, I have used ellipse method to visualize the relationship between different variables. The correlation is stronger when the ellipse are narrower and two color spectrum blue and red represents positive and negative correlation respectively. 
+ +Statistically significant strong positive correlation from two plots are between variables + + - Life expectancy (Life_Exp) and Expected years of schooling (Exp_Edu) + - Life expectancy (Life_Exp) and Gross National Income per capita (GNI) + - Expected years of schooling (Exp_Edu) and Gross National Income per capita (GNI) + - Maternal Mortality Rates (MMR) and Adolescent Birth Rate (ABR) + +Statistically significant strong negative correlation from two plots are between variables + + - Life expectancy (Life_Exp) and Maternal Mortality Rate (MMR) + - Expected years of schooling (Exp_Edu) and Maternal Mortality Rate (MMR) + - Gross National Income (GNI) and Maternal Mortality Rate (MMR) + +Two variables; labor Force Mortality of male and female combined (LFR_FM) and percentage of female representatives in the parliament (%PR) doesn't show any correlation and therefore are not associated with any other variables. Like wise secondary education of male and female combines (SeEdu_FM) isn't associated strongly with any other variables. + +## 5.4 Principal component analysis (PCA) + +[Principal Component Analysis](https://en.wikipedia.org/wiki/Principal_component_analysis) (PCA) can be performed by two sightly different matrix decomposition methods from linear algebra: the [Eigenvalue Decomposition](https://en.wikipedia.org/wiki/Eigendecomposition_of_a_matrix) and the [Singular Value Decomposition](https://en.wikipedia.org/wiki/Singular_value_decomposition) (SVD). + +There are two functions in the default package distribution of R that can be used to perform PCA: `princomp()` and `prcomp()`. The `prcomp()` function uses the SVD and is the preferred, more numerically accurate method. +Both methods quite literally *decompose* a data matrix into a product of smaller matrices, which let's us extract the underlying **principal components**. This makes it possible to approximate a lower dimensional representation of the data by choosing only a few principal components. 
+ +Lets follow the instruction from course material and + +```{r} + +# lets create `human_std` by standardizing the variables in `human` +human_std <- scale(human_) + +# print out summaries of the standardized variables +summary(human_std) + +# perform principal component analysis (with the SVD method) +pca_human <- prcomp(human_std) +pca_human +summary(pca_human) + +# draw a biplot of the principal component representation and the original variables +biplot(pca_human, choices = 1:2, cex = c(0.8, 1), col = c("darkred", "darkgreen")) +``` +From the plot we can see the variability captured by the principal components which seems to have a realistic distribution between the principal components. + +Summary of the result + +From the plot (rounded with %) + +PC1 = 53.61 % variance +PC2 = 16.24 % variance +PC3 = 9.57 % variance +PC4 = 7.58 % variance +PC5 = 5.47 % variance +PC6 = 3.59 % variance +PC7 = 2.63 % variance +PC8 = 1.29 % variance + +And the standard deviation (SD) + + PC1 PC2 PC3 PC4 PC5 PC6 PC7 PC8 +Standard deviation 2.0708 1.1397 0.87505 0.77886 0.66196 0.53631 0.45900 0.32224 + +## 5.5 Intrepretation of the analysis + +Summary of PC1-PC8 rounded with percentage (2 Decimal points only) is elaborated above + +PC1 gives the most (53,61%) and PC8 gives the least (1.29%) of the variability in the data set + +The variables affect mostly based PC1-PC8 are (explained as an example from table in the summary) + +Exp_Edu (positive effect) +GNI (positive effect) +Life_exp (positive effect) +SeEdu_FM (positive effect) +MMR (negative effect) +ABR (negative effect) + +## 5.6 Lets see the "tea" data set + +The tea data comes from the FactoMineR package and it is measured with a questionnaire on tea: 300 individuals were asked how they drink tea (18 questions) and what are their product's perception (12 questions). In addition, some personal details were asked (4 questions). 
+ +The [Factominer](https://cran.r-project.org/web/packages/FactoMineR/index.html) package contains functions dedicated to multivariate explanatory data analysis. It contains for example methods *(Multiple) Correspondence analysis* , *Multiple Factor analysis* as well as PCA. + +In the next exercises we are going to use the `tea` dataset. The dataset contains the answers of a questionnaire on tea consumption. + +Let's dwell in teas for a bit! + +```{r} +tea <- read.csv("https://raw.githubusercontent.com/KimmoVehkalahti/Helsinki-Open-Data-Science/master/datasets/tea.csv", stringsAsFactors = TRUE) +view(tea) +str(tea);dim(tea) +colnames(tea) +summary(tea) +``` + + +```{r} +# lets work with some variables +keep_columns <- c("Tea", "How", "how", "sugar", "where", "lunch") + + +# select the 'keep_columns' to create a new data set +tea_time <- dplyr::select(tea, all_of(keep_columns)) + +# look at the summaries and structure of the data +summary(tea_time) + +# visualize the data set +pivot_longer(tea_time, cols = everything()) %>% + ggplot(aes(value)) + facet_wrap("name", scales = "free", ncol=6) + + geom_bar() + theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 8)) + + +``` + +## 5.7 Multiple Correspondence Analysis (MCA) with "tea" data set + +```{r} +# multiple correspondence analysis +#library(FactoMineR), package is loaded above already, this just as note !! + +mca <- MCA(tea_time, graph = FALSE) + +# summary of the model +summary(mca) + +# visualize MCA +plot(mca, invisible=c("ind"), graph.type = "classic", habillage = "quali") +mca +``` +```{r} +plotellipses(mca) +``` +I have only chosen selected variables here. From the selected categories, in category where , chain store and tea shop seem to be favored. Likewise in category how, milk tea, alone and other (undefined) seemed preferred. Also how, tea bag, un-packaged seem to be preferred. 
+ + + + + diff --git a/chapter5.html b/chapter5.html new file mode 100644 index 000000000..0745592d5 --- /dev/null +++ b/chapter5.html @@ -0,0 +1,954 @@ + + + + + + + + + + + + + +chapter5.knit + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + + + + +
+

Week 5: Dimensionality reduction techniques

+

Actually, a fairly large selection of statistical methods can be +listed under the title “dimensionality reduction techniques”. Most often +(nearly always, that is!) the real-world phenomena are multidimensional: +they may consist of not just two or three but 5 or 10 or 20 or 50 (or +more) dimensions. Of course, we are living only in a three-dimensional +(3D) world, so those multiple dimensions may really challenge our +imagination. It would be easier to reduce the number of dimensions in +one way or another.

+

We shall now learn the basics of two data science based ways of +reducing the dimensions. The principal method here is principal +component analysis (PCA), which reduces any number of measured +(continuous) and correlated variables into a few uncorrelated components +that collect together as much variance as possible from the original +variables. The most important components can be then used for various +purposes, e.g., drawing scatterplots and other fancy graphs that would +be quite impossible to achieve with the original variables and too many +dimensions.

+

Multiple correspondence analysis (MCA) and other variations of CA +bring us similar possibilities in the world of discrete variables, even +nominal scale (classified) variables, by finding a suitable +transformation into continuous scales and then reducing the dimensions +quite analogously with the PCA. The typical graphs show the original +classes of the discrete variables on the same “map”, making it possible +to reveal connections (correspondences) between different things that +would be quite impossible to see from the corresponding cross tables +(too many numbers!).

+

Briefly stated, these methods help to visualize and understand +multidimensional phenomena by reducing their dimensionality that may +first feel impossible to handle at all.

+
date()
+
## [1] "Tue Dec  6 21:29:41 2022"
+

Lets start !!

+
+

5.1 Packages required

+
#load the packages 
+
+library(tidyverse)
+
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
+## ✔ ggplot2 3.3.6     ✔ purrr   0.3.4
+## ✔ tibble  3.1.8     ✔ dplyr   1.0.9
+## ✔ tidyr   1.2.0     ✔ stringr 1.4.1
+## ✔ readr   2.1.2     ✔ forcats 0.5.2
+## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
+## ✖ dplyr::filter() masks stats::filter()
+## ✖ dplyr::lag()    masks stats::lag()
+
library(GGally)
+
## Registered S3 method overwritten by 'GGally':
+##   method from   
+##   +.gg   ggplot2
+
library(dplyr)
+library(ggplot2)
+library(corrplot)
+
## corrplot 0.92 loaded
+
library(stringr)
+library(psych) 
+
## 
+## Attaching package: 'psych'
+## 
+## The following objects are masked from 'package:ggplot2':
+## 
+##     %+%, alpha
+
library(FactoMineR)
+library(tidyr)
+
+
+

5.2 Loading the data set from wrangling exercise

+
#loading from project folder 
+human_ <- read.csv('human_.csv', row.names = 1)
+
+#Alternatively url is given in the instruction page, so we can also use that !! 
+# human <- read.csv("https://raw.githubusercontent.com/KimmoVehkalahti/Helsinki-Open-Data-Science/master/datasets/human2.txt", row.names = 1)
+
+#lets check how the data looks
+str(human_);dim(human_)
+
## 'data.frame':    155 obs. of  8 variables:
+##  $ SeEdu_FM: num  1.007 0.997 0.983 0.989 0.969 ...
+##  $ LFR_FM  : num  0.891 0.819 0.825 0.884 0.829 ...
+##  $ Life_Exp: num  81.6 82.4 83 80.2 81.6 80.9 80.9 79.1 82 81.8 ...
+##  $ Exp_Edu : num  17.5 20.2 15.8 18.7 17.9 16.5 18.6 16.5 15.9 19.2 ...
+##  $ GNI     : int  64992 42261 56431 44025 45435 43919 39568 52947 42155 32689 ...
+##  $ MMR     : int  4 6 6 5 6 7 9 28 11 8 ...
+##  $ ABR     : num  7.8 12.1 1.9 5.1 6.2 3.8 8.2 31 14.5 25.3 ...
+##  $ X.PR    : num  39.6 30.5 28.5 38 36.9 36.9 19.9 19.4 28.2 31.4 ...
+
## [1] 155   8
+
colnames(human_)
+
## [1] "SeEdu_FM" "LFR_FM"   "Life_Exp" "Exp_Edu"  "GNI"      "MMR"      "ABR"     
+## [8] "X.PR"
+
+
+

5.3 Graphical overview

+
summary(human_)
+
##     SeEdu_FM          LFR_FM          Life_Exp        Exp_Edu     
+##  Min.   :0.1717   Min.   :0.1857   Min.   :49.00   Min.   : 5.40  
+##  1st Qu.:0.7264   1st Qu.:0.5984   1st Qu.:66.30   1st Qu.:11.25  
+##  Median :0.9375   Median :0.7535   Median :74.20   Median :13.50  
+##  Mean   :0.8529   Mean   :0.7074   Mean   :71.65   Mean   :13.18  
+##  3rd Qu.:0.9968   3rd Qu.:0.8535   3rd Qu.:77.25   3rd Qu.:15.20  
+##  Max.   :1.4967   Max.   :1.0380   Max.   :83.50   Max.   :20.20  
+##       GNI              MMR              ABR              X.PR      
+##  Min.   :   581   Min.   :   1.0   Min.   :  0.60   Min.   : 0.00  
+##  1st Qu.:  4198   1st Qu.:  11.5   1st Qu.: 12.65   1st Qu.:12.40  
+##  Median : 12040   Median :  49.0   Median : 33.60   Median :19.30  
+##  Mean   : 17628   Mean   : 149.1   Mean   : 47.16   Mean   :20.91  
+##  3rd Qu.: 24512   3rd Qu.: 190.0   3rd Qu.: 71.95   3rd Qu.:27.95  
+##  Max.   :123124   Max.   :1100.0   Max.   :204.80   Max.   :57.50
+
Plot1 <- p1 <- ggpairs(human_, mapping = aes(alpha=0.5), title="summary plot",lower = list(combo = wrap("facethist", bins = 25)))
+Plot1
+

+
#Lets see the correlation matrix with corrplot, I have used same method as last week's exercise with some changes
+Plot2 <- cor(human_, method='spearman')
+Plot2
+
##             SeEdu_FM      LFR_FM   Life_Exp     Exp_Edu        GNI        MMR
+## SeEdu_FM  1.00000000 -0.07525177  0.4987487  0.52316296  0.5663235 -0.4937468
+## LFR_FM   -0.07525177  1.00000000 -0.1535502 -0.03956409 -0.1414489  0.1213061
+## Life_Exp  0.49874873 -0.15355015  1.0000000  0.81053640  0.8361208 -0.8753753
+## Exp_Edu   0.52316296 -0.03956409  0.8105364  1.00000000  0.8495071 -0.8472241
+## GNI       0.56632346 -0.14144887  0.8361208  0.84950709  1.0000000 -0.8647968
+## MMR      -0.49374677  0.12130614 -0.8753753 -0.84722412 -0.8647968  1.0000000
+## ABR      -0.35514985  0.09705612 -0.7468356 -0.72251512 -0.7562608  0.8315049
+## X.PR      0.09722070  0.20545990  0.2523837  0.22396862  0.1778011 -0.2028040
+##                  ABR       X.PR
+## SeEdu_FM -0.35514985  0.0972207
+## LFR_FM    0.09705612  0.2054599
+## Life_Exp -0.74683557  0.2523837
+## Exp_Edu  -0.72251512  0.2239686
+## GNI      -0.75626078  0.1778011
+## MMR       0.83150492 -0.2028040
+## ABR       1.00000000 -0.1214131
+## X.PR     -0.12141308  1.0000000
+
corrplot.mixed(Plot2, lower = 'number', upper = 'ellipse',tl.pos ="lt", tl.col='black', tl.cex=0.8, number.cex = 0.7)
+

+In the above correlation plot, I have used the ellipse method to visualize the relationship between different variables. The correlation is stronger when the ellipses are narrower, and the two-color spectrum (blue and red) represents positive and negative correlation, respectively.

+

Statistically significant strong positive correlations from the two plots are between the variables

+
    +
  • Life expectancy (Life_Exp) and Expected years of schooling +(Exp_Edu)
  • +
  • Life expectancy (Life_Exp) and Gross National Income per capita +(GNI)
  • +
  • Expected years of schooling (Exp_Edu) and Gross National Income per +capita (GNI)
  • +
  • Maternal Mortality Rates (MMR) and Adolescent Birth Rate (ABR)
  • +
+

Statistically significant strong negative correlations from the two plots are between the variables

+
    +
  • Life expectancy (Life_Exp) and Maternal Mortality Rate (MMR)
  • +
  • Expected years of schooling (Exp_Edu) and Maternal Mortality Rate +(MMR)
  • +
  • Gross National Income (GNI) and Maternal Mortality Rate (MMR)
  • +
+

Two variables — the labour force participation ratio of females and males (LFR_FM) and the percentage of female representatives in parliament (%PR) — do not show strong correlations and are therefore not associated with the other variables. Likewise, the ratio of females to males with at least secondary education (SeEdu_FM) is not strongly associated with any other variable.

+
+
+

5.4 Principal component analysis (PCA)

+

Principal +Component Analysis (PCA) can be performed by two slightly different +matrix decomposition methods from linear algebra: the Eigenvalue +Decomposition and the Singular +Value Decomposition (SVD).

+

There are two functions in the default package distribution of R that +can be used to perform PCA: princomp() and +prcomp(). The prcomp() function uses the SVD +and is the preferred, more numerically accurate method. Both methods +quite literally decompose a data matrix into a product of +smaller matrices, which lets us extract the underlying +principal components. This makes it possible to +approximate a lower-dimensional representation of the data by choosing +only a few principal components.

+

Lets follow the instruction from course material and

+
# lets create `human_std` by standardizing the variables in `human`
+human_std <- scale(human_)
+
+# print out summaries of the standardized variables
+summary(human_std)
+
##     SeEdu_FM           LFR_FM           Life_Exp          Exp_Edu       
+##  Min.   :-2.8189   Min.   :-2.6247   Min.   :-2.7188   Min.   :-2.7378  
+##  1st Qu.:-0.5233   1st Qu.:-0.5484   1st Qu.:-0.6425   1st Qu.:-0.6782  
+##  Median : 0.3503   Median : 0.2316   Median : 0.3056   Median : 0.1140  
+##  Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000  
+##  3rd Qu.: 0.5958   3rd Qu.: 0.7350   3rd Qu.: 0.6717   3rd Qu.: 0.7126  
+##  Max.   : 2.6646   Max.   : 1.6632   Max.   : 1.4218   Max.   : 2.4730  
+##       GNI               MMR               ABR               X.PR        
+##  Min.   :-0.9193   Min.   :-0.6992   Min.   :-1.1325   Min.   :-1.8203  
+##  1st Qu.:-0.7243   1st Qu.:-0.6496   1st Qu.:-0.8394   1st Qu.:-0.7409  
+##  Median :-0.3013   Median :-0.4726   Median :-0.3298   Median :-0.1403  
+##  Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000  
+##  3rd Qu.: 0.3712   3rd Qu.: 0.1932   3rd Qu.: 0.6030   3rd Qu.: 0.6127  
+##  Max.   : 5.6890   Max.   : 4.4899   Max.   : 3.8344   Max.   : 3.1850
+
# perform principal component analysis (with the SVD method)
+pca_human <- prcomp(human_std)
+pca_human
+
## Standard deviations (1, .., p=8):
+## [1] 2.0708380 1.1397204 0.8750485 0.7788630 0.6619563 0.5363061 0.4589994
+## [8] 0.3222406
+## 
+## Rotation (n x k) = (8 x 8):
+##                  PC1         PC2         PC3         PC4        PC5         PC6
+## SeEdu_FM -0.35664370  0.03796058 -0.24223089  0.62678110 -0.5983585  0.17713316
+## LFR_FM    0.05457785  0.72432726 -0.58428770  0.06199424  0.2625067 -0.03500707
+## Life_Exp -0.44372240 -0.02530473  0.10991305 -0.05834819  0.1628935 -0.42242796
+## Exp_Edu  -0.42766720  0.13940571 -0.07340270 -0.07020294  0.1659678 -0.38606919
+## GNI      -0.35048295  0.05060876 -0.20168779 -0.72727675 -0.4950306  0.11120305
+## MMR       0.43697098  0.14508727 -0.12522539 -0.25170614 -0.1800657  0.17370039
+## ABR       0.41126010  0.07708468  0.01968243  0.04986763 -0.4672068 -0.76056557
+## X.PR     -0.08438558  0.65136866  0.72506309  0.01396293 -0.1523699  0.13749772
+##                  PC7         PC8
+## SeEdu_FM  0.05773644  0.16459453
+## LFR_FM   -0.22729927 -0.07304568
+## Life_Exp -0.43406432  0.62737008
+## Exp_Edu   0.77962966 -0.05415984
+## GNI      -0.13711838 -0.16961173
+## MMR       0.35380306  0.72193946
+## ABR      -0.06897064 -0.14335186
+## X.PR      0.00568387 -0.02306476
+
summary(pca_human)
+
## Importance of components:
+##                           PC1    PC2     PC3     PC4     PC5     PC6     PC7
+## Standard deviation     2.0708 1.1397 0.87505 0.77886 0.66196 0.53631 0.45900
+## Proportion of Variance 0.5361 0.1624 0.09571 0.07583 0.05477 0.03595 0.02634
+## Cumulative Proportion  0.5361 0.6984 0.79413 0.86996 0.92473 0.96069 0.98702
+##                            PC8
+## Standard deviation     0.32224
+## Proportion of Variance 0.01298
+## Cumulative Proportion  1.00000
+
# draw a biplot of the principal component representation and the original variables
+biplot(pca_human, choices = 1:2, cex = c(0.8, 1), col = c("darkred", "darkgreen"))
+

+From the plot we can see the variability captured by the principal components, which seems to be realistically distributed among them.

+

Summary of the result

+

From the plot (rounded with %)

+

PC1 = 53.61 % variance PC2 = 16.24 % variance PC3 = 9.57 % variance +PC4 = 7.58 % variance PC5 = 5.47 % variance PC6 = 3.59 % variance PC7 = +2.63 % variance PC8 = 1.29 % variance

+

And the standard deviation (SD)

+
                     PC1    PC2     PC3     PC4     PC5     PC6     PC7     PC8
+

Standard deviation 2.0708 1.1397 0.87505 0.77886 0.66196 0.53631 +0.45900 0.32224

+
+
+

5.5 Interpretation of the analysis

+

Summary of PC1-PC8 rounded with percentage (2 Decimal points only) is +elaborated above

+

PC1 explains the most (53.61%) and PC8 the least (1.29%) of the +variability in the data set

+

The variables affect mostly based PC1-PC8 are (explained as an +example from table in the summary)

+

Exp_Edu (positive effect) GNI (positive effect) Life_exp (positive +effect) SeEdu_FM (positive effect) MMR (negative effect) ABR (negative +effect)

+
+
+

5.6 Lets see the “tea” data set

+

The tea data comes from the FactoMineR package and it is measured +with a questionnaire on tea: 300 individuals were asked how they drink +tea (18 questions) and what are their product’s perception (12 +questions). In addition, some personal details were asked (4 +questions).

+

The Factominer +package contains functions dedicated to multivariate explanatory data +analysis. It contains for example methods (Multiple) Correspondence +analysis , Multiple Factor analysis as well as PCA.

+

In the next exercises we are going to use the tea +dataset. The dataset contains the answers of a questionnaire on tea +consumption.

+

Let’s dwell in teas for a bit!

+
tea <- read.csv("https://raw.githubusercontent.com/KimmoVehkalahti/Helsinki-Open-Data-Science/master/datasets/tea.csv", stringsAsFactors = TRUE)
+view(tea)
+str(tea);dim(tea)
+
## 'data.frame':    300 obs. of  36 variables:
+##  $ breakfast       : Factor w/ 2 levels "breakfast","Not.breakfast": 1 1 2 2 1 2 1 2 1 1 ...
+##  $ tea.time        : Factor w/ 2 levels "Not.tea time",..: 1 1 2 1 1 1 2 2 2 1 ...
+##  $ evening         : Factor w/ 2 levels "evening","Not.evening": 2 2 1 2 1 2 2 1 2 1 ...
+##  $ lunch           : Factor w/ 2 levels "lunch","Not.lunch": 2 2 2 2 2 2 2 2 2 2 ...
+##  $ dinner          : Factor w/ 2 levels "dinner","Not.dinner": 2 2 1 1 2 1 2 2 2 2 ...
+##  $ always          : Factor w/ 2 levels "always","Not.always": 2 2 2 2 1 2 2 2 2 2 ...
+##  $ home            : Factor w/ 2 levels "home","Not.home": 1 1 1 1 1 1 1 1 1 1 ...
+##  $ work            : Factor w/ 2 levels "Not.work","work": 1 1 2 1 1 1 1 1 1 1 ...
+##  $ tearoom         : Factor w/ 2 levels "Not.tearoom",..: 1 1 1 1 1 1 1 1 1 2 ...
+##  $ friends         : Factor w/ 2 levels "friends","Not.friends": 2 2 1 2 2 2 1 2 2 2 ...
+##  $ resto           : Factor w/ 2 levels "Not.resto","resto": 1 1 2 1 1 1 1 1 1 1 ...
+##  $ pub             : Factor w/ 2 levels "Not.pub","pub": 1 1 1 1 1 1 1 1 1 1 ...
+##  $ Tea             : Factor w/ 3 levels "black","Earl Grey",..: 1 1 2 2 2 2 2 1 2 1 ...
+##  $ How             : Factor w/ 4 levels "alone","lemon",..: 1 3 1 1 1 1 1 3 3 1 ...
+##  $ sugar           : Factor w/ 2 levels "No.sugar","sugar": 2 1 1 2 1 1 1 1 1 1 ...
+##  $ how             : Factor w/ 3 levels "tea bag","tea bag+unpackaged",..: 1 1 1 1 1 1 1 1 2 2 ...
+##  $ where           : Factor w/ 3 levels "chain store",..: 1 1 1 1 1 1 1 1 2 2 ...
+##  $ price           : Factor w/ 6 levels "p_branded","p_cheap",..: 4 6 6 6 6 3 6 6 5 5 ...
+##  $ age             : int  39 45 47 23 48 21 37 36 40 37 ...
+##  $ sex             : Factor w/ 2 levels "F","M": 2 1 1 2 2 2 2 1 2 2 ...
+##  $ SPC             : Factor w/ 7 levels "employee","middle",..: 2 2 4 6 1 6 5 2 5 5 ...
+##  $ Sport           : Factor w/ 2 levels "Not.sportsman",..: 2 2 2 1 2 2 2 2 2 1 ...
+##  $ age_Q           : Factor w/ 5 levels "+60","15-24",..: 4 5 5 2 5 2 4 4 4 4 ...
+##  $ frequency       : Factor w/ 4 levels "+2/day","1 to 2/week",..: 3 3 1 3 1 3 4 2 1 1 ...
+##  $ escape.exoticism: Factor w/ 2 levels "escape-exoticism",..: 2 1 2 1 1 2 2 2 2 2 ...
+##  $ spirituality    : Factor w/ 2 levels "Not.spirituality",..: 1 1 1 2 2 1 1 1 1 1 ...
+##  $ healthy         : Factor w/ 2 levels "healthy","Not.healthy": 1 1 1 1 2 1 1 1 2 1 ...
+##  $ diuretic        : Factor w/ 2 levels "diuretic","Not.diuretic": 2 1 1 2 1 2 2 2 2 1 ...
+##  $ friendliness    : Factor w/ 2 levels "friendliness",..: 2 2 1 2 1 2 2 1 2 1 ...
+##  $ iron.absorption : Factor w/ 2 levels "iron absorption",..: 2 2 2 2 2 2 2 2 2 2 ...
+##  $ feminine        : Factor w/ 2 levels "feminine","Not.feminine": 2 2 2 2 2 2 2 1 2 2 ...
+##  $ sophisticated   : Factor w/ 2 levels "Not.sophisticated",..: 1 1 1 2 1 1 1 2 2 1 ...
+##  $ slimming        : Factor w/ 2 levels "No.slimming",..: 1 1 1 1 1 1 1 1 1 1 ...
+##  $ exciting        : Factor w/ 2 levels "exciting","No.exciting": 2 1 2 2 2 2 2 2 2 2 ...
+##  $ relaxing        : Factor w/ 2 levels "No.relaxing",..: 1 1 2 2 2 2 2 2 2 2 ...
+##  $ effect.on.health: Factor w/ 2 levels "effect on health",..: 2 2 2 2 2 2 2 2 2 2 ...
+
## [1] 300  36
+
colnames(tea)
+
##  [1] "breakfast"        "tea.time"         "evening"          "lunch"           
+##  [5] "dinner"           "always"           "home"             "work"            
+##  [9] "tearoom"          "friends"          "resto"            "pub"             
+## [13] "Tea"              "How"              "sugar"            "how"             
+## [17] "where"            "price"            "age"              "sex"             
+## [21] "SPC"              "Sport"            "age_Q"            "frequency"       
+## [25] "escape.exoticism" "spirituality"     "healthy"          "diuretic"        
+## [29] "friendliness"     "iron.absorption"  "feminine"         "sophisticated"   
+## [33] "slimming"         "exciting"         "relaxing"         "effect.on.health"
+
summary(tea)
+
##          breakfast           tea.time          evening          lunch    
+##  breakfast    :144   Not.tea time:131   evening    :103   lunch    : 44  
+##  Not.breakfast:156   tea time    :169   Not.evening:197   Not.lunch:256  
+##                                                                          
+##                                                                          
+##                                                                          
+##                                                                          
+##                                                                          
+##         dinner           always          home           work    
+##  dinner    : 21   always    :103   home    :291   Not.work:213  
+##  Not.dinner:279   Not.always:197   Not.home:  9   work    : 87  
+##                                                                 
+##                                                                 
+##                                                                 
+##                                                                 
+##                                                                 
+##         tearoom           friends          resto          pub     
+##  Not.tearoom:242   friends    :196   Not.resto:221   Not.pub:237  
+##  tearoom    : 58   Not.friends:104   resto    : 79   pub    : 63  
+##                                                                   
+##                                                                   
+##                                                                   
+##                                                                   
+##                                                                   
+##         Tea         How           sugar                     how     
+##  black    : 74   alone:195   No.sugar:155   tea bag           :170  
+##  Earl Grey:193   lemon: 33   sugar   :145   tea bag+unpackaged: 94  
+##  green    : 33   milk : 63                  unpackaged        : 36  
+##                  other:  9                                          
+##                                                                     
+##                                                                     
+##                                                                     
+##                   where                 price          age        sex    
+##  chain store         :192   p_branded      : 95   Min.   :15.00   F:178  
+##  chain store+tea shop: 78   p_cheap        :  7   1st Qu.:23.00   M:122  
+##  tea shop            : 30   p_private label: 21   Median :32.00          
+##                             p_unknown      : 12   Mean   :37.05          
+##                             p_upscale      : 53   3rd Qu.:48.00          
+##                             p_variable     :112   Max.   :90.00          
+##                                                                          
+##            SPC               Sport       age_Q          frequency  
+##  employee    :59   Not.sportsman:121   +60  :38   +2/day     :127  
+##  middle      :40   sportsman    :179   15-24:92   1 to 2/week: 44  
+##  non-worker  :64                       25-34:69   1/day      : 95  
+##  other worker:20                       35-44:40   3 to 6/week: 34  
+##  senior      :35                       45-59:61                    
+##  student     :70                                                   
+##  workman     :12                                                   
+##              escape.exoticism           spirituality        healthy   
+##  escape-exoticism    :142     Not.spirituality:206   healthy    :210  
+##  Not.escape-exoticism:158     spirituality    : 94   Not.healthy: 90  
+##                                                                       
+##                                                                       
+##                                                                       
+##                                                                       
+##                                                                       
+##          diuretic             friendliness            iron.absorption
+##  diuretic    :174   friendliness    :242   iron absorption    : 31   
+##  Not.diuretic:126   Not.friendliness: 58   Not.iron absorption:269   
+##                                                                      
+##                                                                      
+##                                                                      
+##                                                                      
+##                                                                      
+##          feminine             sophisticated        slimming          exciting  
+##  feminine    :129   Not.sophisticated: 85   No.slimming:255   exciting   :116  
+##  Not.feminine:171   sophisticated    :215   slimming   : 45   No.exciting:184  
+##                                                                                
+##                                                                                
+##                                                                                
+##                                                                                
+##                                                                                
+##         relaxing              effect.on.health
+##  No.relaxing:113   effect on health   : 66    
+##  relaxing   :187   No.effect on health:234    
+##                                               
+##                                               
+##                                               
+##                                               
+## 
+
# lets work with some variables 
+keep_columns <- c("Tea", "How", "how", "sugar", "where", "lunch")
+
+
+# select the 'keep_columns' to create a new data set
+tea_time <- dplyr::select(tea, all_of(keep_columns))
+
+# look at the summaries and structure of the data
+summary(tea_time)
+
##         Tea         How                      how           sugar    
+##  black    : 74   alone:195   tea bag           :170   No.sugar:155  
+##  Earl Grey:193   lemon: 33   tea bag+unpackaged: 94   sugar   :145  
+##  green    : 33   milk : 63   unpackaged        : 36                 
+##                  other:  9                                          
+##                   where           lunch    
+##  chain store         :192   lunch    : 44  
+##  chain store+tea shop: 78   Not.lunch:256  
+##  tea shop            : 30                  
+## 
+
# visualize the data set
+pivot_longer(tea_time, cols = everything()) %>% 
+  ggplot(aes(value)) + facet_wrap("name", scales = "free", ncol=6) +
+  geom_bar() + theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 8))
+

+
+
+

5.7 Multiple Correspondence Analysis (MCA) with “tea” data set

+
# multiple correspondence analysis
+#library(FactoMineR), package is loaded above already, this just as note !! 
+
+mca <- MCA(tea_time, graph = FALSE)
+
+# summary of the model
+summary(mca)
+
## 
+## Call:
+## MCA(X = tea_time, graph = FALSE) 
+## 
+## 
+## Eigenvalues
+##                        Dim.1   Dim.2   Dim.3   Dim.4   Dim.5   Dim.6   Dim.7
+## Variance               0.279   0.261   0.219   0.189   0.177   0.156   0.144
+## % of var.             15.238  14.232  11.964  10.333   9.667   8.519   7.841
+## Cumulative % of var.  15.238  29.471  41.435  51.768  61.434  69.953  77.794
+##                        Dim.8   Dim.9  Dim.10  Dim.11
+## Variance               0.141   0.117   0.087   0.062
+## % of var.              7.705   6.392   4.724   3.385
+## Cumulative % of var.  85.500  91.891  96.615 100.000
+## 
+## Individuals (the 10 first)
+##                       Dim.1    ctr   cos2    Dim.2    ctr   cos2    Dim.3
+## 1                  | -0.298  0.106  0.086 | -0.328  0.137  0.105 | -0.327
+## 2                  | -0.237  0.067  0.036 | -0.136  0.024  0.012 | -0.695
+## 3                  | -0.369  0.162  0.231 | -0.300  0.115  0.153 | -0.202
+## 4                  | -0.530  0.335  0.460 | -0.318  0.129  0.166 |  0.211
+## 5                  | -0.369  0.162  0.231 | -0.300  0.115  0.153 | -0.202
+## 6                  | -0.369  0.162  0.231 | -0.300  0.115  0.153 | -0.202
+## 7                  | -0.369  0.162  0.231 | -0.300  0.115  0.153 | -0.202
+## 8                  | -0.237  0.067  0.036 | -0.136  0.024  0.012 | -0.695
+## 9                  |  0.143  0.024  0.012 |  0.871  0.969  0.435 | -0.067
+## 10                 |  0.476  0.271  0.140 |  0.687  0.604  0.291 | -0.650
+##                       ctr   cos2  
+## 1                   0.163  0.104 |
+## 2                   0.735  0.314 |
+## 3                   0.062  0.069 |
+## 4                   0.068  0.073 |
+## 5                   0.062  0.069 |
+## 6                   0.062  0.069 |
+## 7                   0.062  0.069 |
+## 8                   0.735  0.314 |
+## 9                   0.007  0.003 |
+## 10                  0.643  0.261 |
+## 
+## Categories (the 10 first)
+##                        Dim.1     ctr    cos2  v.test     Dim.2     ctr    cos2
+## black              |   0.473   3.288   0.073   4.677 |   0.094   0.139   0.003
+## Earl Grey          |  -0.264   2.680   0.126  -6.137 |   0.123   0.626   0.027
+## green              |   0.486   1.547   0.029   2.952 |  -0.933   6.111   0.107
+## alone              |  -0.018   0.012   0.001  -0.418 |  -0.262   2.841   0.127
+## lemon              |   0.669   2.938   0.055   4.068 |   0.531   1.979   0.035
+## milk               |  -0.337   1.420   0.030  -3.002 |   0.272   0.990   0.020
+## other              |   0.288   0.148   0.003   0.876 |   1.820   6.347   0.102
+## tea bag            |  -0.608  12.499   0.483 -12.023 |  -0.351   4.459   0.161
+## tea bag+unpackaged |   0.350   2.289   0.056   4.088 |   1.024  20.968   0.478
+## unpackaged         |   1.958  27.432   0.523  12.499 |  -1.015   7.898   0.141
+##                     v.test     Dim.3     ctr    cos2  v.test  
+## black                0.929 |  -1.081  21.888   0.382 -10.692 |
+## Earl Grey            2.867 |   0.433   9.160   0.338  10.053 |
+## green               -5.669 |  -0.108   0.098   0.001  -0.659 |
+## alone               -6.164 |  -0.113   0.627   0.024  -2.655 |
+## lemon                3.226 |   1.329  14.771   0.218   8.081 |
+## milk                 2.422 |   0.013   0.003   0.000   0.116 |
+## other                5.534 |  -2.524  14.526   0.197  -7.676 |
+## tea bag             -6.941 |  -0.065   0.183   0.006  -1.287 |
+## tea bag+unpackaged  11.956 |   0.019   0.009   0.000   0.226 |
+## unpackaged          -6.482 |   0.257   0.602   0.009   1.640 |
+## 
+## Categorical variables (eta2)
+##                      Dim.1 Dim.2 Dim.3  
+## Tea                | 0.126 0.108 0.410 |
+## How                | 0.076 0.190 0.394 |
+## how                | 0.708 0.522 0.010 |
+## sugar              | 0.065 0.001 0.336 |
+## where              | 0.702 0.681 0.055 |
+## lunch              | 0.000 0.064 0.111 |
+
# visualize MCA
+plot(mca, invisible=c("ind"), graph.type = "classic", habillage = "quali")
+

+
mca
+
## **Results of the Multiple Correspondence Analysis (MCA)**
+## The analysis was performed on 300 individuals, described by 6 variables
+## *The results are available in the following objects:
+## 
+##    name              description                       
+## 1  "$eig"            "eigenvalues"                     
+## 2  "$var"            "results for the variables"       
+## 3  "$var$coord"      "coord. of the categories"        
+## 4  "$var$cos2"       "cos2 for the categories"         
+## 5  "$var$contrib"    "contributions of the categories" 
+## 6  "$var$v.test"     "v-test for the categories"       
+## 7  "$ind"            "results for the individuals"     
+## 8  "$ind$coord"      "coord. for the individuals"      
+## 9  "$ind$cos2"       "cos2 for the individuals"        
+## 10 "$ind$contrib"    "contributions of the individuals"
+## 11 "$call"           "intermediate results"            
+## 12 "$call$marge.col" "weights of columns"              
+## 13 "$call$marge.li"  "weights of rows"
+
plotellipses(mca)
+

+I have only chosen selected variables here. From the selected categories, in the category "where", chain store and tea shop seem to be favored. Likewise, in the category "How", milk, alone and other (undefined) seemed preferred. Also, in the category "how", tea bag and unpackaged seem to be preferred.

+
+
+ + + + +
+ + + + + + + + + + + + + + + diff --git a/chapter6.Rmd b/chapter6.Rmd new file mode 100644 index 000000000..0e78174da --- /dev/null +++ b/chapter6.Rmd @@ -0,0 +1,379 @@ +# Week 6: Analysis of longitudinal data + +After working hard with multivariate, mostly exploratory, even heuristic techniques that are common in data science, the last topic of IODS course will take us back in the task of building statistical models. + +The new challenge is that the data will include two types of dependencies simultaneously: In addition to the correlated variables that we have faced with all models and methods so far, the observations of the data will also be correlated with each other. + +**Lets start this weeks session** + +Usually, we can assume that the observations are not correlated - instead, they are assumed to be independent of each other. However, in **longitudinal settings** this assumption seldom holds, because we have multiple observations or measurements of the same individuals. The concept of repeated measures highlights this phenomenon that is actually quite typical in many applications. Both types of dependencies (variables and observations) must be taken into account; otherwise the models will be biased. + +```{r echo=FALSE, out.width= "50%"} +knitr::include_graphics("long2.jpg", error = FALSE) +``` + +To analyze this kind of data sets, we will focus on a single class of methods, called **linear mixed effects models** that can cope quite nicely with the setting described above. + +Before we consider two examples of mixed models, namely the random intercept model and the random intercept and slope model, we will learn how to wrangle longitudinal data in wide form and long form, take a look at some graphical displays of longitudinal data, and try a simple summary measure approach that may sometimes provide a useful first step in these challenges. 
In passing, we “violently” apply the usual “fixed” models (although we know that they are not the right choice here) in order to compare the results and see the consequences of making invalid assumptions. + +**Load the packages first !! ** + +## 6.1 Packages for Week 6 !! + +```{r} +#load required packages +library(ggplot2) +library(corrplot) +library(tidyverse) +library(GGally) +library(dplyr) +library(stringr) +library(psych) +library(FactoMineR) +library(lme4) +``` + +That's a lot of packages !!! + +```{r echo=FALSE, out.width= "40%"} +knitr::include_graphics("long1.jpg", error = FALSE) +``` + +## Analysis of RATS data + +Lets implement the analyses of Chapter 8 of MABS, using the R codes of Exercise Set 6: Meet and Repeat: PART I but using the RATS data (from Chapter 9 and Meet and Repeat: PART II) as instructed in the Moodle. + +### 6.2 Loading the data + +```{r} +# read long format data of rats +RATSL <- read.csv('RATSL.csv') + +# Lets convert categorical data to factors first +## WE have ID and Group (Just like wrangling exercise) +RATSL$ID <- factor(RATSL$ID) +RATSL$Group <- factor(RATSL$Group) + +# glimpse and dimensions +head(RATSL);dim(RATSL) +str(RATSL) +summary(RATSL) +``` +The data set contains observation from 6 rats and 11 observation of change in weight by Time. They are divided into 3 groups based on treatment. Weight in this case is the outcome variable in this longitudinal study. The idea is to analyse the weight difference in three group over time. 
+ +### 6.3 Plotting the data + +```{r} +ggplot(RATSL, aes(x = Time, y = Weight, group = ID)) + + geom_line(aes(linetype = Group))+ + scale_x_continuous(name = "Time (days)", breaks = seq(0, 60, 10))+ + scale_y_continuous(name = "Weight (grams)")+ + theme(legend.position = "top") +``` + + +```{r} + +# Draw the plot ##We plot the Weights of each rat by time and groups +# Rats are divided into three groups +ggplot(RATSL, aes(x = Time, y = Weight, linetype = Group, color = ID)) + + geom_line() + + scale_linetype_manual(values = rep(1:10, times=4)) + + facet_grid(. ~ Group, labeller = label_both) + + theme(legend.position = "none") + + scale_y_continuous(limits = c(min(RATSL$Weight), max(RATSL$Weight))) +``` +From the plot we can see that Group 1 has the most rats with lowest weight even at starting point (time of recruitment). Group 2 the most incremental weight outcome compare to baseline but has only 4 rats. Group 3 has also 4 rats with almost same weight range as Group 2, however the weight doesn't seem to increase significantly as group 2. + +### 6.4 Standardizing for tracking + +Higher baseline values means higher values throughout the study.This phenomenon is generally referred to as tracking. + +The tracking phenomenon can be seen more clearly in a plot of the standardized values of each +observation, i.e., + +$$standardised(x) = \frac{x - mean(x)}{ sd(x)}$$ + +```{r} + +# Standardize the variable weight by groups +RATSL <- RATSL %>% + group_by(Time) %>% + mutate(Weight_std = (scale(Weight))) %>% + ungroup() + +# Glimpse the data +glimpse(RATSL) +head(RATSL) + +# Plot again with the standardized weight in RATSL +ggplot(RATSL, aes(x = Time, y = Weight_std, linetype = Group, color =ID)) + + geom_line() + + scale_linetype_manual(values = rep(1:10, times=4)) + + facet_grid(. 
~ Group, labeller = label_both) + + theme(legend.position = "none") + scale_y_continuous(name = "standardized Weight") +``` + +The weight difference looks similar now after the standardization, + +### 6.5 Summary graphs + +With large numbers of observations, graphical displays of individual response profiles are of little use and investigators then commonly produce graphs showing average (mean) profiles for each treatment group along with some indication of the variation of the observations at each time point, in this case the standard error of mean + +$$se = \frac{sd(x)}{\sqrt{n}}$$ +```{r} +# Summary data with mean and standard error of RATSl Weight by group and time +RATSS <- RATSL %>% + group_by(Group, Time) %>% + summarise(mean = mean(Weight), se = (sd(Weight)/sqrt(length(Weight))) ) %>% #using formula above; + ungroup() + +# Glimpse the data +glimpse(RATSL) + +# Plot the mean profiles +ggplot(RATSS, aes(x = Time, y = mean, color=Group, linetype = Group, shape = Group)) + + geom_line() + + scale_linetype_manual(values = c(1,2,3)) + + geom_point(size=3) + + scale_shape_manual(values = c(1,2,3)) + + geom_errorbar(aes(ymin=mean-se, ymax=mean+se, linetype="1"), width=0.3) + + theme(legend.position = "right") + + scale_y_continuous(name = "mean(Weight) +/- se(Weight)") +``` +From the plot we can see ,All groups are independent and doesn't seem to overlap with each other. There is a signifiant difference in Group 1 compared to Group 2 and Group 3. It is also clear that the weight of the rat seems to increase over time (observation) with a significant increase in Group 2 and 3. + +### 6.6 Find the outlier using summary measure approach + +Using the summary measure approach we will look into the post treatment values of the RATSL data set. Lets look into the mean weight for each rat. The mean of weeks will be our summary measure and we'll plot boxplots of the mean for each diet group which is our treatment measure. 
+ +```{r} + +# Create a summary data by treatment and subject with mean as the summary variable (ignoring baseline week 0) +RATSL8S <- RATSL %>% + filter(Time > 0) %>% + group_by(Group, ID) %>% + summarise(Weight_mean = mean(Weight)) %>% + ungroup() + +# Glimpse the data +glimpse(RATSL8S) + +# Draw a boxplot of the mean versus treatment + +ggplot(RATSL8S, aes(x = Group, y = Weight_mean, color = Group)) + + geom_boxplot() + + stat_summary(fun = "mean", geom = "point", shape=23, size=4, fill = "white") + + scale_y_continuous(name = "mean(Weight), Days 1-60") + +``` +From the box plot, we can see all three groups has outliers. Group 2 has a large one making the uneven distribution. The next step is to find and filter the outliers identified above. + +```{r} +# define outlier from group 3 +g3 <- RATSL8S %>% filter(Group==3) +out3 <- min(g3$Weight_mean) + +# Create a new data by filtering the outliers +RATSL8S2 <- RATSL8S %>% + filter(250 < Weight_mean & Weight_mean < 560 & Weight_mean != out3) + + +# Draw a boxplot of the mean versus diet +ggplot(RATSL8S2, aes(x = Group, y = Weight_mean, col=Group)) + + geom_boxplot() + + stat_summary(fun = "mean", geom = "point", shape=23, size=4, fill = "white") + + scale_y_continuous(name = "mean(Weight) by days") +``` +### 6.7 T-test and ANOVA + +Although the informal graphical material presented up to now has all indicated a lack of difference in the two treatment groups, most investigators would still require a formal test for a difference. Consequently we shall now apply a t-test to assess any difference between the treatment groups, and also calculate a confidence interval for this difference. We use the data without the **outlier** created above. The t-test confirms the lack of any evidence for a group difference. Also the 95% confidence interval is wide and includes the zero, allowing for similar conclusions to be made. 
+However, T-test only tests for a statistical difference between two groups and in the dataset above we have 3 corresponding groups to be compared, we will therefore use a more stringent and diverse test ANOVA which compares differences among multiple groups. ANOVA assumes homogeniety of variance-the variance in the groups 1-3 should be similar + +```{r} +# Load original wide form rats data +RATSL <- as_tibble(read.table("https://raw.githubusercontent.com/KimmoVehkalahti/MABS/master/Examples/data/rats.txt", header = TRUE, sep = '\t')) +RATSL$ID <- factor(RATSL$ID) + +# Add the baseline from the original data as a new variable to the summary data +join_vars <- c("ID","WD1") +RATSL8S3 <- RATSL8S2 %>% + left_join(RATSL[join_vars], by='ID') +# Rename column +RATSL8S3 <- RATSL8S3 %>% + rename('Weight_baseline' = 'WD1') + +# Fit the linear model with the mean Weight as the response +fit2 <- lm(Weight_mean ~ Weight_baseline + Group, data = RATSL8S3) + +# Compute the analysis of variance table for the fitted model with anova() +anova(fit2) +``` +Looking at the ANOVA table, p-values < 0.05 considering a significance level p = 0.05 at 95% CI. There seem to be significant difference between the groups. The data however doesn't tell us much about differences between which groups, i.e., multiple comparison. Usually data which follows the normal distribution curve are analysed with ANOVA followed by tukey test for multiple comparison. However in case of data which doesn't follow the normal distribution curve, Kruskal Wallis followed by Dunn's test for multiple comparison is conducted. Now assuming our data as normally distributed as we have been doing in this exercise, we can perform a tukey's test for multiple comparison. + +## Analysis of BPRS data + +Lets implement Implement the analyses of Chapter 9 of MABS, using the R codes of Exercise Set 6: Meet and Repeat: PART II, but using the BPRS data (from Chapter 8 and Meet and Repeat: PART I) as instructed in the Moodle. 
+ +BPRS data includes data pertaining to a brief psychiatric rating scale (BPRS) score prior to treatment and BPRS from 8 weeks during treatment. The patients (n=40) have been randomly assigned to treatment arm 1 or 2 and we are interested whether there is a difference in BPRS scores depending on the received treatment. A lower score means less symptoms. + +### 6.8 Loading the data + +Lets load and explore the data first + +```{r} +BPRSL <- read.table("BPRSL.csv", header = T, sep=",") + +# Factor treatment & subject +BPRSL$treatment <- factor(BPRSL$treatment) +BPRSL$subject <- factor(BPRSL$subject) + + +# Glimpse the data + +glimpse(BPRSL) + + +str(BPRSL) +summary(BPRSL) +glimpse(BPRSL) +``` +BPRSL data set has 360 observations and 6 variable. From the glimpse function, we can see the 6 columns, in which two treatment arms are coded 1 and 2 for treatment 1 and treatment 2. Subjects are coded from 1 to 20 however the repetition of same code in subjects suggests that participants were randomized to Treatment 1 or 2. + +```{r} +#Lets plot the data +ggplot(BPRSL, aes(x = week, y = bprs, linetype = subject)) + + geom_line() + + scale_linetype_manual(values = rep(1:10, times=4)) + + facet_grid(. ~ treatment, labeller = label_both) + + theme(legend.position = "none") + + scale_y_continuous(limits = c(min(BPRSL$bprs), max(BPRSL$bprs))) +``` +From the plot, it appears that BPRS seems to decrease during the treatment period in both treatment arms. A clear diffrerence cannot be validated however between groups from the plots. + +### 6.9 Linear Mixed Effects Models + +```{r} +# lets create a regression model for BPRSL +BPRS_reg <- lm(bprs ~ week + treatment, data = BPRSL) + +# print summary of the model +summary(BPRS_reg) +``` +We have BPRS score as our target variable and time (weeks) and treatment 1 and 2 as our explanatory variable. From the summary model, week variable seem to be statistically significant with BPRS but treatment variable doesn't (p = 0.661). 
No significant difference can be seen in the difference in BPRS based on treatments +However this analyses assumes independence of observations, i.e., the observation or outcome is not affected by any other confounder and is completely influenced by the explanatory variable which is not very rational. Therefore we now move on to a more stringent analyses which assumes observations ad dependent variable and can be influence by other effect. We will analyse the data set with both **Fixed-effect models and Random-effect models**. + +```{r} +#lets load the package +library(lme4) + +# Create a random intercept model +BPRS_ref <- lmer(bprs ~ week + treatment + (1 | subject), data = BPRSL, REML = FALSE) + +# Print the summary of the model +summary(BPRS_ref) +confint(BPRS_ref) +``` +Lets first reflect on our BPRS data set. Our maxima and minima is 95 and 18 respectively. Given the high variance, the baseline seems to differ from the outcomes. + +Let’s fit the random intercept and random slope model to our data: + +### 6.10 Random Intercept and Random Slope Model + +`Fitting a random intercept and random slope model allows the linear regression fits for each individual to differ in intercept but also in slope. This allows us to account for the individual differences in the individuals symptom (BRPS score) profiles, but also the effect of time.` + +```{r} +# create a random intercept and random slope model +BPRS_ref1 <- lmer(bprs ~ week * treatment + (week | subject), data = BPRSL, REML = FALSE) + +# print a summary of the model + +summary(BPRS_ref1) +confint((BPRS_ref1)) +``` +```{r} +# perform an ANOVA test on the two models to assess formal differences between them +anova(BPRS_ref1, BPRS_ref) +``` + +ANOVA test seems significant (p < 0.05). Addition of slope definitely increases the model fit. Its clear that addition of a random intercept model increased the inter individual variance. Treatment group seems unaffected over time. 
Lets see the slope model and see the outcomes. + +### 6.11 Random Intercept and Random Slope Model with interaction + +```{r} +BPRS_ref2 <- lmer(bprs ~ week * treatment + (week | subject), data = BPRSL, REML = FALSE) + +# print a summary of the model + +summary(BPRS_ref2) +``` +```{r} +# perform an ANOVA test on the two models +anova(BPRS_ref2, BPRS_ref1) +``` +Finally, looking at the model, the two model and outcomes seems similar. Comparing the ANOVA test conforms there isn't much significant difference between them. Adding the interaction variable as mentioned above in model 1 doesn't seem to work out as the model didn't change and the significance level hasn't changed either. + +### 6.12 Plotting the observed BPRS values and the fitted BPRS values + +```{r} +# draw the plot of BPRSL with the observed BPRS values +ggplot(BPRSL, aes(x = week, y = bprs, group = subject, col= treatment)) + + geom_line(aes(linetype = treatment)) + + scale_x_continuous(name = "Time (weeks)", breaks = seq(0, 4, 8)) + + scale_y_continuous(name = "Observed BPRS") + + theme(legend.position = "right") + + facet_grid(. ~ treatment, labeller=label_both) +``` +```{r} +# Create a vector of the fitted values +Fitted <- fitted(BPRS_ref) +Fitted1 <- fitted(BPRS_ref1) +Fitted2 <- fitted(BPRS_ref2) + +# Create a new column fitted to BPRSL +BPRSL <- BPRSL %>% mutate(bprs_fitted_values_BPRSL_ref = Fitted, bprs_fitted_values_BPRSL_ref1 = Fitted1, bprs_fitted_values_BPRSL_ref2 = Fitted2) +head(BPRSL) +``` +```{r} + # draw the plot of BPRSL with the Fitted values of bprs model 1 +ggplot(BPRSL, aes(x = week, y = bprs_fitted_values_BPRSL_ref, group = subject, col=treatment)) + + geom_line(aes(linetype = treatment)) + + scale_x_continuous(name = "Time (weeks)", breaks = seq(0, 4, 8)) + + scale_y_continuous(name = "Fitted BPRS (model 1: rnd intercept)") + + theme(legend.position = "right") + + facet_grid(. 
~ treatment, labeller=label_both) +``` +```{r} +# draw the plot of BPRSL with the Fitted values of bprs model 2 +ggplot(BPRSL, aes(x = week, y = bprs_fitted_values_BPRSL_ref1, group = subject, col=treatment)) + + geom_line(aes(linetype = treatment)) + + scale_x_continuous(name = "Time (weeks)", breaks = seq(0, 4, 8)) + + scale_y_continuous(name = "Fitted BPRS (model 2: rnd intercept + slope)") + + theme(legend.position = "right") + + facet_grid(. ~ treatment, labeller=label_both) +``` +From the plot, we can see random intercept model differs from random intercept and slope model. Adding a slope intercept didn't change the outcomes however. We can also see the final plot random intercept and slope with interaction model also different from all three model above. In conclusion we can say that random intercept model doesn't highlight the individual's effect on bprs over time and also the outcomes didn't change with subsequent model. + +******************************************************************* END ****************************************************************** + +**DONE AND DUSTED** $\large \surd$ !!! + +```{r echo=FALSE, out.width= "20%"} +knitr::include_graphics("santa.jpg", error = FALSE) +``` + + +**Well not really !!** There is so much more to learn. This course was however a great "push" +I have truly enjoyed the exercise sessions, the course material and the review exercises. I have learned so much and I am looking forward to learning more !! + +**Merry Christmas and a Happy New Year Everyone !!** + +```{r echo=FALSE, out.width= "100%"} +knitr::include_graphics("newyear.jpg", error = FALSE) +``` + + + + + diff --git a/chapter6.html b/chapter6.html new file mode 100644 index 000000000..612f6cd07 --- /dev/null +++ b/chapter6.html @@ -0,0 +1,1108 @@ + + + + + + + + + + + + + +chapter6.knit + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + + + + +
+

Week 6: Analysis of longitudinal data

+

After working hard with multivariate, mostly exploratory, even +heuristic techniques that are common in data science, the last topic of +IODS course will take us back in the task of building statistical +models.

+

The new challenge is that the data will include two types of +dependencies simultaneously: In addition to the correlated variables +that we have faced with all models and methods so far, the observations +of the data will also be correlated with each other.

+

Lets start this weeks session

+

Usually, we can assume that the observations are not correlated - +instead, they are assumed to be independent of each other. However, in +longitudinal settings this assumption seldom holds, +because we have multiple observations or measurements of the same +individuals. The concept of repeated measures highlights this phenomenon +that is actually quite typical in many applications. Both types of +dependencies (variables and observations) must be taken into account; +otherwise the models will be biased.

+

+

To analyze this kind of data sets, we will focus on a single class of +methods, called linear mixed effects models that can +cope quite nicely with the setting described above.

+

Before we consider two examples of mixed models, namely the random +intercept model and the random intercept and slope model, we will learn +how to wrangle longitudinal data in wide form and long form, take a look +at some graphical displays of longitudinal data, and try a simple +summary measure approach that may sometimes provide a useful first step +in these challenges. In passing, we “violently” apply the usual “fixed” +models (although we know that they are not the right choice here) in +order to compare the results and see the consequences of making invalid +assumptions.

+

Load the packages first !!

+
+

6.1 Packages for Week 6 !!

+
#load required packages
+library(ggplot2)
+library(corrplot)
+
## corrplot 0.92 loaded
+
library(tidyverse)
+
## ── Attaching packages
+## ───────────────────────────────────────
+## tidyverse 1.3.2 ──
+
## ✔ tibble  3.1.8     ✔ dplyr   1.0.9
+## ✔ tidyr   1.2.0     ✔ stringr 1.4.1
+## ✔ readr   2.1.2     ✔ forcats 0.5.2
+## ✔ purrr   0.3.4     
+## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
+## ✖ dplyr::filter() masks stats::filter()
+## ✖ dplyr::lag()    masks stats::lag()
+
library(GGally)
+
## Registered S3 method overwritten by 'GGally':
+##   method from   
+##   +.gg   ggplot2
+
library(dplyr)
+library(stringr)
+library(psych) 
+
## 
+## Attaching package: 'psych'
+## 
+## The following objects are masked from 'package:ggplot2':
+## 
+##     %+%, alpha
+
library(FactoMineR)
+library(lme4)
+
## Loading required package: Matrix
+## 
+## Attaching package: 'Matrix'
+## 
+## The following objects are masked from 'package:tidyr':
+## 
+##     expand, pack, unpack
+

That’s a lot of packages !!!

+

+
+
+

Analysis of RATS data

+

Lets implement the analyses of Chapter 8 of MABS, using the R codes +of Exercise Set 6: Meet and Repeat: PART I but using the RATS data (from +Chapter 9 and Meet and Repeat: PART II) as instructed in the Moodle.

+
+

6.2 Loading the data

+
# read long format data of rats
+RATSL <- read.csv('RATSL.csv')
+
+# Lets convert categorical data to factors first 
+## WE have ID and Group (Just like wrangling exercise)
+RATSL$ID <- factor(RATSL$ID)
+RATSL$Group <- factor(RATSL$Group)
+
+# glimpse and dimensions
+head(RATSL);dim(RATSL)
+
##   ID Group  WD Weight Time
+## 1  1     1 WD1    240    1
+## 2  2     1 WD1    225    1
+## 3  3     1 WD1    245    1
+## 4  4     1 WD1    260    1
+## 5  5     1 WD1    255    1
+## 6  6     1 WD1    260    1
+
## [1] 176   5
+
str(RATSL)
+
## 'data.frame':    176 obs. of  5 variables:
+##  $ ID    : Factor w/ 16 levels "1","2","3","4",..: 1 2 3 4 5 6 7 8 9 10 ...
+##  $ Group : Factor w/ 3 levels "1","2","3": 1 1 1 1 1 1 1 1 2 2 ...
+##  $ WD    : chr  "WD1" "WD1" "WD1" "WD1" ...
+##  $ Weight: int  240 225 245 260 255 260 275 245 410 405 ...
+##  $ Time  : int  1 1 1 1 1 1 1 1 1 1 ...
+
summary(RATSL)
+
##        ID      Group       WD                Weight           Time      
+##  1      : 11   1:88   Length:176         Min.   :225.0   Min.   : 1.00  
+##  2      : 11   2:44   Class :character   1st Qu.:267.0   1st Qu.:15.00  
+##  3      : 11   3:44   Mode  :character   Median :344.5   Median :36.00  
+##  4      : 11                             Mean   :384.5   Mean   :33.55  
+##  5      : 11                             3rd Qu.:511.2   3rd Qu.:50.00  
+##  6      : 11                             Max.   :628.0   Max.   :64.00  
+##  (Other):110
+

The data set contains observations from 16 rats, each with 11 observations of +change in weight over Time. They are divided into 3 groups based on +treatment. Weight in this case is the outcome variable in this +longitudinal study. The idea is to analyse the weight difference in the +three groups over time.

+
+
+

6.3 Plotting the data

+
ggplot(RATSL, aes(x = Time, y = Weight, group = ID)) +
+  geom_line(aes(linetype = Group))+
+  scale_x_continuous(name = "Time (days)", breaks = seq(0, 60, 10))+
+  scale_y_continuous(name = "Weight (grams)")+
+  theme(legend.position = "top")
+

+
# Draw the plot ##We plot the Weights of each rat by time and groups
+# Rats are divided into three groups
+ggplot(RATSL, aes(x = Time, y = Weight, linetype = Group, color = ID)) +
+  geom_line() +
+  scale_linetype_manual(values = rep(1:10, times=4)) +
+  facet_grid(. ~ Group, labeller = label_both) +
+  theme(legend.position = "none") + 
+  scale_y_continuous(limits = c(min(RATSL$Weight), max(RATSL$Weight)))
+

+From the plot we can see that Group 1 has the most rats with lowest +weight even at starting point (time of recruitment). Group 2 has the most +incremental weight outcome compared to baseline but has only 4 rats. +Group 3 also has 4 rats with almost the same weight range as Group 2; +however, the weight doesn’t seem to increase as significantly as in Group +2.

+
+
+

6.4 Standardizing for tracking

+

Higher baseline values means higher values throughout the study.This +phenomenon is generally referred to as tracking.

+

The tracking phenomenon can be seen more clearly in a plot of the +standardized values of each observation, i.e.,

+

\[standardised(x) = \frac{x - mean(x)}{ +sd(x)}\]

+
# Standardize the variable weight by groups
+RATSL <- RATSL %>%
+  group_by(Time) %>%
+  mutate(Weight_std = (scale(Weight))) %>%
+  ungroup()
+
+# Glimpse the data
+glimpse(RATSL)
+
## Rows: 176
+## Columns: 6
+## $ ID         <fct> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 1, 2…
+## $ Group      <fct> 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 1, 1, 1, 1,…
+## $ WD         <chr> "WD1", "WD1", "WD1", "WD1", "WD1", "WD1", "WD1", "WD1", "WD…
+## $ Weight     <int> 240, 225, 245, 260, 255, 260, 275, 245, 410, 405, 445, 555,…
+## $ Time       <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 8, 8, 8, 8,…
+## $ Weight_std <dbl[,1]> <matrix[26 x 1]>
+
head(RATSL)
+
## # A tibble: 6 × 6
+##   ID    Group WD    Weight  Time Weight_std[,1]
+##   <fct> <fct> <chr>  <int> <int>          <dbl>
+## 1 1     1     WD1      240     1         -1.00 
+## 2 2     1     WD1      225     1         -1.12 
+## 3 3     1     WD1      245     1         -0.961
+## 4 4     1     WD1      260     1         -0.842
+## 5 5     1     WD1      255     1         -0.882
+## 6 6     1     WD1      260     1         -0.842
+
# Plot again with the standardized weight in RATSL 
+ggplot(RATSL, aes(x = Time, y = Weight_std, linetype = Group, color =ID)) +
+  geom_line() +
+  scale_linetype_manual(values = rep(1:10, times=4)) +
+  facet_grid(. ~ Group, labeller = label_both) +
+  theme(legend.position = "none")
+

+
  scale_y_continuous(name = "standardized Weight")
+
## <ScaleContinuousPosition>
+##  Range:  
+##  Limits:    0 --    1
+

The weight difference looks similar now after the +standardization.

+
+
+

6.5 Summary graphs

+

With large numbers of observations, graphical displays of individual +response profiles are of little use and investigators then commonly +produce graphs showing average (mean) profiles for each treatment group +along with some indication of the variation of the observations at each +time point, in this case the standard error of mean

+

\[se = \frac{sd(x)}{\sqrt{n}}\]

+
# Summary data with mean and standard error of RATSl Weight by group and time
+RATSS <- RATSL %>%
+  group_by(Group, Time) %>%
+  summarise(mean = mean(Weight), se = (sd(Weight)/sqrt(length(Weight))) ) %>% #using formula above;
+  ungroup()
+
## `summarise()` has grouped output by 'Group'. You can override using the
+## `.groups` argument.
+
# Glimpse the data
+glimpse(RATSL)
+
## Rows: 176
+## Columns: 6
+## $ ID         <fct> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 1, 2…
+## $ Group      <fct> 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 1, 1, 1, 1,…
+## $ WD         <chr> "WD1", "WD1", "WD1", "WD1", "WD1", "WD1", "WD1", "WD1", "WD…
+## $ Weight     <int> 240, 225, 245, 260, 255, 260, 275, 245, 410, 405, 445, 555,…
+## $ Time       <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 8, 8, 8, 8,…
+## $ Weight_std <dbl[,1]> <matrix[26 x 1]>
+
# Plot the mean profiles
+ggplot(RATSS, aes(x = Time, y = mean, color=Group, linetype = Group, shape = Group)) +
+  geom_line() +
+  scale_linetype_manual(values = c(1,2,3)) +
+  geom_point(size=3) +
+  scale_shape_manual(values = c(1,2,3)) +
+  geom_errorbar(aes(ymin=mean-se, ymax=mean+se, linetype="1"), width=0.3) +
+  theme(legend.position = "right") +
+  scale_y_continuous(name = "mean(Weight) +/- se(Weight)")
+

+From the plot we can see that all groups are independent and don’t seem to +overlap with each other. There is a significant difference in Group 1 +compared to Group 2 and Group 3. It is also clear that the weight of the +rats seems to increase over time (observation), with a significant +increase in Groups 2 and 3.

+
+
+

6.6 Find the outlier using summary measure approach

+

Using the summary measure approach we will look into the post +treatment values of the RATSL data set. Lets look into the mean weight +for each rat. The mean of weeks will be our summary measure and we’ll +plot boxplots of the mean for each diet group which is our treatment +measure.

+
# Create a summary data by treatment and subject with mean as the summary variable (ignoring baseline week 0)
+RATSL8S <- RATSL %>%
+  filter(Time > 0) %>%
+  group_by(Group, ID) %>%
+  summarise(Weight_mean = mean(Weight)) %>%
+  ungroup()
+
## `summarise()` has grouped output by 'Group'. You can override using the
+## `.groups` argument.
+
# Glimpse the data
+glimpse(RATSL8S)
+
## Rows: 16
+## Columns: 3
+## $ Group       <fct> 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3
+## $ ID          <fct> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
+## $ Weight_mean <dbl> 261.0909, 237.6364, 260.1818, 266.5455, 269.4545, 274.7273…
+
# Draw a boxplot of the mean versus treatment
+
+ggplot(RATSL8S, aes(x = Group, y = Weight_mean, color = Group)) +
+  geom_boxplot() +
+  stat_summary(fun = "mean", geom = "point", shape=23, size=4, fill = "white") +
+  scale_y_continuous(name = "mean(Weight), Days 1-60")
+

+From the box plot, we can see all three groups have outliers. Group 2 has +a large one, making the distribution uneven. The next step is to find and +filter the outliers identified above.

+
# define outlier from group 3
+g3 <- RATSL8S %>% filter(Group==3)
+out3 <- min(g3$Weight_mean)
+
+# Create a new data by filtering the outliers
+RATSL8S2 <- RATSL8S %>%
+  filter(250 < Weight_mean & Weight_mean < 560 & Weight_mean != out3)
+
+
+# Draw a boxplot of the mean versus diet
+ggplot(RATSL8S2, aes(x = Group, y = Weight_mean, col=Group)) +
+  geom_boxplot() +
+  stat_summary(fun = "mean", geom = "point", shape=23, size=4, fill = "white") +
+  scale_y_continuous(name = "mean(Weight) by days")
+

+### 6.7 T-test and ANOVA

+

Although the informal graphical material presented up to now has all +indicated a lack of difference in the two treatment groups, most +investigators would still require a formal test for a difference. +Consequently we shall now apply a t-test to assess any difference +between the treatment groups, and also calculate a confidence interval +for this difference. We use the data without the +outlier created above. The t-test confirms the lack of +any evidence for a group difference. Also the 95% confidence interval is +wide and includes zero, allowing for similar conclusions to be made. +However, a t-test only tests for a statistical difference between two +groups, and in the dataset above we have 3 corresponding groups to be +compared; we will therefore use a more stringent and diverse test, ANOVA, +which compares differences among multiple groups. ANOVA assumes +homogeneity of variance — the variance in groups 1-3 should be +similar.

+
# Load original wide form rats data
+RATSL <- as_tibble(read.table("https://raw.githubusercontent.com/KimmoVehkalahti/MABS/master/Examples/data/rats.txt", header = TRUE, sep = '\t'))
+RATSL$ID <- factor(RATSL$ID)
+
+# Add the baseline from the original data as a new variable to the summary data
+join_vars <- c("ID","WD1")
+RATSL8S3 <- RATSL8S2 %>%
+  left_join(RATSL[join_vars], by='ID') 
+# Rename column
+RATSL8S3 <- RATSL8S3 %>%
+  rename('Weight_baseline' = 'WD1')
+
+# Fit the linear model with the mean Weight as the response 
+fit2 <- lm(Weight_mean ~ Weight_baseline + Group, data = RATSL8S3)
+
+# Compute the analysis of variance table for the fitted model with anova()
+anova(fit2)
+
## Analysis of Variance Table
+## 
+## Response: Weight_mean
+##                 Df Sum Sq Mean Sq  F value    Pr(>F)    
+## Weight_baseline  1 174396  174396 7999.137 1.384e-14 ***
+## Group            2   1707     853   39.147 3.628e-05 ***
+## Residuals        9    196      22                       
+## ---
+## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
+

Looking at the ANOVA table, p-values < 0.05 considering a +significance level p = 0.05 at 95% CI. There seem to be significant +difference between the groups. The data however doesn’t tell us much +about differences between which groups, i.e., multiple comparison. +Usually data which follows the normal distribution curve are analysed +with ANOVA followed by tukey test for multiple comparison. However in +case of data which doesn’t follow the normal distribution curve, Kruskal +Wallis followed by Dunn’s test for multiple comparison is conducted. Now +assuming our data as normally distributed as we have been doing in this +exercise, we can perform a tukey’s test for multiple comparison.

+
+
+
+

Analysis of BPRS data

+

Let’s implement the analyses of Chapter 9 of MABS, using the +R codes of Exercise Set 6: Meet and Repeat: PART II, but using the BPRS +data (from Chapter 8 and Meet and Repeat: PART I) as instructed in the +Moodle.

+

BPRS data includes data pertaining to a brief psychiatric rating +scale (BPRS) score prior to treatment and BPRS from 8 weeks during +treatment. The patients (n=40) have been randomly assigned to treatment +arm 1 or 2 and we are interested whether there is a difference in BPRS +scores depending on the received treatment. A lower score means less +symptoms.

+
+

6.8 Loading the data

+

Lets load and explore the data first

+
BPRSL <- read.table("BPRSL.csv", header = T, sep=",")
+
+# Factor treatment & subject
+BPRSL$treatment <- factor(BPRSL$treatment)
+BPRSL$subject <- factor(BPRSL$subject)
+
+
+# Glimpse the data
+
+glimpse(BPRSL)
+
## Rows: 360
+## Columns: 6
+## $ X         <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 1…
+## $ treatment <fct> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
+## $ subject   <fct> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 1…
+## $ weeks     <chr> "week0", "week0", "week0", "week0", "week0", "week0", "week0…
+## $ bprs      <int> 42, 58, 54, 55, 72, 48, 71, 30, 41, 57, 30, 55, 36, 38, 66, …
+## $ week      <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
+
str(BPRSL)
+
## 'data.frame':    360 obs. of  6 variables:
+##  $ X        : int  1 2 3 4 5 6 7 8 9 10 ...
+##  $ treatment: Factor w/ 2 levels "1","2": 1 1 1 1 1 1 1 1 1 1 ...
+##  $ subject  : Factor w/ 20 levels "1","2","3","4",..: 1 2 3 4 5 6 7 8 9 10 ...
+##  $ weeks    : chr  "week0" "week0" "week0" "week0" ...
+##  $ bprs     : int  42 58 54 55 72 48 71 30 41 57 ...
+##  $ week     : int  0 0 0 0 0 0 0 0 0 0 ...
+
summary(BPRSL)
+
##        X          treatment    subject       weeks                bprs      
+##  Min.   :  1.00   1:180     1      : 18   Length:360         Min.   :18.00  
+##  1st Qu.: 90.75   2:180     2      : 18   Class :character   1st Qu.:27.00  
+##  Median :180.50             3      : 18   Mode  :character   Median :35.00  
+##  Mean   :180.50             4      : 18                      Mean   :37.66  
+##  3rd Qu.:270.25             5      : 18                      3rd Qu.:43.00  
+##  Max.   :360.00             6      : 18                      Max.   :95.00  
+##                             (Other):252                                     
+##       week  
+##  Min.   :0  
+##  1st Qu.:2  
+##  Median :4  
+##  Mean   :4  
+##  3rd Qu.:6  
+##  Max.   :8  
+## 
+
glimpse(BPRSL)
+
## Rows: 360
+## Columns: 6
+## $ X         <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 1…
+## $ treatment <fct> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
+## $ subject   <fct> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 1…
+## $ weeks     <chr> "week0", "week0", "week0", "week0", "week0", "week0", "week0…
+## $ bprs      <int> 42, 58, 54, 55, 72, 48, 71, 30, 41, 57, 30, 55, 36, 38, 66, …
+## $ week      <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
+

BPRSL data set has 360 observations and 6 variable. From the glimpse +function, we can see the 6 columns, in which two treatment arms are +coded 1 and 2 for treatment 1 and treatment 2. Subjects are coded from 1 +to 20 however the repetition of same code in subjects suggests that +participants were randomized to Treatment 1 or 2.

+
#Lets plot the data 
+ggplot(BPRSL, aes(x = week, y = bprs, linetype = subject)) +
+  geom_line() +
+  scale_linetype_manual(values = rep(1:10, times=4)) +
+  facet_grid(. ~ treatment, labeller = label_both) +
+  theme(legend.position = "none") + 
+  scale_y_continuous(limits = c(min(BPRSL$bprs), max(BPRSL$bprs)))
+

+From the plot, it appears that BPRS seems to decrease during the +treatment period in both treatment arms. A clear difference cannot, +however, be validated between groups from the plots.

+
+
+

6.9 Linear Mixed Effects Models

+
# lets create a regression model for BPRSL
+BPRS_reg <-  lm(bprs ~ week + treatment, data = BPRSL)
+
+# print summary of the model 
+summary(BPRS_reg)
+
## 
+## Call:
+## lm(formula = bprs ~ week + treatment, data = BPRSL)
+## 
+## Residuals:
+##     Min      1Q  Median      3Q     Max 
+## -22.454  -8.965  -3.196   7.002  50.244 
+## 
+## Coefficients:
+##             Estimate Std. Error t value Pr(>|t|)    
+## (Intercept)  46.4539     1.3670  33.982   <2e-16 ***
+## week         -2.2704     0.2524  -8.995   <2e-16 ***
+## treatment2    0.5722     1.3034   0.439    0.661    
+## ---
+## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
+## 
+## Residual standard error: 12.37 on 357 degrees of freedom
+## Multiple R-squared:  0.1851, Adjusted R-squared:  0.1806 
+## F-statistic: 40.55 on 2 and 357 DF,  p-value: < 2.2e-16
+

We have the BPRS score as our target variable and time (weeks) and +treatments 1 and 2 as our explanatory variables. From the summary model, +the week variable seems to be statistically significant with BPRS but +the treatment variable doesn’t (p = 0.661). No significant difference can be +seen in BPRS based on treatment. However, this analysis +assumes independence of observations, i.e., that the observation or outcome +is not affected by any other confounder and is completely influenced by +the explanatory variable, which is not very rational. Therefore we now +move on to a more stringent analysis which treats observations as +dependent and allows them to be influenced by other effects. We will analyse +the data set with both Fixed-effect models and Random-effect +models.

+
#lets load the package 
+library(lme4)
+
+# Create a random intercept model
+BPRS_ref <- lmer(bprs ~ week + treatment + (1 | subject), data = BPRSL, REML = FALSE)
+
+# Print the summary of the model
+summary(BPRS_ref)
+
## Linear mixed model fit by maximum likelihood  ['lmerMod']
+## Formula: bprs ~ week + treatment + (1 | subject)
+##    Data: BPRSL
+## 
+##      AIC      BIC   logLik deviance df.resid 
+##   2748.7   2768.1  -1369.4   2738.7      355 
+## 
+## Scaled residuals: 
+##     Min      1Q  Median      3Q     Max 
+## -3.0481 -0.6749 -0.1361  0.4813  3.4855 
+## 
+## Random effects:
+##  Groups   Name        Variance Std.Dev.
+##  subject  (Intercept)  47.41    6.885  
+##  Residual             104.21   10.208  
+## Number of obs: 360, groups:  subject, 20
+## 
+## Fixed effects:
+##             Estimate Std. Error t value
+## (Intercept)  46.4539     1.9090  24.334
+## week         -2.2704     0.2084 -10.896
+## treatment2    0.5722     1.0761   0.532
+## 
+## Correlation of Fixed Effects:
+##            (Intr) week  
+## week       -0.437       
+## treatment2 -0.282  0.000
+
confint(BPRS_ref)
+
## Computing profile confidence intervals ...
+
##                 2.5 %    97.5 %
+## .sig01       4.951451 10.019188
+## .sigma       9.486731 11.026604
+## (Intercept) 42.608796 50.298982
+## week        -2.679990 -1.860844
+## treatment2  -1.542804  2.687248
+

Let's first reflect on our BPRS data set. Our maximum and minimum are 95 and 18, respectively. Given the high variance, the baseline seems to differ from the outcomes.

+

Let’s fit the random intercept and random slope model to our +data:

+
+
+

6.10 Random Intercept and Random Slope Model

+

Fitting a random intercept and random slope model allows the linear regression fits for each individual to differ not only in intercept but also in slope. This allows us to account for the individual differences in the individuals' symptom (BPRS score) profiles, as well as the effect of time.

+
# create a random intercept and random slope model
+BPRS_ref1 <- lmer(bprs ~ week * treatment + (week | subject), data = BPRSL, REML = FALSE)
+
+# print a summary of the model
+
+summary(BPRS_ref1)
+
## Linear mixed model fit by maximum likelihood  ['lmerMod']
+## Formula: bprs ~ week * treatment + (week | subject)
+##    Data: BPRSL
+## 
+##      AIC      BIC   logLik deviance df.resid 
+##   2744.3   2775.4  -1364.1   2728.3      352 
+## 
+## Scaled residuals: 
+##     Min      1Q  Median      3Q     Max 
+## -3.0512 -0.6271 -0.0768  0.5288  3.9260 
+## 
+## Random effects:
+##  Groups   Name        Variance Std.Dev. Corr 
+##  subject  (Intercept) 64.9964  8.0620        
+##           week         0.9687  0.9842   -0.51
+##  Residual             96.4707  9.8220        
+## Number of obs: 360, groups:  subject, 20
+## 
+## Fixed effects:
+##                 Estimate Std. Error t value
+## (Intercept)      47.8856     2.2521  21.262
+## week             -2.6283     0.3589  -7.323
+## treatment2       -2.2911     1.9090  -1.200
+## week:treatment2   0.7158     0.4010   1.785
+## 
+## Correlation of Fixed Effects:
+##             (Intr) week   trtmn2
+## week        -0.650              
+## treatment2  -0.424  0.469       
+## wek:trtmnt2  0.356 -0.559 -0.840
+
confint((BPRS_ref1))
+
## Computing profile confidence intervals ...
+
## Warning in FUN(X[[i]], ...): non-monotonic profile for .sig02
+
## Warning in confint.thpr(pp, level = level, zeta = zeta): bad spline fit
+## for .sig02: falling back to linear interpolation
+
##                       2.5 %     97.5 %
+## .sig01           5.39069597 12.1613571
+## .sig02          -0.82694500  0.1404228
+## .sig03           0.43747319  1.6543265
+## .sigma           9.10743668 10.6349246
+## (Intercept)     43.31885100 52.4522602
+## week            -3.34923718 -1.9074295
+## treatment2      -6.04400897  1.4617868
+## week:treatment2 -0.07243289  1.5040996
+
# perform an ANOVA test on the two models to assess formal differences between them
+anova(BPRS_ref1, BPRS_ref)
+
## Data: BPRSL
+## Models:
+## BPRS_ref: bprs ~ week + treatment + (1 | subject)
+## BPRS_ref1: bprs ~ week * treatment + (week | subject)
+##           npar    AIC    BIC  logLik deviance  Chisq Df Pr(>Chisq)  
+## BPRS_ref     5 2748.7 2768.1 -1369.4   2738.7                       
+## BPRS_ref1    8 2744.3 2775.4 -1364.1   2728.3 10.443  3    0.01515 *
+## ---
+## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
+

The ANOVA test is significant (p < 0.05), so adding the random slope definitely improves the model fit. It is clear that extending the random intercept model with a random slope captures more of the inter-individual variance. The treatment group seems unaffected over time. Let's look at the slope model and examine the outcomes.

+
+
+

6.11 Random Intercept and Random Slope Model with interaction

+
BPRS_ref2 <- lmer(bprs ~ week * treatment + (week | subject), data = BPRSL, REML = FALSE)
+
+# print a summary of the model
+
+summary(BPRS_ref2)
+
## Linear mixed model fit by maximum likelihood  ['lmerMod']
+## Formula: bprs ~ week * treatment + (week | subject)
+##    Data: BPRSL
+## 
+##      AIC      BIC   logLik deviance df.resid 
+##   2744.3   2775.4  -1364.1   2728.3      352 
+## 
+## Scaled residuals: 
+##     Min      1Q  Median      3Q     Max 
+## -3.0512 -0.6271 -0.0768  0.5288  3.9260 
+## 
+## Random effects:
+##  Groups   Name        Variance Std.Dev. Corr 
+##  subject  (Intercept) 64.9964  8.0620        
+##           week         0.9687  0.9842   -0.51
+##  Residual             96.4707  9.8220        
+## Number of obs: 360, groups:  subject, 20
+## 
+## Fixed effects:
+##                 Estimate Std. Error t value
+## (Intercept)      47.8856     2.2521  21.262
+## week             -2.6283     0.3589  -7.323
+## treatment2       -2.2911     1.9090  -1.200
+## week:treatment2   0.7158     0.4010   1.785
+## 
+## Correlation of Fixed Effects:
+##             (Intr) week   trtmn2
+## week        -0.650              
+## treatment2  -0.424  0.469       
+## wek:trtmnt2  0.356 -0.559 -0.840
+
# perform an ANOVA test on the two models
+anova(BPRS_ref2, BPRS_ref1)
+
## Data: BPRSL
+## Models:
+## BPRS_ref2: bprs ~ week * treatment + (week | subject)
+## BPRS_ref1: bprs ~ week * treatment + (week | subject)
+##           npar    AIC    BIC  logLik deviance Chisq Df Pr(>Chisq)
+## BPRS_ref2    8 2744.3 2775.4 -1364.1   2728.3                    
+## BPRS_ref1    8 2744.3 2775.4 -1364.1   2728.3     0  0
+

Finally, looking at the models, the two models and their outcomes appear similar. The ANOVA comparison confirms that there isn't a significant difference between them. Adding the interaction variable, as mentioned above for model 1, doesn't seem to help: the model didn't change and the significance level hasn't changed either.

+
+
+

6.12 Plotting the observed BPRS values and the fitted BPRS +values

+
# draw the plot of BPRSL with the observed BPRS values
+ggplot(BPRSL, aes(x = week, y = bprs, group = subject, col= treatment)) +
+  geom_line(aes(linetype = treatment)) +
+  scale_x_continuous(name = "Time (weeks)", breaks = seq(0, 4, 8)) +
+  scale_y_continuous(name = "Observed BPRS") +
+  theme(legend.position = "right") +
+  facet_grid(. ~ treatment, labeller=label_both)
+

+
# Create a vector of the fitted values
+Fitted <- fitted(BPRS_ref)
+Fitted1 <- fitted(BPRS_ref1)
+Fitted2 <- fitted(BPRS_ref2)
+
+# Create a new column fitted to BPRSL
+BPRSL <- BPRSL %>% mutate(bprs_fitted_values_BPRSL_ref = Fitted, bprs_fitted_values_BPRSL_ref1 = Fitted1, bprs_fitted_values_BPRSL_ref2 = Fitted2)
+head(BPRSL)
+
##   X treatment subject weeks bprs week bprs_fitted_values_BPRSL_ref
+## 1 1         1       1 week0   42    0                     53.19460
+## 2 2         1       2 week0   58    0                     43.04516
+## 3 3         1       3 week0   54    0                     43.98584
+## 4 4         1       4 week0   55    0                     49.13483
+## 5 5         1       5 week0   72    0                     58.19506
+## 6 6         1       6 week0   48    0                     41.51037
+##   bprs_fitted_values_BPRSL_ref1 bprs_fitted_values_BPRSL_ref2
+## 1                      49.24017                      49.24017
+## 2                      46.97380                      46.97380
+## 3                      47.65582                      47.65582
+## 4                      49.85313                      49.85313
+## 5                      66.39001                      66.39001
+## 6                      42.59363                      42.59363
+
 # draw the plot of BPRSL with the Fitted values of bprs model 1
+ggplot(BPRSL, aes(x = week, y = bprs_fitted_values_BPRSL_ref, group = subject, col=treatment)) +
+  geom_line(aes(linetype = treatment)) +
+  scale_x_continuous(name = "Time (weeks)", breaks = seq(0, 4, 8)) +
+  scale_y_continuous(name = "Fitted BPRS (model 1: rnd intercept)") +
+  theme(legend.position = "right") +
+  facet_grid(. ~ treatment, labeller=label_both)
+

+
# draw the plot of BPRSL with the Fitted values of bprs model 2
+ggplot(BPRSL, aes(x = week, y = bprs_fitted_values_BPRSL_ref1, group = subject, col=treatment)) +
+  geom_line(aes(linetype = treatment)) +
+  scale_x_continuous(name = "Time (weeks)", breaks = seq(0, 4, 8)) +
+  scale_y_continuous(name = "Fitted BPRS (model 2: rnd intercept + slope)") +
+  theme(legend.position = "right") +
+  facet_grid(. ~ treatment, labeller=label_both)
+

+From the plots, we can see that the random intercept model differs from the random intercept and slope model. Adding the random slope did not, however, change the outcomes. We can also see that the final model — random intercept and slope with interaction — differs from the models above. In conclusion, the random intercept model does not capture the individual's effect on BPRS over time, and the outcomes did not change with the subsequent models.

+

******************************************************************* +END +******************************************************************

+

DONE AND DUSTED \(\large \surd\) !!!

+

+

Well not really !! There is so much more to learn. +This course was however a great “push” I have truly enjoyed the exercise +sessions, the course material and the review exercises. I have learned +so much and I am looking forward to learning more !!

+

Merry Christmas and a Happy New Year Everyone !!

+

+
+
+
+ + + + +
+ + + + + + + + + + + + + + + diff --git a/creat_human.R b/creat_human.R new file mode 100644 index 000000000..c84ca6667 --- /dev/null +++ b/creat_human.R @@ -0,0 +1,97 @@ + +#Name: Subam Kathet + +#Date: 27/11/2022 + +#Introduction to open data science + +#Week 4: Clustering and Classification + +#This weeks learning tasks includes exercises from week 4 - clustering and classification + +#Step 1 +#Let's start by creating a new R script creat_human.R first !! + +# Data resource: https://archive.ics.uci.edu/ml/machine-learning-databases/00320/ + +# Data files: +## human_development.csv +## gender_inequality.csv + +# Meta files +## https://hdr.undp.org/data-center/human-development-index#/indices/HDI + +#Technical notes +## https://hdr.undp.org/system/files/documents//technical-notes-calculating-human-development-indices.pdf + + +## Required packages +library(tidyverse) +library(dplyr) +library(ggplot2) + +# Step 2 +# read data +hd <- read_csv("https://raw.githubusercontent.com/KimmoVehkalahti/Helsinki-Open-Data-Science/master/datasets/human_development.csv") +gii <- read_csv("https://raw.githubusercontent.com/KimmoVehkalahti/Helsinki-Open-Data-Science/master/datasets/gender_inequality.csv", na = "..") + +## hd -> human development +## gii -> Gender inequality + +# Step 3 +# See structure and dimensions of the data +str(hd);dim(hd) +str(gii);dim(gii) + +# Summaries of the variables +summary(hd) +summary(gii) + +# Step 4 +# Rename variables with shorter names (see Meta files) +# Percent Representation in parliament is % of women (see HDI technical notes) + +hd2 <- hd %>% rename('HDR' = 'HDI Rank', + 'HDI' = 'Human Development Index (HDI)', + 'Life_Exp' = 'Life Expectancy at Birth', + 'Exp_Edu' = 'Expected Years of Education', + 'Mean_Edu' = 'Mean Years of Education', + 'GNI' = 'Gross National Income (GNI) per Capita', + 'GNI-HDR' = 'GNI per Capita Rank Minus HDI Rank') + +gii2 <- gii %>% rename('GIR'= 'GII Rank', + 'GII' = 'Gender Inequality Index (GII)', + 'MMR' = 
'Maternal Mortality Ratio', + 'ABR' = 'Adolescent Birth Rate', + '%PR' = 'Percent Representation in Parliament', + 'SeEdu_F' = 'Population with Secondary Education (Female)', + 'SeEdu_M' = 'Population with Secondary Education (Male)', + 'LFR_F' = 'Labour Force Participation Rate (Female)', + 'LFR_M' = 'Labour Force Participation Rate (Male)') + +# Step 5 +# Add new variables edu2F / edu2M and labF / labM + +gii2 <- mutate(gii2, SeEdu_FM = SeEdu_F / SeEdu_M, LFR_FM = LFR_F / LFR_M) + +# Step 5 +#Joining the dataset +human <- inner_join(hd2, gii2, by='Country') +dim(human) +# Has 195 observation and 19 variables +#All good + +#save new data set to project folder +write_csv(human, file="human.csv") + +#check to see if its ok +read_csv("human.csv") +dim("human.csv") +str("human.csv") + +# Everything looks fine ! + +#Data wrangling complete, All set for analyses !!! + + + diff --git a/human.csv b/human.csv new file mode 100644 index 000000000..de7db3cc1 --- /dev/null +++ b/human.csv @@ -0,0 +1,196 @@ +HDR,Country,HDI,Life_Exp,Exp_Edu,Mean_Edu,GNI,GNI-HDR,GIR,GII,MMR,ABR,%PR,SeEdu_F,SeEdu_M,LFR_F,LFR_M,SeEdu_FM,LFR_FM +1,Norway,0.944,81.6,17.5,12.6,64992,5,1,0.067,4,7.8,39.6,97.4,96.7,61.2,68.7,1.0072388831437435,0.8908296943231441 +2,Australia,0.935,82.4,20.2,13,42261,17,2,0.11,6,12.1,30.5,94.3,94.6,58.8,71.8,0.9968287526427062,0.8189415041782729 +3,Switzerland,0.93,83,15.8,12.8,56431,6,3,0.028,6,1.9,28.5,95,96.6,61.8,74.9,0.9834368530020704,0.8251001335113484 +4,Denmark,0.923,80.2,18.7,12.7,44025,11,4,0.048,5,5.1,38,95.5,96.6,58.7,66.4,0.9886128364389235,0.8840361445783133 +5,Netherlands,0.922,81.6,17.9,11.9,45435,9,5,0.062,6,6.2,36.9,87.7,90.5,58.5,70.6,0.969060773480663,0.8286118980169972 +6,Germany,0.916,80.9,16.5,13.1,43919,11,6,0.041,7,3.8,36.9,96.3,97,53.6,66.4,0.9927835051546392,0.8072289156626505 +6,Ireland,0.916,80.9,18.6,12.2,39568,16,6,0.113,9,8.2,19.9,80.5,78.6,53.1,68.1,1.024173027989822,0.7797356828193833 +8,United 
States,0.915,79.1,16.5,12.9,52947,3,8,0.28,28,31,19.4,95.1,94.8,56.3,68.9,1.0031645569620253,0.8171262699564585 +9,Canada,0.913,82,15.9,13,42155,11,9,0.129,11,14.5,28.2,100,100,61.6,71,1,0.867605633802817 +9,New Zealand,0.913,81.8,19.2,12.5,32689,23,9,0.157,8,25.3,31.4,95,95.3,62,73.8,0.9968520461699896,0.8401084010840109 +11,Singapore,0.912,83,15.4,10.6,76628,-7,11,0.088,6,6,25.3,74.1,81,58.8,77.2,0.9148148148148147,0.7616580310880828 +12,"Hong Kong, China (SAR)",0.91,84,15.6,11.2,53959,-2,12,NA,NA,3.3,NA,72.2,79.2,51.3,67.8,0.9116161616161617,0.7566371681415929 +13,Liechtenstein,0.908,80,15,11.8,79851,-10,13,NA,NA,NA,20,NA,NA,NA,NA,NA,NA +14,Sweden,0.907,82.2,15.8,12.1,45636,-1,14,0.055,4,6.5,43.6,86.5,87.3,60.3,67.9,0.9908361970217641,0.8880706921944034 +14,United Kingdom,0.907,80.7,16.2,13.1,39267,9,14,0.177,8,25.8,23.5,99.8,99.9,55.7,68.7,0.9989989989989989,0.8107714701601164 +16,Iceland,0.899,82.6,19,10.6,35182,12,16,0.087,4,11.5,41.3,91,91.6,70.5,77.4,0.9934497816593887,0.9108527131782945 +17,Korea (Republic of),0.898,81.9,16.9,11.9,33890,13,17,0.125,27,2.2,16.3,77,89.1,50.1,72.1,0.8641975308641976,0.694868238557559 +18,Israel,0.894,82.4,16,12.5,30676,16,18,0.101,2,7.8,22.5,84.4,87.3,57.9,69.1,0.9667812142038947,0.8379160636758322 +19,Luxembourg,0.892,81.7,13.9,11.7,58711,-11,19,0.1,11,8.3,28.3,100,100,50.7,64.6,1,0.7848297213622292 +20,Japan,0.891,83.5,15.3,11.5,36927,7,20,0.133,6,5.4,11.6,87,85.8,48.8,70.4,1.013986013986014,0.6931818181818181 +21,Belgium,0.89,80.8,16.3,11.3,41187,0,21,0.063,6,6.7,42.4,77.5,82.9,47.5,59.3,0.9348612786489746,0.8010118043844857 +22,France,0.888,82.2,16,11.1,38056,4,22,0.088,12,5.7,25.7,78,83.2,50.7,61.6,0.9375,0.823051948051948 +23,Austria,0.885,81.4,15.7,10.8,43869,-5,23,0.053,4,4.1,30.3,100,100,54.6,67.7,1,0.8064992614475628 +24,Finland,0.883,80.8,17.1,10.3,38695,0,24,0.075,4,9.2,42.5,100,100,55.7,64,1,0.8703125 
+25,Slovenia,0.88,80.4,16.8,11.9,27852,12,25,0.016,7,0.6,27.7,95.8,98,52.3,63.2,0.9775510204081632,0.8275316455696202 +26,Spain,0.876,82.6,17.3,9.6,32045,7,26,0.095,4,10.6,38,66.8,73.1,52.5,65.8,0.9138166894664843,0.7978723404255319 +27,Italy,0.873,83.1,16,10.1,33030,4,27,0.068,4,4,30.1,71.2,80.5,39.6,59.5,0.884472049689441,0.665546218487395 +28,Czech Republic,0.87,78.6,16.4,12.3,26660,10,28,0.091,5,4.9,18.9,99.9,99.7,51.1,68.3,1.0020060180541626,0.7481698389458273 +29,Greece,0.865,80.9,17.6,10.3,24524,14,29,0.146,5,11.9,21,59.5,67,44.2,62.5,0.8880597014925373,0.7072 +30,Estonia,0.861,76.8,16.5,12.5,25214,12,30,0.164,11,16.8,19.8,100,100,56.2,68.9,1,0.8156748911465892 +31,Brunei Darussalam,0.856,78.8,14.5,8.8,72570,-26,31,NA,27,23,NA,63.9,67.8,52.6,75.3,0.9424778761061947,0.6985391766268261 +32,Cyprus,0.85,80.2,14,11.6,28633,3,32,0.124,10,5.5,12.5,76,81.7,56,71.1,0.9302325581395349,0.7876230661040788 +32,Qatar,0.85,78.2,13.8,9.1,123124,-31,32,0.524,6,9.5,0,66.7,59,50.8,95.5,1.1305084745762712,0.5319371727748691 +34,Andorra,0.845,81.3,13.5,9.6,43978,-18,34,NA,NA,NA,50,49.5,49.3,NA,NA,1.004056795131846,NA +35,Slovakia,0.844,76.3,15.1,12.2,25845,5,35,0.164,7,15.9,18.7,99.1,99.5,51.1,68.6,0.9959798994974873,0.7448979591836735 +36,Poland,0.843,77.4,15.5,11.8,23177,10,36,0.138,3,12.2,22.1,79.4,85.5,48.9,64.9,0.928654970760234,0.7534668721109398 +37,Lithuania,0.839,73.3,16.4,12.4,24500,7,37,0.125,11,10.6,23.4,89.1,94.3,55.8,67.3,0.9448568398727465,0.8291233283803863 +37,Malta,0.839,80.6,14.4,10.3,27930,-1,37,0.227,9,18.2,13,68.6,78.2,37.9,66.3,0.877237851662404,0.5716440422322775 +39,Saudi Arabia,0.837,74.3,16.3,8.7,52821,-27,39,0.284,16,10.2,19.9,60.5,70.3,20.2,78.3,0.860597439544808,0.25798212005108556 +40,Argentina,0.836,76.3,17.9,9.8,22050,11,40,0.376,69,54.4,36.8,56.3,57.6,47.5,75,0.9774305555555555,0.6333333333333333 +41,United Arab Emirates,0.835,77,13.3,9.5,60868,-34,41,0.232,8,27.6,17.5,73.1,61.2,46.5,92,1.1944444444444442,0.5054347826086957 
+42,Chile,0.832,81.7,15.2,9.8,21290,11,42,0.338,22,55.3,15.8,73.3,76.4,49.2,74.8,0.9594240837696334,0.6577540106951872 +43,Portugal,0.83,80.9,16.3,8.2,25757,-2,43,0.111,8,12.6,31.3,47.7,48.2,54.9,66.2,0.9896265560165975,0.8293051359516616 +44,Hungary,0.828,75.2,15.4,11.6,22916,3,44,0.209,14,12.1,10.1,97.9,98.7,44.8,60,0.9918946301925026,0.7466666666666666 +45,Bahrain,0.824,76.6,14.4,9.4,38599,-20,45,0.265,22,13.8,15,56.7,51.4,39.2,86.9,1.1031128404669261,0.45109321058688145 +46,Latvia,0.819,74.2,15.2,11.5,22281,4,46,0.167,13,13.5,18,98.9,99,54.9,67.6,0.9989898989898991,0.812130177514793 +47,Croatia,0.818,77.3,14.8,11,19409,11,47,0.149,13,12.7,25.8,85,93.6,44.7,58.4,0.9081196581196582,0.7654109589041097 +48,Kuwait,0.816,74.4,14.7,7.2,83961,-46,48,0.387,14,14.5,1.5,55.6,56.3,43.6,83.1,0.9875666074600356,0.5246690734055356 +49,Montenegro,0.802,76.2,15.2,11.2,14558,27,49,0.171,7,15.2,17.3,84.2,94.7,43,57.3,0.8891235480464625,0.7504363001745201 +50,Belarus,0.798,71.3,15.7,12,16676,14,50,0.151,1,20.6,30.1,87,92.2,50.1,63.1,0.9436008676789588,0.7939778129952456 +50,Russian Federation,0.798,70.1,14.7,12,22352,-1,50,0.276,24,25.7,14.5,89.6,92.5,57.1,71.7,0.9686486486486486,0.796373779637378 +52,Oman,0.793,76.8,13.6,8,34858,-23,52,0.275,11,10.6,9.6,47.2,57.1,29,82.6,0.8266199649737304,0.35108958837772397 +52,Romania,0.793,74.7,14.2,10.8,18108,10,52,0.333,33,31,12,86.1,92,48.7,64.9,0.9358695652173913,0.7503852080123267 +52,Uruguay,0.793,77.2,15.5,8.5,19283,7,52,0.313,14,58.3,11.5,54.4,50.3,55.6,76.8,1.0815109343936382,0.7239583333333334 +55,Bahamas,0.79,75.4,12.6,10.9,21336,-3,55,0.298,37,28.5,16.7,91.2,87.6,69.3,79.3,1.041095890410959,0.8738965952080706 +56,Kazakhstan,0.788,69.4,15,11.4,20867,-1,56,0.267,26,29.9,20.1,95.3,98.8,67.7,77.9,0.9645748987854251,0.8690629011553274 +57,Barbados,0.785,75.6,15.4,10.5,12488,27,57,0.357,52,48.4,19.6,89.5,87.7,65.9,76.6,1.0205245153933866,0.8603133159268931 +58,Antigua and 
Barbuda,0.783,76.1,14,9.2,20070,-1,58,NA,NA,49.3,25.7,NA,NA,NA,NA,NA,NA +59,Bulgaria,0.782,74.2,14.4,10.6,15596,13,59,0.212,5,35.9,20.4,93,95.7,47.9,59,0.9717868338557993,0.811864406779661 +60,Palau,0.78,72.7,13.7,12.3,13496,18,60,NA,NA,NA,10.3,NA,NA,NA,NA,NA,NA +60,Panama,0.78,77.6,13.3,9.3,18192,1,60,0.454,85,78.5,19.3,54,49.9,49,81.8,1.0821643286573146,0.5990220048899756 +62,Malaysia,0.779,74.7,12.7,10,22762,-14,62,0.209,29,5.7,14.2,65.1,71.3,44.4,75.5,0.9130434782608695,0.5880794701986755 +63,Mauritius,0.777,74.4,15.6,8.5,17470,0,63,0.419,73,30.9,11.6,49.4,58,43.6,74.2,0.8517241379310344,0.5876010781671159 +64,Seychelles,0.772,73.1,13.4,9.4,23300,-19,64,NA,NA,56.3,43.8,66.9,66.6,NA,NA,1.0045045045045047,NA +64,Trinidad and Tobago,0.772,70.4,12.3,10.9,26090,-25,64,0.371,84,34.8,24.7,59.7,60.9,53,75.5,0.9802955665024631,0.7019867549668874 +66,Serbia,0.771,74.9,14.4,10.5,12190,20,66,0.176,16,16.9,34,58.4,73.6,44.5,60.9,0.7934782608695653,0.7307060755336617 +67,Cuba,0.769,79.4,13.8,11.5,7301,47,67,0.356,80,43.1,48.9,74.3,78.8,43.4,70,0.9428934010152284,0.62 +67,Lebanon,0.769,79.3,13.8,7.9,16509,-1,67,0.385,16,12,3.1,53,55.4,23.3,70.9,0.9566787003610109,0.32863187588152326 +69,Costa Rica,0.766,79.4,13.9,8.4,13413,10,69,0.349,38,60.8,33.3,50.7,50.5,46.6,79,1.003960396039604,0.589873417721519 +69,Iran (Islamic Republic of),0.766,75.4,15.1,8.2,15440,4,69,0.515,23,31.6,3.1,62.2,67.6,16.6,73.6,0.9201183431952664,0.2255434782608696 +71,Venezuela (Bolivarian Republic of),0.762,74.2,14.2,8.9,16159,-2,71,0.476,110,83.2,17,56.6,50.8,51.1,79.2,1.1141732283464567,0.6452020202020202 +72,Turkey,0.761,75.3,14.5,7.6,18677,-12,72,0.359,20,30.9,14.4,39,60,29.4,70.8,0.65,0.4152542372881356 +73,Sri Lanka,0.757,74.9,13.7,10.8,9779,29,73,0.37,29,16.9,5.8,72.7,76.4,35.1,76.3,0.9515706806282722,0.46002621231979035 +74,Mexico,0.756,76.8,13.1,8.5,16056,-4,74,0.373,49,63.4,37.1,55.7,60.6,45.1,79.9,0.9191419141914192,0.5644555694618273 
+75,Brazil,0.755,74.5,15.2,7.7,15175,-1,75,0.457,69,70.8,9.6,54.6,52.4,59.4,80.8,1.0419847328244276,0.7351485148514851 +76,Georgia,0.754,74.9,13.8,12.1,7164,40,76,0.382,41,46.8,11.3,89.7,92.7,56.5,75.1,0.9676375404530745,0.7523302263648469 +77,Saint Kitts and Nevis,0.752,73.8,12.9,8.4,20805,-21,77,NA,NA,NA,6.7,NA,NA,NA,NA,NA,NA +78,Azerbaijan,0.751,70.8,11.9,11.2,16428,-11,78,0.303,26,40,15.6,93.7,97.4,62.9,69.6,0.9620123203285421,0.9037356321839081 +79,Grenada,0.75,73.4,15.8,8.6,10939,14,79,NA,23,35.4,25,NA,NA,NA,NA,NA,NA +80,Jordan,0.748,74,13.5,9.9,11365,11,80,0.473,50,26.5,11.6,69.5,78.5,15.6,66.6,0.8853503184713376,0.23423423423423426 +81,The former Yugoslav Republic of Macedonia,0.747,75.4,13.4,9.3,11780,9,81,0.164,7,18.3,33.3,40.2,55.6,43.1,67.5,0.723021582733813,0.6385185185185185 +81,Ukraine,0.747,71,15.1,11.3,8178,25,81,0.286,23,25.7,11.8,91.7,95.9,53.2,66.9,0.9562043795620437,0.7952167414050821 +83,Algeria,0.736,74.8,14,7.6,13054,-1,83,0.413,89,10,25.7,26.7,31,15.2,72.2,0.8612903225806451,0.21052631578947367 +84,Peru,0.734,74.6,13.1,9,11015,8,84,0.406,89,50.7,22.3,56.3,66.1,68.2,84.4,0.8517397881996974,0.8080568720379147 +85,Albania,0.733,77.8,11.8,9.3,9943,14,85,0.217,21,15.3,20.7,81.8,87.9,44.9,65.5,0.9306029579067121,0.6854961832061068 +85,Armenia,0.733,74.7,12.3,10.9,8124,22,85,0.318,29,27.1,10.7,94,95,54.2,72.6,0.9894736842105263,0.7465564738292012 +85,Bosnia and Herzegovina,0.733,76.5,13.6,8.3,9638,19,85,0.201,8,15.1,19.3,44.9,69.8,34.1,57.3,0.6432664756446992,0.5951134380453753 +88,Ecuador,0.732,75.9,14.2,7.6,10605,7,88,0.407,87,77,41.6,40.1,39.4,54.7,82.7,1.017766497461929,0.6614268440145102 +89,Saint Lucia,0.729,75.1,12.6,9.3,9765,14,89,NA,34,56.3,20.7,NA,NA,62.7,76.2,NA,0.8228346456692913 +90,China,0.727,75.8,13.1,7.5,12547,-7,90,0.191,32,8.6,23.6,58.7,71.9,63.9,78.3,0.8164116828929068,0.8160919540229885 +90,Fiji,0.727,70,15.7,9.9,7493,21,90,0.418,59,42.8,14,64.2,64.5,37.5,72,0.9953488372093023,0.5208333333333334 
+90,Mongolia,0.727,69.4,14.6,9.3,10729,4,90,0.325,68,18.7,14.9,85.3,84.1,56.6,69.3,1.014268727705113,0.8167388167388168 +93,Thailand,0.726,74.4,13.5,7.3,13323,-13,93,0.38,26,41,6.1,35.7,40.8,64.3,80.7,0.8750000000000001,0.7967781908302354 +94,Dominica,0.724,77.8,12.7,7.9,9994,4,94,NA,NA,NA,21.9,29.7,23.2,NA,NA,1.2801724137931034,NA +94,Libya,0.724,71.6,14,7.3,14911,-19,94,0.134,15,2.5,16,55.5,41.9,30,76.4,1.324582338902148,0.3926701570680628 +96,Tunisia,0.721,74.8,14.6,6.8,10404,1,96,0.24,46,4.6,31.3,32.8,46.1,25.1,70.9,0.7114967462039045,0.3540197461212976 +97,Colombia,0.72,74,13.5,7.3,12040,-9,97,0.429,83,68.5,20.9,56.9,55.6,55.8,79.7,1.0233812949640286,0.7001254705144291 +97,Saint Vincent and the Grenadines,0.72,72.9,13.4,8.6,9937,3,97,NA,45,54.5,13,NA,NA,55.7,78,NA,0.7141025641025641 +99,Jamaica,0.719,75.7,12.4,9.7,7415,13,99,0.43,80,70.1,16.7,74,70.2,56.1,70.9,1.054131054131054,0.7912552891396333 +100,Tonga,0.717,72.8,14.7,10.7,5069,32,100,0.666,120,18.1,0,87.5,88.3,53.5,74.6,0.9909399773499434,0.7171581769436998 +101,Belize,0.715,70,13.6,10.5,7614,9,101,0.426,45,71.4,13.3,76.4,75.8,49.2,82.3,1.0079155672823221,0.597812879708384 +101,Dominican Republic,0.715,73.5,13.1,7.6,11883,-12,101,0.477,100,99.6,19.1,55.6,53.1,51.3,78.6,1.0470809792843692,0.6526717557251909 +103,Suriname,0.714,71.1,12.7,7.7,15617,-32,103,0.463,130,35.2,11.8,44.6,47.1,40.5,68.8,0.9469214437367304,0.5886627906976745 +104,Maldives,0.706,76.8,13,5.8,12328,-19,104,0.243,31,4.2,5.9,27.3,32.7,56.2,77.5,0.8348623853211009,0.7251612903225807 +105,Samoa,0.702,73.4,12.9,10.3,5327,24,105,0.457,58,28.3,6.1,64.3,60,23.5,58.4,1.0716666666666665,0.4023972602739726 +106,Botswana,0.698,64.5,12.5,8.9,16646,-41,106,0.48,170,44.2,9.5,73.6,77.9,71.9,81.6,0.9448010269576379,0.8811274509803922 +107,Moldova (Republic of),0.693,71.6,11.9,11.2,5223,23,107,0.248,21,29.3,20.8,93.6,96.6,37.6,44.2,0.968944099378882,0.8506787330316742 
+108,Egypt,0.69,71.1,13.5,6.6,10512,-12,108,0.573,45,43,2.2,43.9,60.6,23.7,74.8,0.7244224422442244,0.31684491978609625 +109,Turkmenistan,0.688,65.6,10.8,9.9,13066,-28,109,NA,61,18,25.8,NA,NA,46.9,76.9,NA,0.6098829648894668 +110,Gabon,0.684,64.4,12.5,7.8,16367,-42,110,0.514,240,103,16.2,53.9,36.1,56.2,65.4,1.4930747922437673,0.8593272171253822 +110,Indonesia,0.684,68.9,13,7.6,9788,-9,110,0.494,190,48.3,17.1,39.9,49.2,51.4,84.2,0.8109756097560975,0.6104513064133016 +112,Paraguay,0.679,72.9,11.9,7.7,7643,-3,112,0.472,110,67,16.8,36.8,43,55.7,84.8,0.8558139534883721,0.6568396226415095 +113,"Palestine, State of",0.677,72.9,13,8.9,4699,21,113,NA,NA,45.8,NA,53.9,59.4,15.4,66.4,0.9074074074074074,0.2319277108433735 +114,Uzbekistan,0.675,68.4,11.5,10.9,5567,10,114,NA,36,38.8,16.4,NA,NA,48.1,75.6,NA,0.6362433862433863 +115,Philippines,0.668,68.2,11.3,8.9,7915,-7,115,0.42,120,46.8,27.1,65.9,63.7,51.1,79.7,1.0345368916797488,0.6411543287327478 +116,El Salvador,0.666,73,12.3,6.5,7349,-3,116,0.427,69,76,27.4,36.8,43.6,47.8,79,0.8440366972477064,0.6050632911392405 +116,South Africa,0.666,57.4,13.6,9.9,12122,-29,116,0.407,140,50.9,40.7,72.7,75.9,44.5,60.5,0.9578392621870883,0.7355371900826446 +116,Viet Nam,0.666,75.8,11.9,7.5,5092,15,116,0.308,49,29,24.3,59.4,71.2,73,82.2,0.8342696629213483,0.8880778588807785 +119,Bolivia (Plurinational State of),0.662,68.3,13.2,8.2,5760,4,119,0.444,200,71.9,51.8,47.6,59.1,64.2,80.9,0.805414551607445,0.7935723114956736 +120,Kyrgyzstan,0.655,70.6,12.5,10.6,3044,29,120,0.353,75,29.3,23.3,94.5,96.8,56,79.5,0.9762396694214877,0.7044025157232704 +121,Iraq,0.654,69.4,10.1,6.4,14003,-44,121,0.539,67,68.7,26.5,27.8,50.2,14.9,69.8,0.5537848605577689,0.21346704871060174 +122,Cabo Verde,0.646,73.3,13.5,4.7,6094,-1,122,NA,53,70.6,20.8,NA,NA,51.5,83.7,NA,0.6152927120669056 +123,Micronesia (Federated States of),0.64,69.1,11.7,9.7,3432,21,123,NA,96,18.6,0,NA,NA,NA,NA,NA,NA 
+124,Guyana,0.636,66.4,10.3,8.5,6522,-4,124,0.515,250,88.5,31.3,60.3,47.8,42.6,80.5,1.2615062761506277,0.529192546583851 +125,Nicaragua,0.631,74.9,11.5,6,4457,12,125,0.449,100,100.8,39.1,39.4,38.3,47.4,80.3,1.0287206266318538,0.5902864259028643 +126,Morocco,0.628,74,11.6,4.4,6850,-8,126,0.525,120,35.8,11,20.7,30.2,26.5,75.8,0.6854304635761589,0.3496042216358839 +126,Namibia,0.628,64.8,11.3,6.2,9418,-21,126,0.401,130,54.9,37.7,33.3,34.4,54.7,63.7,0.9680232558139534,0.858712715855573 +128,Guatemala,0.627,71.8,10.7,5.6,6929,-11,128,0.533,140,97.2,13.3,21.9,23.2,49.3,88.2,0.9439655172413792,0.5589569160997732 +129,Tajikistan,0.624,69.4,11.2,10.4,2517,27,129,0.357,44,42.8,15.2,95.1,91.2,58.9,77.1,1.0427631578947367,0.7639429312581064 +130,India,0.609,68,11.7,5.4,5497,-4,130,0.563,190,32.8,12.2,27,56.6,27,79.9,0.47703180212014135,0.33792240300375465 +131,Honduras,0.606,73.1,11.1,5.5,3938,7,131,0.48,120,84,25.8,28,25.8,42.8,82.9,1.0852713178294573,0.5162846803377563 +132,Bhutan,0.605,69.5,12.6,3,7176,-17,132,0.457,120,40.9,8.3,34,34.5,66.7,77.2,0.9855072463768116,0.8639896373056994 +133,Timor-Leste,0.595,68.2,11.7,4.4,5363,-6,133,NA,270,52.2,38.5,NA,NA,24.6,50.8,NA,0.48425196850393704 +134,Syrian Arab Republic,0.594,69.6,12.3,6.3,2728,21,134,0.533,49,41.6,12.4,29.5,40.5,13.5,72.7,0.7283950617283951,0.1856946354883081 +134,Vanuatu,0.594,71.9,10.6,6.8,2803,19,134,NA,86,44.8,0,NA,NA,61.5,80,NA,0.76875 +136,Congo,0.591,62.3,11.1,6.1,6012,-14,136,0.593,410,126.7,11.5,39.7,47,68.5,73,0.8446808510638298,0.9383561643835616 +137,Kiribati,0.59,66,12.3,7.8,2434,21,137,NA,130,16.6,8.7,NA,NA,NA,NA,NA,NA +138,Equatorial Guinea,0.587,57.6,9,5.5,21056,-84,138,NA,290,112.6,19.7,NA,NA,80.7,92.2,NA,0.8752711496746204 +139,Zambia,0.586,60.1,13.5,6.6,3734,2,139,0.587,280,125.4,12.7,25.8,44,73.1,85.6,0.5863636363636364,0.8539719626168224 +140,Ghana,0.579,61.4,11.5,7,3852,-1,140,0.554,380,58.4,10.9,45.2,64.7,67.3,71.4,0.6986089644513138,0.9425770308123248 +141,Lao People's Democratic 
Republic,0.575,66.2,10.6,5,4680,-6,141,NA,NA,65,25,22.9,37,76.3,79.1,0.6189189189189189,0.9646017699115045 +142,Bangladesh,0.57,71.6,10,5.1,3191,5,142,0.503,170,80.6,20,34.1,41.3,57.4,84.1,0.8256658595641647,0.6825208085612366 +143,Cambodia,0.555,68.4,10.9,4.4,2949,7,143,0.477,170,44.3,19,9.9,22.9,78.8,86.5,0.4323144104803494,0.9109826589595376 +143,Sao Tome and Principe,0.555,66.5,11.3,4.7,2918,8,143,NA,210,65.1,18.2,NA,NA,45.3,77.8,NA,0.5822622107969152 +145,Kenya,0.548,61.6,11,6.3,2762,9,145,0.552,400,93.6,20.8,25.3,31.4,62.2,72.4,0.8057324840764332,0.8591160220994475 +145,Nepal,0.548,69.6,12.4,3.3,2311,16,145,0.489,190,73.7,29.5,17.7,38.2,79.9,87.1,0.4633507853403141,0.9173363949483354 +147,Pakistan,0.538,66.2,7.8,4.7,4866,-14,147,0.536,170,27.3,19.7,19.3,46.1,24.6,82.9,0.41865509761388287,0.2967430639324487 +148,Myanmar,0.536,65.9,8.6,4.1,4608,-12,148,0.413,200,12.1,4.7,22.9,15.3,75.2,82.3,1.4967320261437906,0.91373025516403406 +149,Angola,0.532,52.3,11.4,4.7,6822,-30,149,NA,460,170.2,36.8,NA,NA,63.3,76.9,NA,0.8231469440832249 +150,Swaziland,0.531,49,11.3,7.1,5542,-25,150,0.557,310,72,14.7,21.9,26,43.9,71.6,0.8423076923076922,0.6131284916201117 +151,Tanzania (United Republic of),0.521,65,9.2,5.1,2411,8,151,0.547,410,122.7,36,5.6,9.5,88.1,90.2,0.5894736842105263,0.9767184035476717 +152,Nigeria,0.514,52.8,9,5.9,5341,-24,152,NA,560,119.6,6.6,NA,NA,48.2,63.7,NA,0.7566718995290423 +153,Cameroon,0.512,55.5,10.4,6,2803,-1,153,0.587,590,115.8,27.1,21.3,34.9,63.8,76.8,0.6103151862464183,0.8307291666666666 +154,Madagascar,0.51,65.1,10.3,6,1328,24,154,NA,440,122.8,20.5,NA,NA,86.6,90.5,NA,0.9569060773480662 +155,Zimbabwe,0.509,57.5,10.9,7.3,1615,13,155,0.504,470,60.3,35.1,48.7,62,83.2,89.7,0.785483870967742,0.927536231884058 +156,Mauritania,0.506,63.1,8.5,3.8,3560,-14,156,0.61,320,73.3,22.2,8.3,20.9,28.7,79.1,0.3971291866028709,0.36283185840707965 +156,Solomon Islands,0.506,67.9,9.2,5,1540,16,156,NA,130,64.9,2,NA,NA,53.4,79,NA,0.6759493670886075 +158,Papua New 
Guinea,0.505,62.6,9.9,4,2463,-1,158,0.611,220,62.1,2.7,7.6,14.5,70.5,74,0.5241379310344827,0.9527027027027027 +159,Comoros,0.503,63.3,11.5,4.6,1456,16,159,NA,350,51.1,3,NA,NA,35.2,80.1,NA,0.43945068664169795 +160,Yemen,0.498,63.8,9.2,2.6,3519,-17,160,0.744,270,47,0.7,8.6,26.7,25.4,72.2,0.32209737827715357,0.35180055401662047 +161,Lesotho,0.497,49.8,11.1,5.9,3306,-16,161,0.541,490,89.4,26.8,21.9,19,59,73.5,1.1526315789473685,0.8027210884353742 +162,Togo,0.484,59.7,12.2,4.5,1228,17,162,0.588,450,91.5,17.6,16.1,40.3,80.6,81.3,0.3995037220843673,0.991389913899139 +163,Haiti,0.483,62.8,8.7,4.9,1669,4,163,0.603,380,42,3.5,22.4,35.2,60.9,71,0.6363636363636362,0.8577464788732394 +163,Rwanda,0.483,64.2,10.3,3.7,1458,11,163,0.4,320,33.6,57.5,8,8.8,86.4,85.3,0.9090909090909091,1.0128956623681127 +163,Uganda,0.483,58.5,9.8,5.4,1613,6,163,0.538,360,126.6,35,22.9,33.5,75.8,79.2,0.6835820895522388,0.957070707070707 +166,Benin,0.48,59.6,11.1,3.3,1767,0,166,0.614,340,90.2,8.4,11.3,27,67.6,78.3,0.41851851851851857,0.8633461047254151 +167,Sudan,0.479,63.5,7,3.1,3809,-27,167,0.591,360,84,23.8,12.1,18.2,31.3,76,0.6648351648351648,0.4118421052631579 +168,Djibouti,0.47,62,6.4,3.8,3276,-22,168,NA,230,18.6,12.7,NA,NA,36.3,67.7,NA,0.5361890694239291 +169,South Sudan,0.467,55.7,7.6,5.4,2332,-9,169,NA,730,75.3,24.3,NA,NA,NA,NA,NA,NA +170,Senegal,0.466,66.5,7.9,2.5,2188,-8,170,0.528,320,94.4,42.7,7.2,15.4,66,88,0.4675324675324675,0.75 +171,Afghanistan,0.465,60.4,9.3,3.2,1885,-7,171,0.693,400,86.8,27.6,5.9,29.8,15.8,79.5,0.19798657718120807,0.19874213836477989 +172,Côte d'Ivoire,0.462,51.5,8.9,4.3,3171,-24,172,0.679,720,130.3,9.2,14,30.1,52.4,81.4,0.46511627906976744,0.6437346437346436 +173,Malawi,0.445,62.8,10.8,4.3,747,13,173,0.611,510,144.8,16.7,11.1,21.6,84.6,81.5,0.5138888888888888,1.038036809815951 +174,Ethiopia,0.442,64.1,8.5,2.4,1428,2,174,0.558,420,78.4,25.5,7.8,18.2,78.2,89.3,0.4285714285714286,0.8756998880179172 
+175,Gambia,0.441,60.2,8.8,2.8,1507,-2,175,0.622,430,115.8,9.4,17.4,31.5,72.2,82.9,0.5523809523809523,0.8709288299155609 +176,Congo (Democratic Republic of the),0.433,58.7,9.8,6,680,11,176,0.673,730,135.3,8.2,12.8,32.4,70.7,73.2,0.39506172839506176,0.9658469945355191 +177,Liberia,0.43,60.9,9.5,4.1,805,7,177,0.651,640,117.4,10.7,15.4,39.3,58.2,64.8,0.3918575063613232,0.8981481481481483 +178,Guinea-Bissau,0.42,55.2,9,2.8,1362,-1,178,NA,560,99.3,13.7,NA,NA,68.2,78.5,NA,0.8687898089171975 +179,Mali,0.419,58,8.4,2,1583,-8,179,0.677,550,175.6,9.5,7.7,15.1,50.8,81.4,0.5099337748344371,0.624078624078624 +180,Mozambique,0.416,55.1,9.3,3.2,1123,1,180,0.591,480,137.8,39.6,1.4,6.2,85.5,82.8,0.2258064516129032,1.032608695652174 +181,Sierra Leone,0.413,50.9,8.6,3.1,1780,-16,181,0.65,1100,100.7,12.4,10,21.7,65.7,69,0.4608294930875576,0.9521739130434783 +182,Guinea,0.411,58.8,8.7,2.4,1096,0,182,NA,650,131,21.9,NA,NA,65.6,78.3,NA,0.8378033205619412 +183,Burkina Faso,0.402,58.7,7.8,1.4,1591,-13,183,0.631,400,115.4,13.3,0.9,3.2,77.1,90,0.28125,0.8566666666666666 +184,Burundi,0.4,56.7,10.1,2.7,758,1,184,0.492,740,30.3,34.9,5.3,8.3,83.3,82,0.6385542168674698,1.0158536585365854 +185,Chad,0.392,51.6,7.4,1.9,2085,-22,185,0.706,980,152,14.9,1.7,9.9,64,79.2,0.1717171717171717,0.8080808080808081 +186,Eritrea,0.391,63.7,4.1,3.9,1130,-6,186,NA,380,65.3,22,NA,NA,80,89.8,NA,0.89086859688196 +187,Central African Republic,0.35,50.7,7.2,4.2,581,1,187,0.655,880,98.3,12.5,10.1,26.7,72.6,85.1,0.3782771535580524,0.8531139835487661 +188,Niger,0.348,61.4,5.4,1.5,908,-5,188,0.713,630,204.8,13.3,2.4,7.8,40,89.7,0.3076923076923077,0.4459308807134894 +NA,Arab States,0.686,70.6,12,6.4,15722,NA,NA,0.537,155,45.4,14,34.7,47.6,23.2,75.3,0.7289915966386555,0.30810092961487384 +NA,East Asia and the Pacific,0.71,74,12.7,7.5,11449,NA,NA,0.328,72,21.2,18.7,54.7,66.3,62.6,79.4,0.8250377073906486,0.7884130982367757 +NA,Europe and Central 
Asia,0.748,72.3,13.6,10,12791,NA,NA,0.3,28,30.8,19,70.8,80.6,45.6,70,0.8784119106699753,0.6514285714285715 +NA,Latin America and the Caribbean,0.748,75,14,8.2,14242,NA,NA,0.415,85,68.3,27,54.3,55.2,53.7,79.8,0.983695652173913,0.6729323308270677 +NA,South Asia,0.607,68.4,11.2,5.5,5605,NA,NA,0.536,183,38.7,17.5,29.1,54.6,29.8,80.3,0.532967032967033,0.3711083437110835 +NA,Sub-Saharan Africa,0.518,58.5,9.6,5.2,3363,NA,NA,0.575,506,109.7,22.5,22.1,31.5,65.4,76.6,0.7015873015873016,0.85378590078329 +NA,World,0.711,71.5,12.2,7.9,14301,NA,NA,0.449,210,47.4,21.8,54.5,65.4,50.3,76.7,0.8333333333333333,0.6558018252933506 diff --git a/human_.csv b/human_.csv new file mode 100644 index 000000000..81012d98c --- /dev/null +++ b/human_.csv @@ -0,0 +1,156 @@ +"","SeEdu_FM","LFR_FM","Life_Exp","Exp_Edu","GNI","MMR","ABR","%PR" +"1",1.00723888314374,0.890829694323144,81.6,17.5,64992,4,7.8,39.6 +"2",0.996828752642706,0.818941504178273,82.4,20.2,42261,6,12.1,30.5 +"3",0.98343685300207,0.825100133511348,83,15.8,56431,6,1.9,28.5 +"4",0.988612836438924,0.884036144578313,80.2,18.7,44025,5,5.1,38 +"5",0.969060773480663,0.828611898016997,81.6,17.9,45435,6,6.2,36.9 +"6",0.992783505154639,0.807228915662651,80.9,16.5,43919,7,3.8,36.9 +"7",1.02417302798982,0.779735682819383,80.9,18.6,39568,9,8.2,19.9 +"8",1.00316455696203,0.817126269956459,79.1,16.5,52947,28,31,19.4 +"9",1,0.867605633802817,82,15.9,42155,11,14.5,28.2 +"10",0.99685204616999,0.840108401084011,81.8,19.2,32689,8,25.3,31.4 +"11",0.914814814814815,0.761658031088083,83,15.4,76628,6,6,25.3 +"12",0.990836197021764,0.888070692194403,82.2,15.8,45636,4,6.5,43.6 +"13",0.998998998998999,0.810771470160116,80.7,16.2,39267,8,25.8,23.5 +"14",0.993449781659389,0.910852713178294,82.6,19,35182,4,11.5,41.3 +"15",0.864197530864198,0.694868238557559,81.9,16.9,33890,27,2.2,16.3 +"16",0.966781214203895,0.837916063675832,82.4,16,30676,2,7.8,22.5 +"17",1,0.784829721362229,81.7,13.9,58711,11,8.3,28.3 
+"18",1.01398601398601,0.693181818181818,83.5,15.3,36927,6,5.4,11.6 +"19",0.934861278648975,0.801011804384486,80.8,16.3,41187,6,6.7,42.4 +"20",0.9375,0.823051948051948,82.2,16,38056,12,5.7,25.7 +"21",1,0.806499261447563,81.4,15.7,43869,4,4.1,30.3 +"22",1,0.8703125,80.8,17.1,38695,4,9.2,42.5 +"23",0.977551020408163,0.82753164556962,80.4,16.8,27852,7,0.6,27.7 +"24",0.913816689466484,0.797872340425532,82.6,17.3,32045,4,10.6,38 +"25",0.884472049689441,0.665546218487395,83.1,16,33030,4,4,30.1 +"26",1.00200601805416,0.748169838945827,78.6,16.4,26660,5,4.9,18.9 +"27",0.888059701492537,0.7072,80.9,17.6,24524,5,11.9,21 +"28",1,0.815674891146589,76.8,16.5,25214,11,16.8,19.8 +"29",0.930232558139535,0.787623066104079,80.2,14,28633,10,5.5,12.5 +"30",1.13050847457627,0.531937172774869,78.2,13.8,123124,6,9.5,0 +"31",0.995979899497487,0.744897959183674,76.3,15.1,25845,7,15.9,18.7 +"32",0.928654970760234,0.75346687211094,77.4,15.5,23177,3,12.2,22.1 +"33",0.944856839872746,0.829123328380386,73.3,16.4,24500,11,10.6,23.4 +"34",0.877237851662404,0.571644042232277,80.6,14.4,27930,9,18.2,13 +"35",0.860597439544808,0.257982120051086,74.3,16.3,52821,16,10.2,19.9 +"36",0.977430555555556,0.633333333333333,76.3,17.9,22050,69,54.4,36.8 +"37",1.19444444444444,0.505434782608696,77,13.3,60868,8,27.6,17.5 +"38",0.959424083769633,0.657754010695187,81.7,15.2,21290,22,55.3,15.8 +"39",0.989626556016598,0.829305135951662,80.9,16.3,25757,8,12.6,31.3 +"40",0.991894630192503,0.746666666666667,75.2,15.4,22916,14,12.1,10.1 +"41",1.10311284046693,0.451093210586881,76.6,14.4,38599,22,13.8,15 +"42",0.998989898989899,0.812130177514793,74.2,15.2,22281,13,13.5,18 +"43",0.908119658119658,0.76541095890411,77.3,14.8,19409,13,12.7,25.8 +"44",0.987566607460036,0.524669073405536,74.4,14.7,83961,14,14.5,1.5 +"45",0.889123548046463,0.75043630017452,76.2,15.2,14558,7,15.2,17.3 +"46",0.943600867678959,0.793977812995246,71.3,15.7,16676,1,20.6,30.1 +"47",0.968648648648649,0.796373779637378,70.1,14.7,22352,24,25.7,14.5 
+"48",0.82661996497373,0.351089588377724,76.8,13.6,34858,11,10.6,9.6 +"49",0.935869565217391,0.750385208012327,74.7,14.2,18108,33,31,12 +"50",1.08151093439364,0.723958333333333,77.2,15.5,19283,14,58.3,11.5 +"51",1.04109589041096,0.873896595208071,75.4,12.6,21336,37,28.5,16.7 +"52",0.964574898785425,0.869062901155327,69.4,15,20867,26,29.9,20.1 +"53",1.02052451539339,0.860313315926893,75.6,15.4,12488,52,48.4,19.6 +"54",0.971786833855799,0.811864406779661,74.2,14.4,15596,5,35.9,20.4 +"55",1.08216432865731,0.599022004889976,77.6,13.3,18192,85,78.5,19.3 +"56",0.91304347826087,0.588079470198675,74.7,12.7,22762,29,5.7,14.2 +"57",0.851724137931034,0.587601078167116,74.4,15.6,17470,73,30.9,11.6 +"58",0.980295566502463,0.701986754966887,70.4,12.3,26090,84,34.8,24.7 +"59",0.793478260869565,0.730706075533662,74.9,14.4,12190,16,16.9,34 +"60",0.942893401015228,0.62,79.4,13.8,7301,80,43.1,48.9 +"61",0.956678700361011,0.328631875881523,79.3,13.8,16509,16,12,3.1 +"62",1.0039603960396,0.589873417721519,79.4,13.9,13413,38,60.8,33.3 +"63",0.920118343195266,0.22554347826087,75.4,15.1,15440,23,31.6,3.1 +"64",1.11417322834646,0.64520202020202,74.2,14.2,16159,110,83.2,17 +"65",0.65,0.415254237288136,75.3,14.5,18677,20,30.9,14.4 +"66",0.951570680628272,0.46002621231979,74.9,13.7,9779,29,16.9,5.8 +"67",0.919141914191419,0.564455569461827,76.8,13.1,16056,49,63.4,37.1 +"68",1.04198473282443,0.735148514851485,74.5,15.2,15175,69,70.8,9.6 +"69",0.967637540453074,0.752330226364847,74.9,13.8,7164,41,46.8,11.3 +"70",0.962012320328542,0.903735632183908,70.8,11.9,16428,26,40,15.6 +"71",0.885350318471338,0.234234234234234,74,13.5,11365,50,26.5,11.6 +"72",0.723021582733813,0.638518518518518,75.4,13.4,11780,7,18.3,33.3 +"73",0.956204379562044,0.795216741405082,71,15.1,8178,23,25.7,11.8 +"74",0.861290322580645,0.210526315789474,74.8,14,13054,89,10,25.7 +"75",0.851739788199697,0.808056872037915,74.6,13.1,11015,89,50.7,22.3 +"76",0.930602957906712,0.685496183206107,77.8,11.8,9943,21,15.3,20.7 
+"77",0.989473684210526,0.746556473829201,74.7,12.3,8124,29,27.1,10.7 +"78",0.643266475644699,0.595113438045375,76.5,13.6,9638,8,15.1,19.3 +"79",1.01776649746193,0.66142684401451,75.9,14.2,10605,87,77,41.6 +"80",0.816411682892907,0.816091954022989,75.8,13.1,12547,32,8.6,23.6 +"81",0.995348837209302,0.520833333333333,70,15.7,7493,59,42.8,14 +"82",1.01426872770511,0.816738816738817,69.4,14.6,10729,68,18.7,14.9 +"83",0.875,0.796778190830235,74.4,13.5,13323,26,41,6.1 +"84",1.32458233890215,0.392670157068063,71.6,14,14911,15,2.5,16 +"85",0.711496746203904,0.354019746121298,74.8,14.6,10404,46,4.6,31.3 +"86",1.02338129496403,0.700125470514429,74,13.5,12040,83,68.5,20.9 +"87",1.05413105413105,0.791255289139633,75.7,12.4,7415,80,70.1,16.7 +"88",0.990939977349943,0.7171581769437,72.8,14.7,5069,120,18.1,0 +"89",1.00791556728232,0.597812879708384,70,13.6,7614,45,71.4,13.3 +"90",1.04708097928437,0.652671755725191,73.5,13.1,11883,100,99.6,19.1 +"91",0.94692144373673,0.588662790697674,71.1,12.7,15617,130,35.2,11.8 +"92",0.834862385321101,0.725161290322581,76.8,13,12328,31,4.2,5.9 +"93",1.07166666666667,0.402397260273973,73.4,12.9,5327,58,28.3,6.1 +"94",0.944801026957638,0.881127450980392,64.5,12.5,16646,170,44.2,9.5 +"95",0.968944099378882,0.850678733031674,71.6,11.9,5223,21,29.3,20.8 +"96",0.724422442244224,0.316844919786096,71.1,13.5,10512,45,43,2.2 +"97",1.49307479224377,0.859327217125382,64.4,12.5,16367,240,103,16.2 +"98",0.810975609756098,0.610451306413302,68.9,13,9788,190,48.3,17.1 +"99",0.855813953488372,0.65683962264151,72.9,11.9,7643,110,67,16.8 +"100",1.03453689167975,0.641154328732748,68.2,11.3,7915,120,46.8,27.1 +"101",0.844036697247706,0.605063291139241,73,12.3,7349,69,76,27.4 +"102",0.957839262187088,0.735537190082645,57.4,13.6,12122,140,50.9,40.7 +"103",0.834269662921348,0.888077858880779,75.8,11.9,5092,49,29,24.3 +"104",0.805414551607445,0.793572311495674,68.3,13.2,5760,200,71.9,51.8 +"105",0.976239669421488,0.70440251572327,70.6,12.5,3044,75,29.3,23.3 
+"106",0.553784860557769,0.213467048710602,69.4,10.1,14003,67,68.7,26.5 +"107",1.26150627615063,0.529192546583851,66.4,10.3,6522,250,88.5,31.3 +"108",1.02872062663185,0.590286425902864,74.9,11.5,4457,100,100.8,39.1 +"109",0.685430463576159,0.349604221635884,74,11.6,6850,120,35.8,11 +"110",0.968023255813953,0.858712715855573,64.8,11.3,9418,130,54.9,37.7 +"111",0.943965517241379,0.558956916099773,71.8,10.7,6929,140,97.2,13.3 +"112",1.04276315789474,0.763942931258106,69.4,11.2,2517,44,42.8,15.2 +"113",0.477031802120141,0.337922403003755,68,11.7,5497,190,32.8,12.2 +"114",1.08527131782946,0.516284680337756,73.1,11.1,3938,120,84,25.8 +"115",0.985507246376812,0.863989637305699,69.5,12.6,7176,120,40.9,8.3 +"116",0.728395061728395,0.185694635488308,69.6,12.3,2728,49,41.6,12.4 +"117",0.84468085106383,0.938356164383562,62.3,11.1,6012,410,126.7,11.5 +"118",0.586363636363636,0.853971962616822,60.1,13.5,3734,280,125.4,12.7 +"119",0.698608964451314,0.942577030812325,61.4,11.5,3852,380,58.4,10.9 +"120",0.825665859564165,0.682520808561237,71.6,10,3191,170,80.6,20 +"121",0.432314410480349,0.910982658959538,68.4,10.9,2949,170,44.3,19 +"122",0.805732484076433,0.859116022099447,61.6,11,2762,400,93.6,20.8 +"123",0.463350785340314,0.917336394948335,69.6,12.4,2311,190,73.7,29.5 +"124",0.418655097613883,0.296743063932449,66.2,7.8,4866,170,27.3,19.7 +"125",1.49673202614379,0.913730255164034,65.9,8.6,4608,200,12.1,4.7 +"126",0.842307692307692,0.613128491620112,49,11.3,5542,310,72,14.7 +"127",0.589473684210526,0.976718403547672,65,9.2,2411,410,122.7,36 +"128",0.610315186246418,0.830729166666667,55.5,10.4,2803,590,115.8,27.1 +"129",0.785483870967742,0.927536231884058,57.5,10.9,1615,470,60.3,35.1 +"130",0.397129186602871,0.36283185840708,63.1,8.5,3560,320,73.3,22.2 +"131",0.524137931034483,0.952702702702703,62.6,9.9,2463,220,62.1,2.7 +"132",0.322097378277154,0.35180055401662,63.8,9.2,3519,270,47,0.7 +"133",1.15263157894737,0.802721088435374,49.8,11.1,3306,490,89.4,26.8 
+"134",0.399503722084367,0.991389913899139,59.7,12.2,1228,450,91.5,17.6 +"135",0.636363636363636,0.857746478873239,62.8,8.7,1669,380,42,3.5 +"136",0.909090909090909,1.01289566236811,64.2,10.3,1458,320,33.6,57.5 +"137",0.683582089552239,0.957070707070707,58.5,9.8,1613,360,126.6,35 +"138",0.418518518518519,0.863346104725415,59.6,11.1,1767,340,90.2,8.4 +"139",0.664835164835165,0.411842105263158,63.5,7,3809,360,84,23.8 +"140",0.467532467532468,0.75,66.5,7.9,2188,320,94.4,42.7 +"141",0.197986577181208,0.19874213836478,60.4,9.3,1885,400,86.8,27.6 +"142",0.465116279069767,0.643734643734644,51.5,8.9,3171,720,130.3,9.2 +"143",0.513888888888889,1.03803680981595,62.8,10.8,747,510,144.8,16.7 +"144",0.428571428571429,0.875699888017917,64.1,8.5,1428,420,78.4,25.5 +"145",0.552380952380952,0.870928829915561,60.2,8.8,1507,430,115.8,9.4 +"146",0.395061728395062,0.965846994535519,58.7,9.8,680,730,135.3,8.2 +"147",0.391857506361323,0.898148148148148,60.9,9.5,805,640,117.4,10.7 +"148",0.509933774834437,0.624078624078624,58,8.4,1583,550,175.6,9.5 +"149",0.225806451612903,1.03260869565217,55.1,9.3,1123,480,137.8,39.6 +"150",0.460829493087558,0.952173913043478,50.9,8.6,1780,1100,100.7,12.4 +"151",0.28125,0.856666666666667,58.7,7.8,1591,400,115.4,13.3 +"152",0.63855421686747,1.01585365853659,56.7,10.1,758,740,30.3,34.9 +"153",0.171717171717172,0.808080808080808,51.6,7.4,2085,980,152,14.9 +"154",0.378277153558052,0.853113983548766,50.7,7.2,581,880,98.3,12.5 +"155",0.307692307692308,0.445930880713489,61.4,5.4,908,630,204.8,13.3 diff --git a/index.Rmd b/index.Rmd index 6dcc629af..dae936ca9 100644 --- a/index.Rmd +++ b/index.Rmd @@ -1,13 +1,17 @@ --- title: "IODS course project" +author: "Subam Kathet" output: html_document: theme: cosmo - toc: true + toc: yes toc_depth: 2 - fig_caption: true + fig_caption: yes fig_width: 6 fig_height: 4 + pdf_document: + toc: yes + toc_depth: '2' --- *** @@ -15,16 +19,47 @@ output: # Introduction to Open Data Science - Course Project ```{r child = 
"chapter1.Rmd"} +install.packages("tidyverse") # This will include Chapter 1 (that is updated in its own file) in the document. ``` *** ```{r child = "chapter2.Rmd"} +#Testing to see if I can write something in this chunk # This will include Chapter 2 (that is updated in its own file) in the document. +#lets see if I can print the date ?? + +date() +``` + +*** + +```{r child = "chapter3.Rmd"} +#Includes exercise and assignment from exercise 3 - Logistic Regression +``` + +*** + +```{r child = "chapter4.Rmd"} +#This weeks exercises includes tasks from exercise 4, Clustering and classification +``` + +*** + +```{r child = "chapter5.Rmd"} +#This weeks exercises includes tasks from exercise 5, Dimensionality reduction techniques +``` + +*** + +```{r child = "chapter6.Rmd"} +#This weeks exercises includes tasks from exercise 5, Dimensionality reduction techniques ``` *** -(more chapters to be added similarly as we proceed with the course!) + + + diff --git a/index.html b/index.html index fb562889f..639d4e057 100644 --- a/index.html +++ b/index.html @@ -9,36 +9,47 @@ + IODS course project - + + - + + diff --git a/index.pdf b/index.pdf new file mode 100644 index 000000000..6b7dbeb92 Binary files /dev/null and b/index.pdf differ diff --git a/learning2014 copy.csv b/learning2014 copy.csv new file mode 100644 index 000000000..061b49a19 --- /dev/null +++ b/learning2014 copy.csv @@ -0,0 +1,167 @@ +gender,age,attitude,deep,stra,surf,points +F,53,3.7,3.5833333333333335,3.375,2.5833333333333335,25 +M,55,3.1,2.9166666666666665,2.75,3.1666666666666665,12 +F,49,2.5,3.5,3.625,2.25,24 +M,53,3.5,3.5,3.125,2.25,10 +M,49,3.7,3.6666666666666665,3.625,2.8333333333333335,22 +F,38,3.8,4.75,3.625,2.4166666666666665,21 +M,50,3.5,3.8333333333333335,2.25,1.9166666666666667,21 +F,37,2.9,3.25,4,2.8333333333333335,31 +M,37,3.8,4.333333333333333,4.25,2.1666666666666665,24 +F,42,2.1,4,3.5,3,26 +M,37,3.9,3.5833333333333335,3.625,2.6666666666666665,31 
+F,34,3.8,3.8333333333333335,4.75,2.4166666666666665,31 +F,34,2.4,4.25,3.625,2.25,23 +F,34,3,3.3333333333333335,3.5,2.75,25 +M,35,2.6,4.166666666666667,1.75,2.3333333333333335,21 +F,33,4.1,3.6666666666666665,3.875,2.3333333333333335,31 +F,32,2.6,4.083333333333333,1.375,2.9166666666666665,20 +F,44,2.6,3.5,3.25,2.5,22 +M,29,1.7,4.083333333333333,3,3.75,9 +F,30,2.7,4,3.75,2.75,24 +M,27,3.9,3.9166666666666665,2.625,2.3333333333333335,28 +M,29,3.4,4,2.375,2.4166666666666665,30 +F,31,2.7,4,3.625,3,24 +F,37,2.3,3.6666666666666665,2.75,2.4166666666666665,9 +F,26,3.7,3.6666666666666665,1.75,2.8333333333333335,26 +F,26,4.4,4.416666666666667,3.25,3.1666666666666665,32 +M,30,4.1,3.9166666666666665,4,3,32 +F,33,3.7,3.75,3.625,2,33 +F,33,2.5,3.25,2.875,3.5,29 +M,28,3,3.5833333333333335,3,3.75,30 +M,26,3.4,4.916666666666667,1.625,2.5,19 +F,27,3.2,3.5833333333333335,3.25,2.0833333333333335,23 +F,25,2,2.9166666666666665,3.5,2.4166666666666665,19 +F,31,2.4,3.6666666666666665,3,2.5833333333333335,12 +M,20,4.2,4.5,3.25,1.5833333333333333,10 +F,39,1.6,4.083333333333333,1.875,2.8333333333333335,11 +M,38,3.1,3.8333333333333335,4.375,1.8333333333333333,20 +M,24,3.8,3.25,3.625,2.4166666666666665,26 +M,26,3.8,2.3333333333333335,2.5,3.25,31 +M,25,3.3,3.3333333333333335,1.25,3.4166666666666665,20 +F,30,1.7,4.083333333333333,4,3.4166666666666665,23 +F,25,2.5,2.9166666666666665,3,3.1666666666666665,12 +M,30,3.2,3.3333333333333335,2.5,3.5,24 +F,48,3.5,3.8333333333333335,4.875,2.6666666666666665,17 +F,24,3.2,3.6666666666666665,5,2.4166666666666665,29 +F,40,4.2,4.666666666666667,4.375,3.5833333333333335,23 +M,25,3.1,3.75,3.25,2.0833333333333335,28 +F,23,3.9,3.4166666666666665,4,3.75,31 +F,25,1.9,4.166666666666667,3.125,2.9166666666666665,23 +F,23,2.1,2.9166666666666665,2.5,2.9166666666666665,25 +M,27,2.5,4.166666666666667,3.125,2.4166666666666665,18 +M,25,3.2,3.5833333333333335,3.25,3,19 +M,23,3.2,2.8333333333333335,2.125,3.4166666666666665,22 +F,23,2.6,4,2.75,2.9166666666666665,25 
+F,23,2.3,2.9166666666666665,2.375,3.25,21 +F,45,3.8,3,3.125,3.25,9 +F,22,2.8,4.083333333333333,4,2.3333333333333335,28 +F,23,3.3,2.9166666666666665,4,3.25,25 +M,21,4.8,3.5,2.25,2.5,29 +M,21,4,4.333333333333333,3.25,1.75,33 +F,21,4,4.25,3.625,2.25,33 +F,21,4.7,3.4166666666666665,3.625,2.0833333333333335,25 +F,26,2.3,3.0833333333333335,2.5,2.8333333333333335,18 +F,25,3.1,4.583333333333333,1.875,2.8333333333333335,22 +F,26,2.7,3.4166666666666665,2,2.4166666666666665,17 +M,21,4.1,3.4166666666666665,1.875,2.25,25 +F,23,3.4,3.4166666666666665,4,2.8333333333333335,28 +F,22,2.5,3.5833333333333335,2.875,2.25,22 +F,22,2.1,1.5833333333333333,3.875,1.8333333333333333,26 +F,22,1.4,3.3333333333333335,2.5,2.9166666666666665,11 +F,23,1.9,4.333333333333333,2.75,2.9166666666666665,29 +M,22,3.7,4.416666666666667,4.5,2.0833333333333335,22 +M,23,3.2,4.833333333333333,3.375,2.3333333333333335,21 +M,24,2.8,3.0833333333333335,2.625,2.4166666666666665,28 +F,22,4.1,3,4.125,2.75,33 +F,23,2.5,4.083333333333333,2.625,3.25,16 +M,22,2.8,4.083333333333333,2.25,1.75,31 +M,20,3.8,3.75,2.75,2.5833333333333335,22 +M,22,3.1,3.0833333333333335,3,3.3333333333333335,31 +M,21,3.5,4.75,1.625,2.8333333333333335,23 +F,22,3.6,4.25,1.875,2.5,26 +F,23,2.6,4.166666666666667,3.375,2.4166666666666665,12 +M,21,4.4,4.416666666666667,3.75,2.4166666666666665,26 +M,22,4.5,3.8333333333333335,2.125,2.5833333333333335,31 +M,29,3.2,3.3333333333333335,2.375,3,19 +F,29,3.9,3.1666666666666665,2.75,2,30 +F,21,2.5,3.1666666666666665,3.125,3.4166666666666665,12 +M,28,3.3,3.8333333333333335,3.5,2.8333333333333335,17 +F,21,3.3,4.25,2.625,2.25,18 +F,30,3,3.8333333333333335,3.375,2.75,19 +F,21,2.9,3.6666666666666665,2.25,3.9166666666666665,21 +M,23,3.3,3.8333333333333335,3,2.3333333333333335,24 +F,21,3.3,3.8333333333333335,4,2.75,28 +F,21,3.5,3.8333333333333335,3.5,2.75,17 +F,20,3.6,3.6666666666666665,2.625,2.9166666666666665,18 +M,22,3.7,4.333333333333333,2.5,2.0833333333333335,17 +M,21,4.2,3.75,3.75,3.6666666666666665,23 
+M,21,3.2,4.166666666666667,3.625,2.8333333333333335,26 +F,20,5,4,4.125,3.4166666666666665,28 +M,22,4.7,4,4.375,1.5833333333333333,31 +F,20,3.6,4.583333333333333,2.625,2.9166666666666665,27 +F,20,3.6,3.6666666666666665,4,3,25 +M,24,2.9,3.6666666666666665,2.75,2.9166666666666665,23 +F,20,3.5,3.8333333333333335,2.75,2.6666666666666665,21 +F,19,4,2.5833333333333335,1.375,3,27 +F,21,3.5,3.5,2.25,2.75,28 +F,21,3.2,3.0833333333333335,3.625,3.0833333333333335,23 +F,22,2.6,4.25,3.75,2.5,21 +F,25,2,3.1666666666666665,4,2.3333333333333335,25 +F,21,2.7,3.0833333333333335,3.125,3,11 +F,22,3.2,4.166666666666667,3.25,3,19 +F,25,3.3,2.25,2.125,4,24 +F,20,3.9,3.3333333333333335,2.875,3.25,28 +M,24,3.3,3.0833333333333335,1.5,3.5,21 +F,20,3,2.75,2.5,3.5,24 +M,21,3.7,3.25,3.25,3.8333333333333335,24 +F,20,2.5,4,3.625,2.9166666666666665,20 +F,20,2.9,3.5833333333333335,3.875,2.1666666666666665,19 +M,31,3.9,4.083333333333333,3.875,1.6666666666666667,30 +F,20,3.6,4.25,2.375,2.0833333333333335,22 +F,22,2.9,3.4166666666666665,3,2.8333333333333335,16 +F,22,2.1,3.0833333333333335,3.375,3.4166666666666665,16 +M,21,3.1,3.5,2.75,3.3333333333333335,19 +M,22,4,3.6666666666666665,4.5,2.5833333333333335,30 +F,21,3.1,4.25,2.625,2.8333333333333335,23 +F,21,2.3,4.25,2.75,3.3333333333333335,19 +F,21,2.8,3.8333333333333335,3.25,3,18 +F,21,3.7,4.416666666666667,4.125,2.5833333333333335,28 +F,20,2.6,3.5,3.375,2.4166666666666665,21 +F,21,2.4,3.5833333333333335,2.75,3.5833333333333335,19 +F,25,3,3.6666666666666665,4.125,2.0833333333333335,27 +M,21,2.8,2.0833333333333335,3.25,4.333333333333333,24 +F,24,2.9,4.25,2.875,2.6666666666666665,21 +F,20,2.4,3.5833333333333335,2.875,3,20 +M,21,3.1,4,2.375,2.6666666666666665,28 +F,20,1.9,3.3333333333333335,3.875,2.1666666666666665,12 +F,20,2,3.5,2.125,2.6666666666666665,21 +F,18,3.8,3.1666666666666665,4,2.25,28 +F,21,3.4,3.5833333333333335,3.25,2.6666666666666665,31 +F,19,3.7,3.4166666666666665,2.625,3.3333333333333335,18 +F,21,2.9,4.25,2.75,3.5,25 
+F,20,2.3,3.25,4,2.75,19 +M,21,4.1,4.416666666666667,3,2,21 +F,20,2.7,3.25,3.375,2.8333333333333335,16 +F,21,3.5,3.9166666666666665,3.875,3.5,7 +F,20,3.4,3.5833333333333335,3.25,2.5,21 +F,18,3.2,4.5,3.375,3.1666666666666665,17 +M,22,3.3,3.5833333333333335,4.125,3.0833333333333335,22 +F,22,3.3,3.6666666666666665,3.5,2.9166666666666665,18 +M,24,3.5,2.5833333333333335,2,3.1666666666666665,25 +F,19,3.2,4.166666666666667,3.625,2.5,24 +F,20,3.1,3.25,3.375,3.8333333333333335,23 +F,20,2.8,4.333333333333333,2.125,2.25,23 +F,17,1.7,3.9166666666666665,4.625,3.4166666666666665,26 +M,19,1.9,2.6666666666666665,2.5,3.75,12 +F,20,3.5,3.0833333333333335,2.875,3,32 +F,20,2.4,3.75,2.75,2.5833333333333335,22 +F,20,2.1,4.166666666666667,4,3.3333333333333335,20 +F,20,2.9,4.166666666666667,2.375,2.8333333333333335,21 +F,19,1.9,3.25,3.875,3,23 +F,19,2,4.083333333333333,3.375,2.8333333333333335,20 +F,22,4.2,2.9166666666666665,1.75,3.1666666666666665,28 +M,35,4.1,3.8333333333333335,3,2.75,31 +F,18,3.7,3.1666666666666665,2.625,3.4166666666666665,18 +F,19,3.6,3.4166666666666665,2.625,3,30 +M,21,1.8,4.083333333333333,3.375,2.6666666666666665,19 diff --git a/learning2014.csv b/learning2014.csv new file mode 100644 index 000000000..061b49a19 --- /dev/null +++ b/learning2014.csv @@ -0,0 +1,167 @@ +gender,age,attitude,deep,stra,surf,points +F,53,3.7,3.5833333333333335,3.375,2.5833333333333335,25 +M,55,3.1,2.9166666666666665,2.75,3.1666666666666665,12 +F,49,2.5,3.5,3.625,2.25,24 +M,53,3.5,3.5,3.125,2.25,10 +M,49,3.7,3.6666666666666665,3.625,2.8333333333333335,22 +F,38,3.8,4.75,3.625,2.4166666666666665,21 +M,50,3.5,3.8333333333333335,2.25,1.9166666666666667,21 +F,37,2.9,3.25,4,2.8333333333333335,31 +M,37,3.8,4.333333333333333,4.25,2.1666666666666665,24 +F,42,2.1,4,3.5,3,26 +M,37,3.9,3.5833333333333335,3.625,2.6666666666666665,31 +F,34,3.8,3.8333333333333335,4.75,2.4166666666666665,31 +F,34,2.4,4.25,3.625,2.25,23 +F,34,3,3.3333333333333335,3.5,2.75,25 
+M,35,2.6,4.166666666666667,1.75,2.3333333333333335,21 +F,33,4.1,3.6666666666666665,3.875,2.3333333333333335,31 +F,32,2.6,4.083333333333333,1.375,2.9166666666666665,20 +F,44,2.6,3.5,3.25,2.5,22 +M,29,1.7,4.083333333333333,3,3.75,9 +F,30,2.7,4,3.75,2.75,24 +M,27,3.9,3.9166666666666665,2.625,2.3333333333333335,28 +M,29,3.4,4,2.375,2.4166666666666665,30 +F,31,2.7,4,3.625,3,24 +F,37,2.3,3.6666666666666665,2.75,2.4166666666666665,9 +F,26,3.7,3.6666666666666665,1.75,2.8333333333333335,26 +F,26,4.4,4.416666666666667,3.25,3.1666666666666665,32 +M,30,4.1,3.9166666666666665,4,3,32 +F,33,3.7,3.75,3.625,2,33 +F,33,2.5,3.25,2.875,3.5,29 +M,28,3,3.5833333333333335,3,3.75,30 +M,26,3.4,4.916666666666667,1.625,2.5,19 +F,27,3.2,3.5833333333333335,3.25,2.0833333333333335,23 +F,25,2,2.9166666666666665,3.5,2.4166666666666665,19 +F,31,2.4,3.6666666666666665,3,2.5833333333333335,12 +M,20,4.2,4.5,3.25,1.5833333333333333,10 +F,39,1.6,4.083333333333333,1.875,2.8333333333333335,11 +M,38,3.1,3.8333333333333335,4.375,1.8333333333333333,20 +M,24,3.8,3.25,3.625,2.4166666666666665,26 +M,26,3.8,2.3333333333333335,2.5,3.25,31 +M,25,3.3,3.3333333333333335,1.25,3.4166666666666665,20 +F,30,1.7,4.083333333333333,4,3.4166666666666665,23 +F,25,2.5,2.9166666666666665,3,3.1666666666666665,12 +M,30,3.2,3.3333333333333335,2.5,3.5,24 +F,48,3.5,3.8333333333333335,4.875,2.6666666666666665,17 +F,24,3.2,3.6666666666666665,5,2.4166666666666665,29 +F,40,4.2,4.666666666666667,4.375,3.5833333333333335,23 +M,25,3.1,3.75,3.25,2.0833333333333335,28 +F,23,3.9,3.4166666666666665,4,3.75,31 +F,25,1.9,4.166666666666667,3.125,2.9166666666666665,23 +F,23,2.1,2.9166666666666665,2.5,2.9166666666666665,25 +M,27,2.5,4.166666666666667,3.125,2.4166666666666665,18 +M,25,3.2,3.5833333333333335,3.25,3,19 +M,23,3.2,2.8333333333333335,2.125,3.4166666666666665,22 +F,23,2.6,4,2.75,2.9166666666666665,25 +F,23,2.3,2.9166666666666665,2.375,3.25,21 +F,45,3.8,3,3.125,3.25,9 +F,22,2.8,4.083333333333333,4,2.3333333333333335,28 
+F,23,3.3,2.9166666666666665,4,3.25,25 +M,21,4.8,3.5,2.25,2.5,29 +M,21,4,4.333333333333333,3.25,1.75,33 +F,21,4,4.25,3.625,2.25,33 +F,21,4.7,3.4166666666666665,3.625,2.0833333333333335,25 +F,26,2.3,3.0833333333333335,2.5,2.8333333333333335,18 +F,25,3.1,4.583333333333333,1.875,2.8333333333333335,22 +F,26,2.7,3.4166666666666665,2,2.4166666666666665,17 +M,21,4.1,3.4166666666666665,1.875,2.25,25 +F,23,3.4,3.4166666666666665,4,2.8333333333333335,28 +F,22,2.5,3.5833333333333335,2.875,2.25,22 +F,22,2.1,1.5833333333333333,3.875,1.8333333333333333,26 +F,22,1.4,3.3333333333333335,2.5,2.9166666666666665,11 +F,23,1.9,4.333333333333333,2.75,2.9166666666666665,29 +M,22,3.7,4.416666666666667,4.5,2.0833333333333335,22 +M,23,3.2,4.833333333333333,3.375,2.3333333333333335,21 +M,24,2.8,3.0833333333333335,2.625,2.4166666666666665,28 +F,22,4.1,3,4.125,2.75,33 +F,23,2.5,4.083333333333333,2.625,3.25,16 +M,22,2.8,4.083333333333333,2.25,1.75,31 +M,20,3.8,3.75,2.75,2.5833333333333335,22 +M,22,3.1,3.0833333333333335,3,3.3333333333333335,31 +M,21,3.5,4.75,1.625,2.8333333333333335,23 +F,22,3.6,4.25,1.875,2.5,26 +F,23,2.6,4.166666666666667,3.375,2.4166666666666665,12 +M,21,4.4,4.416666666666667,3.75,2.4166666666666665,26 +M,22,4.5,3.8333333333333335,2.125,2.5833333333333335,31 +M,29,3.2,3.3333333333333335,2.375,3,19 +F,29,3.9,3.1666666666666665,2.75,2,30 +F,21,2.5,3.1666666666666665,3.125,3.4166666666666665,12 +M,28,3.3,3.8333333333333335,3.5,2.8333333333333335,17 +F,21,3.3,4.25,2.625,2.25,18 +F,30,3,3.8333333333333335,3.375,2.75,19 +F,21,2.9,3.6666666666666665,2.25,3.9166666666666665,21 +M,23,3.3,3.8333333333333335,3,2.3333333333333335,24 +F,21,3.3,3.8333333333333335,4,2.75,28 +F,21,3.5,3.8333333333333335,3.5,2.75,17 +F,20,3.6,3.6666666666666665,2.625,2.9166666666666665,18 +M,22,3.7,4.333333333333333,2.5,2.0833333333333335,17 +M,21,4.2,3.75,3.75,3.6666666666666665,23 +M,21,3.2,4.166666666666667,3.625,2.8333333333333335,26 +F,20,5,4,4.125,3.4166666666666665,28 
+M,22,4.7,4,4.375,1.5833333333333333,31 +F,20,3.6,4.583333333333333,2.625,2.9166666666666665,27 +F,20,3.6,3.6666666666666665,4,3,25 +M,24,2.9,3.6666666666666665,2.75,2.9166666666666665,23 +F,20,3.5,3.8333333333333335,2.75,2.6666666666666665,21 +F,19,4,2.5833333333333335,1.375,3,27 +F,21,3.5,3.5,2.25,2.75,28 +F,21,3.2,3.0833333333333335,3.625,3.0833333333333335,23 +F,22,2.6,4.25,3.75,2.5,21 +F,25,2,3.1666666666666665,4,2.3333333333333335,25 +F,21,2.7,3.0833333333333335,3.125,3,11 +F,22,3.2,4.166666666666667,3.25,3,19 +F,25,3.3,2.25,2.125,4,24 +F,20,3.9,3.3333333333333335,2.875,3.25,28 +M,24,3.3,3.0833333333333335,1.5,3.5,21 +F,20,3,2.75,2.5,3.5,24 +M,21,3.7,3.25,3.25,3.8333333333333335,24 +F,20,2.5,4,3.625,2.9166666666666665,20 +F,20,2.9,3.5833333333333335,3.875,2.1666666666666665,19 +M,31,3.9,4.083333333333333,3.875,1.6666666666666667,30 +F,20,3.6,4.25,2.375,2.0833333333333335,22 +F,22,2.9,3.4166666666666665,3,2.8333333333333335,16 +F,22,2.1,3.0833333333333335,3.375,3.4166666666666665,16 +M,21,3.1,3.5,2.75,3.3333333333333335,19 +M,22,4,3.6666666666666665,4.5,2.5833333333333335,30 +F,21,3.1,4.25,2.625,2.8333333333333335,23 +F,21,2.3,4.25,2.75,3.3333333333333335,19 +F,21,2.8,3.8333333333333335,3.25,3,18 +F,21,3.7,4.416666666666667,4.125,2.5833333333333335,28 +F,20,2.6,3.5,3.375,2.4166666666666665,21 +F,21,2.4,3.5833333333333335,2.75,3.5833333333333335,19 +F,25,3,3.6666666666666665,4.125,2.0833333333333335,27 +M,21,2.8,2.0833333333333335,3.25,4.333333333333333,24 +F,24,2.9,4.25,2.875,2.6666666666666665,21 +F,20,2.4,3.5833333333333335,2.875,3,20 +M,21,3.1,4,2.375,2.6666666666666665,28 +F,20,1.9,3.3333333333333335,3.875,2.1666666666666665,12 +F,20,2,3.5,2.125,2.6666666666666665,21 +F,18,3.8,3.1666666666666665,4,2.25,28 +F,21,3.4,3.5833333333333335,3.25,2.6666666666666665,31 +F,19,3.7,3.4166666666666665,2.625,3.3333333333333335,18 +F,21,2.9,4.25,2.75,3.5,25 +F,20,2.3,3.25,4,2.75,19 +M,21,4.1,4.416666666666667,3,2,21 +F,20,2.7,3.25,3.375,2.8333333333333335,16 
+F,21,3.5,3.9166666666666665,3.875,3.5,7 +F,20,3.4,3.5833333333333335,3.25,2.5,21 +F,18,3.2,4.5,3.375,3.1666666666666665,17 +M,22,3.3,3.5833333333333335,4.125,3.0833333333333335,22 +F,22,3.3,3.6666666666666665,3.5,2.9166666666666665,18 +M,24,3.5,2.5833333333333335,2,3.1666666666666665,25 +F,19,3.2,4.166666666666667,3.625,2.5,24 +F,20,3.1,3.25,3.375,3.8333333333333335,23 +F,20,2.8,4.333333333333333,2.125,2.25,23 +F,17,1.7,3.9166666666666665,4.625,3.4166666666666665,26 +M,19,1.9,2.6666666666666665,2.5,3.75,12 +F,20,3.5,3.0833333333333335,2.875,3,32 +F,20,2.4,3.75,2.75,2.5833333333333335,22 +F,20,2.1,4.166666666666667,4,3.3333333333333335,20 +F,20,2.9,4.166666666666667,2.375,2.8333333333333335,21 +F,19,1.9,3.25,3.875,3,23 +F,19,2,4.083333333333333,3.375,2.8333333333333335,20 +F,22,4.2,2.9166666666666665,1.75,3.1666666666666665,28 +M,35,4.1,3.8333333333333335,3,2.75,31 +F,18,3.7,3.1666666666666665,2.625,3.4166666666666665,18 +F,19,3.6,3.4166666666666665,2.625,3,30 +M,21,1.8,4.083333333333333,3.375,2.6666666666666665,19 diff --git a/meet_and_repeat.R b/meet_and_repeat.R new file mode 100644 index 000000000..6ebb07b7e --- /dev/null +++ b/meet_and_repeat.R @@ -0,0 +1,116 @@ +#Name: Subam Kathet + +#Date: 11/12/2022 + +#**Introduction to open data science**# + +#Week 6: Analysis of longitudinal data + +#This weeks learning tasks includes exercises from week 6 - Analysis of longitudinal Data + +#All exercises of this week are based on the Chapters 8 and 9 of Vehkalahti and Everitt (2019) + +#included in the special edition MABS4IODS (Part VI) + +#For more information https://github.com/KimmoVehkalahti/MABS + +#Data wrangling exercise will be completed by preparing two data sets for Analysis exercise + +#Important skill to work on is "Converting the data between the wide form and the long form" + +#**LETS START THE WRANGLING PROCESS**# + +#loading the packages first !! 
+ +library(dplyr) +library(tidyr) +library(readr) + +###**STEP 1**### Loading the data set + +# Read the BPRS data +BPRS <- read.table("https://raw.githubusercontent.com/KimmoVehkalahti/MABS/master/Examples/data/BPRS.txt", sep =" ", header = T) + +# Look at the (column) names of BPRS +colnames(BPRS) + +# Look at the structure of BPRS +str(BPRS) + +# Print out summaries of the variables +summary(BPRS); dim(BPRS) + +#BPRS data set has 40 observations and 11 variable + +# Read the RATS data +RATS <- read.table("https://raw.githubusercontent.com/KimmoVehkalahti/MABS/master/Examples/data/rats.txt", sep ="\t", header = T) + +# Look at the (column) names of RATS +colnames(RATS) + +# Look at the structure of RATS +str(RATS) + +# Print out summaries of the variables +summary(RATS); dim(RATS) + +#RATS data set has 16 observations and a single variable (this might change after wrangling, lets seee !!) + +###**STEP 2**### Converting the categorical variables to factors + +#BPRS data +BPRS$treatment <- factor(BPRS$treatment) +BPRS$subject <- factor(BPRS$subject) + +#RATS data +RATS$ID <- factor(RATS$ID) +RATS$Group <- factor(RATS$Group) + +###**STEP 3**### Converting the data sets to long form + +# Convert to long form and extract the week number (lets write the codes together) + +BPRSL <- pivot_longer(BPRS, cols = -c(treatment, subject), + names_to = "weeks", values_to = "bprs") %>% + arrange(weeks) #order by weeks variable + +# Extract the week number +BPRSL <- BPRSL %>% + mutate(week = as.integer(substr(weeks, 5, 5))) + +# Take a glimpse at the BPRSL data +glimpse(BPRSL) + +#And same with RATS data + +RATSL <- pivot_longer(RATS, cols = -c(ID, Group), + names_to = "WD", + values_to = "Weight") %>% + mutate(Time = as.integer(substr(WD, 3, 4))) %>% # Extract the week number + arrange(Time) #order by Time variable + +# Take a glimpse at the RATSL data +glimpse(RATSL) + + +###**STEP 4**### Checking the variables +str(BPRS); str(BPRSL) +str(RATS); str(RATSL) + + +###**STEP 4**### 
Save the data as .csv + +write.csv(BPRSL, "BPRSL.csv") +write_csv(RATSL, "RATSL.csv") + +#Lets check +read_csv("BPRSL.csv") +read_csv("RATSL.csv") + +###All done !!!! + +###DATA WRANGLING COMPLETE !!!! + + + + diff --git a/student-mat.csv b/student-mat.csv new file mode 100755 index 000000000..58efc299b --- /dev/null +++ b/student-mat.csv @@ -0,0 +1,396 @@ +school;sex;age;address;famsize;Pstatus;Medu;Fedu;Mjob;Fjob;reason;guardian;traveltime;studytime;failures;schoolsup;famsup;paid;activities;nursery;higher;internet;romantic;famrel;freetime;goout;Dalc;Walc;health;absences;G1;G2;G3 +"GP";"F";18;"U";"GT3";"A";4;4;"at_home";"teacher";"course";"mother";2;2;0;"yes";"no";"no";"no";"yes";"yes";"no";"no";4;3;4;1;1;3;6;"5";"6";6 +"GP";"F";17;"U";"GT3";"T";1;1;"at_home";"other";"course";"father";1;2;0;"no";"yes";"no";"no";"no";"yes";"yes";"no";5;3;3;1;1;3;4;"5";"5";6 +"GP";"F";15;"U";"LE3";"T";1;1;"at_home";"other";"other";"mother";1;2;3;"yes";"no";"yes";"no";"yes";"yes";"yes";"no";4;3;2;2;3;3;10;"7";"8";10 +"GP";"F";15;"U";"GT3";"T";4;2;"health";"services";"home";"mother";1;3;0;"no";"yes";"yes";"yes";"yes";"yes";"yes";"yes";3;2;2;1;1;5;2;"15";"14";15 +"GP";"F";16;"U";"GT3";"T";3;3;"other";"other";"home";"father";1;2;0;"no";"yes";"yes";"no";"yes";"yes";"no";"no";4;3;2;1;2;5;4;"6";"10";10 +"GP";"M";16;"U";"LE3";"T";4;3;"services";"other";"reputation";"mother";1;2;0;"no";"yes";"yes";"yes";"yes";"yes";"yes";"no";5;4;2;1;2;5;10;"15";"15";15 +"GP";"M";16;"U";"LE3";"T";2;2;"other";"other";"home";"mother";1;2;0;"no";"no";"no";"no";"yes";"yes";"yes";"no";4;4;4;1;1;3;0;"12";"12";11 +"GP";"F";17;"U";"GT3";"A";4;4;"other";"teacher";"home";"mother";2;2;0;"yes";"yes";"no";"no";"yes";"yes";"no";"no";4;1;4;1;1;1;6;"6";"5";6 +"GP";"M";15;"U";"LE3";"A";3;2;"services";"other";"home";"mother";1;2;0;"no";"yes";"yes";"no";"yes";"yes";"yes";"no";4;2;2;1;1;1;0;"16";"18";19 
+"GP";"M";15;"U";"GT3";"T";3;4;"other";"other";"home";"mother";1;2;0;"no";"yes";"yes";"yes";"yes";"yes";"yes";"no";5;5;1;1;1;5;0;"14";"15";15 +"GP";"F";15;"U";"GT3";"T";4;4;"teacher";"health";"reputation";"mother";1;2;0;"no";"yes";"yes";"no";"yes";"yes";"yes";"no";3;3;3;1;2;2;0;"10";"8";9 +"GP";"F";15;"U";"GT3";"T";2;1;"services";"other";"reputation";"father";3;3;0;"no";"yes";"no";"yes";"yes";"yes";"yes";"no";5;2;2;1;1;4;4;"10";"12";12 +"GP";"M";15;"U";"LE3";"T";4;4;"health";"services";"course";"father";1;1;0;"no";"yes";"yes";"yes";"yes";"yes";"yes";"no";4;3;3;1;3;5;2;"14";"14";14 +"GP";"M";15;"U";"GT3";"T";4;3;"teacher";"other";"course";"mother";2;2;0;"no";"yes";"yes";"no";"yes";"yes";"yes";"no";5;4;3;1;2;3;2;"10";"10";11 +"GP";"M";15;"U";"GT3";"A";2;2;"other";"other";"home";"other";1;3;0;"no";"yes";"no";"no";"yes";"yes";"yes";"yes";4;5;2;1;1;3;0;"14";"16";16 +"GP";"F";16;"U";"GT3";"T";4;4;"health";"other";"home";"mother";1;1;0;"no";"yes";"no";"no";"yes";"yes";"yes";"no";4;4;4;1;2;2;4;"14";"14";14 +"GP";"F";16;"U";"GT3";"T";4;4;"services";"services";"reputation";"mother";1;3;0;"no";"yes";"yes";"yes";"yes";"yes";"yes";"no";3;2;3;1;2;2;6;"13";"14";14 +"GP";"F";16;"U";"GT3";"T";3;3;"other";"other";"reputation";"mother";3;2;0;"yes";"yes";"no";"yes";"yes";"yes";"no";"no";5;3;2;1;1;4;4;"8";"10";10 +"GP";"M";17;"U";"GT3";"T";3;2;"services";"services";"course";"mother";1;1;3;"no";"yes";"no";"yes";"yes";"yes";"yes";"no";5;5;5;2;4;5;16;"6";"5";5 +"GP";"M";16;"U";"LE3";"T";4;3;"health";"other";"home";"father";1;1;0;"no";"no";"yes";"yes";"yes";"yes";"yes";"no";3;1;3;1;3;5;4;"8";"10";10 +"GP";"M";15;"U";"GT3";"T";4;3;"teacher";"other";"reputation";"mother";1;2;0;"no";"no";"no";"no";"yes";"yes";"yes";"no";4;4;1;1;1;1;0;"13";"14";15 +"GP";"M";15;"U";"GT3";"T";4;4;"health";"health";"other";"father";1;1;0;"no";"yes";"yes";"no";"yes";"yes";"yes";"no";5;4;2;1;1;5;0;"12";"15";15 
+"GP";"M";16;"U";"LE3";"T";4;2;"teacher";"other";"course";"mother";1;2;0;"no";"no";"no";"yes";"yes";"yes";"yes";"no";4;5;1;1;3;5;2;"15";"15";16 +"GP";"M";16;"U";"LE3";"T";2;2;"other";"other";"reputation";"mother";2;2;0;"no";"yes";"no";"yes";"yes";"yes";"yes";"no";5;4;4;2;4;5;0;"13";"13";12 +"GP";"F";15;"R";"GT3";"T";2;4;"services";"health";"course";"mother";1;3;0;"yes";"yes";"yes";"yes";"yes";"yes";"yes";"no";4;3;2;1;1;5;2;"10";"9";8 +"GP";"F";16;"U";"GT3";"T";2;2;"services";"services";"home";"mother";1;1;2;"no";"yes";"yes";"no";"no";"yes";"yes";"no";1;2;2;1;3;5;14;"6";"9";8 +"GP";"M";15;"U";"GT3";"T";2;2;"other";"other";"home";"mother";1;1;0;"no";"yes";"yes";"no";"yes";"yes";"yes";"no";4;2;2;1;2;5;2;"12";"12";11 +"GP";"M";15;"U";"GT3";"T";4;2;"health";"services";"other";"mother";1;1;0;"no";"no";"yes";"no";"yes";"yes";"yes";"no";2;2;4;2;4;1;4;"15";"16";15 +"GP";"M";16;"U";"LE3";"A";3;4;"services";"other";"home";"mother";1;2;0;"yes";"yes";"no";"yes";"yes";"yes";"yes";"no";5;3;3;1;1;5;4;"11";"11";11 +"GP";"M";16;"U";"GT3";"T";4;4;"teacher";"teacher";"home";"mother";1;2;0;"no";"yes";"yes";"yes";"yes";"yes";"yes";"yes";4;4;5;5;5;5;16;"10";"12";11 +"GP";"M";15;"U";"GT3";"T";4;4;"health";"services";"home";"mother";1;2;0;"no";"yes";"yes";"no";"no";"yes";"yes";"no";5;4;2;3;4;5;0;"9";"11";12 +"GP";"M";15;"U";"GT3";"T";4;4;"services";"services";"reputation";"mother";2;2;0;"no";"yes";"no";"yes";"yes";"yes";"yes";"no";4;3;1;1;1;5;0;"17";"16";17 +"GP";"M";15;"R";"GT3";"T";4;3;"teacher";"at_home";"course";"mother";1;2;0;"no";"yes";"no";"yes";"yes";"yes";"yes";"yes";4;5;2;1;1;5;0;"17";"16";16 +"GP";"M";15;"U";"LE3";"T";3;3;"other";"other";"course";"mother";1;2;0;"no";"no";"no";"yes";"no";"yes";"yes";"no";5;3;2;1;1;2;0;"8";"10";12 +"GP";"M";16;"U";"GT3";"T";3;2;"other";"other";"home";"mother";1;1;0;"no";"yes";"yes";"no";"no";"yes";"yes";"no";5;4;3;1;1;5;0;"12";"14";15 
+"GP";"F";15;"U";"GT3";"T";2;3;"other";"other";"other";"father";2;1;0;"no";"yes";"no";"yes";"yes";"yes";"no";"no";3;5;1;1;1;5;0;"8";"7";6 +"GP";"M";15;"U";"LE3";"T";4;3;"teacher";"services";"home";"mother";1;3;0;"no";"yes";"no";"yes";"yes";"yes";"yes";"no";5;4;3;1;1;4;2;"15";"16";18 +"GP";"M";16;"R";"GT3";"A";4;4;"other";"teacher";"reputation";"mother";2;3;0;"no";"yes";"no";"yes";"yes";"yes";"yes";"yes";2;4;3;1;1;5;7;"15";"16";15 +"GP";"F";15;"R";"GT3";"T";3;4;"services";"health";"course";"mother";1;3;0;"yes";"yes";"yes";"yes";"yes";"yes";"yes";"no";4;3;2;1;1;5;2;"12";"12";11 +"GP";"F";15;"R";"GT3";"T";2;2;"at_home";"other";"reputation";"mother";1;1;0;"yes";"yes";"yes";"yes";"yes";"yes";"no";"no";4;3;1;1;1;2;8;"14";"13";13 +"GP";"F";16;"U";"LE3";"T";2;2;"other";"other";"home";"mother";2;2;1;"no";"yes";"no";"yes";"no";"yes";"yes";"yes";3;3;3;1;2;3;25;"7";"10";11 +"GP";"M";15;"U";"LE3";"T";4;4;"teacher";"other";"home";"other";1;1;0;"no";"yes";"no";"no";"no";"yes";"yes";"yes";5;4;3;2;4;5;8;"12";"12";12 +"GP";"M";15;"U";"GT3";"T";4;4;"services";"teacher";"course";"father";1;2;0;"no";"yes";"no";"yes";"yes";"yes";"yes";"no";4;3;3;1;1;5;2;"19";"18";18 +"GP";"M";15;"U";"GT3";"T";2;2;"services";"services";"course";"father";1;1;0;"yes";"yes";"no";"no";"yes";"yes";"yes";"no";5;4;1;1;1;1;0;"8";"8";11 +"GP";"F";16;"U";"LE3";"T";2;2;"other";"at_home";"course";"father";2;2;1;"yes";"no";"no";"yes";"yes";"yes";"yes";"no";4;3;3;2;2;5;14;"10";"10";9 +"GP";"F";15;"U";"LE3";"A";4;3;"other";"other";"course";"mother";1;2;0;"yes";"yes";"yes";"yes";"yes";"yes";"yes";"yes";5;2;2;1;1;5;8;"8";"8";6 +"GP";"F";16;"U";"LE3";"A";3;3;"other";"services";"home";"mother";1;2;0;"no";"yes";"no";"no";"yes";"yes";"yes";"no";2;3;5;1;4;3;12;"11";"12";11 +"GP";"M";16;"U";"GT3";"T";4;3;"health";"services";"reputation";"mother";1;4;0;"no";"no";"no";"yes";"yes";"yes";"yes";"no";4;2;2;1;1;2;4;"19";"19";20 
+"GP";"M";15;"U";"GT3";"T";4;2;"teacher";"other";"home";"mother";1;2;0;"no";"yes";"yes";"no";"yes";"yes";"no";"no";4;3;3;2;2;5;2;"15";"15";14 +"GP";"F";15;"U";"GT3";"T";4;4;"services";"teacher";"other";"father";1;2;1;"yes";"yes";"no";"yes";"no";"yes";"yes";"no";4;4;4;1;1;3;2;"7";"7";7 +"GP";"F";16;"U";"LE3";"T";2;2;"services";"services";"course";"mother";3;2;0;"no";"yes";"yes";"no";"yes";"yes";"yes";"no";4;3;3;2;3;4;2;"12";"13";13 +"GP";"F";15;"U";"LE3";"T";4;2;"health";"other";"other";"mother";1;2;0;"no";"yes";"yes";"no";"yes";"yes";"yes";"no";4;3;3;1;1;5;2;"11";"13";13 +"GP";"M";15;"U";"LE3";"A";4;2;"health";"health";"other";"father";2;1;1;"no";"no";"no";"no";"yes";"yes";"no";"no";5;5;5;3;4;5;6;"11";"11";10 +"GP";"F";15;"U";"GT3";"T";4;4;"services";"services";"course";"mother";1;1;0;"yes";"yes";"yes";"no";"yes";"yes";"yes";"no";3;3;4;2;3;5;0;"8";"10";11 +"GP";"F";15;"U";"LE3";"A";3;3;"other";"other";"other";"mother";1;1;0;"no";"no";"yes";"no";"yes";"yes";"yes";"no";5;3;4;4;4;1;6;"10";"13";13 +"GP";"F";16;"U";"GT3";"A";2;1;"other";"other";"other";"mother";1;2;0;"no";"no";"yes";"yes";"yes";"yes";"yes";"yes";5;3;4;1;1;2;8;"8";"9";10 +"GP";"F";15;"U";"GT3";"A";4;3;"services";"services";"reputation";"mother";1;2;0;"no";"yes";"yes";"yes";"yes";"yes";"yes";"no";4;3;2;1;1;1;0;"14";"15";15 +"GP";"M";15;"U";"GT3";"T";4;4;"teacher";"health";"reputation";"mother";1;2;0;"no";"yes";"no";"yes";"yes";"yes";"no";"no";3;2;2;1;1;5;4;"14";"15";15 +"GP";"M";15;"U";"LE3";"T";1;2;"other";"at_home";"home";"father";1;2;0;"yes";"yes";"no";"yes";"yes";"yes";"yes";"no";4;3;2;1;1;5;2;"9";"10";9 +"GP";"F";16;"U";"GT3";"T";4;2;"services";"other";"course";"mother";1;2;0;"no";"yes";"no";"no";"yes";"yes";"yes";"no";4;2;3;1;1;5;2;"15";"16";16 +"GP";"F";16;"R";"GT3";"T";4;4;"health";"teacher";"other";"mother";1;2;0;"no";"yes";"no";"yes";"yes";"yes";"no";"no";2;4;4;2;3;4;6;"10";"11";11 
+"GP";"F";16;"U";"GT3";"T";1;1;"services";"services";"course";"father";4;1;0;"yes";"yes";"no";"yes";"no";"yes";"yes";"yes";5;5;5;5;5;5;6;"10";"8";11 +"GP";"F";16;"U";"LE3";"T";1;2;"other";"services";"reputation";"father";1;2;0;"yes";"no";"no";"yes";"yes";"yes";"yes";"no";4;4;3;1;1;1;4;"8";"10";9 +"GP";"F";16;"U";"GT3";"T";4;3;"teacher";"health";"home";"mother";1;3;0;"yes";"yes";"yes";"yes";"yes";"yes";"yes";"no";3;4;4;2;4;4;2;"10";"9";9 +"GP";"F";15;"U";"LE3";"T";4;3;"services";"services";"reputation";"father";1;2;0;"yes";"no";"no";"yes";"yes";"yes";"yes";"yes";4;4;4;2;4;2;0;"10";"10";10 +"GP";"F";16;"U";"LE3";"T";4;3;"teacher";"services";"course";"mother";3;2;0;"no";"yes";"no";"yes";"yes";"yes";"yes";"no";5;4;3;1;2;1;2;"16";"15";15 +"GP";"M";15;"U";"GT3";"A";4;4;"other";"services";"reputation";"mother";1;4;0;"no";"yes";"no";"yes";"no";"yes";"yes";"yes";1;3;3;5;5;3;4;"13";"13";12 +"GP";"F";16;"U";"GT3";"T";3;1;"services";"other";"course";"mother";1;4;0;"yes";"yes";"yes";"no";"yes";"yes";"yes";"no";4;3;3;1;2;5;4;"7";"7";6 +"GP";"F";15;"R";"LE3";"T";2;2;"health";"services";"reputation";"mother";2;2;0;"yes";"yes";"yes";"no";"yes";"yes";"yes";"no";4;1;3;1;3;4;2;"8";"9";8 +"GP";"F";15;"R";"LE3";"T";3;1;"other";"other";"reputation";"father";2;4;0;"no";"yes";"no";"no";"no";"yes";"yes";"no";4;4;2;2;3;3;12;"16";"16";16 +"GP";"M";16;"U";"GT3";"T";3;1;"other";"other";"reputation";"father";2;4;0;"no";"yes";"yes";"no";"yes";"yes";"yes";"no";4;3;2;1;1;5;0;"13";"15";15 +"GP";"M";15;"U";"GT3";"T";4;2;"other";"other";"course";"mother";1;4;0;"no";"no";"no";"no";"yes";"yes";"yes";"no";3;3;3;1;1;3;0;"10";"10";10 +"GP";"F";15;"R";"GT3";"T";1;1;"other";"other";"reputation";"mother";1;2;2;"yes";"yes";"no";"no";"no";"yes";"yes";"yes";3;3;4;2;4;5;2;"8";"6";5 +"GP";"M";16;"U";"GT3";"T";3;1;"other";"other";"reputation";"mother";1;1;0;"no";"no";"no";"yes";"yes";"yes";"no";"no";5;3;2;2;2;5;2;"12";"12";14 
+"GP";"F";16;"U";"GT3";"T";3;3;"other";"services";"home";"mother";1;2;0;"yes";"yes";"yes";"yes";"yes";"yes";"yes";"no";4;3;3;2;4;5;54;"11";"12";11 +"GP";"M";15;"U";"GT3";"T";4;3;"teacher";"other";"home";"mother";1;2;0;"no";"yes";"yes";"yes";"yes";"yes";"yes";"no";4;3;3;2;3;5;6;"9";"9";10 +"GP";"M";15;"U";"GT3";"T";4;0;"teacher";"other";"course";"mother";2;4;0;"no";"no";"no";"yes";"yes";"yes";"yes";"no";3;4;3;1;1;1;8;"11";"11";10 +"GP";"F";16;"U";"GT3";"T";2;2;"other";"other";"reputation";"mother";1;4;0;"no";"no";"yes";"no";"yes";"yes";"yes";"yes";5;2;3;1;3;3;0;"11";"11";11 +"GP";"M";17;"U";"GT3";"T";2;1;"other";"other";"home";"mother";2;1;3;"yes";"yes";"no";"yes";"yes";"no";"yes";"no";4;5;1;1;1;3;2;"8";"8";10 +"GP";"F";16;"U";"GT3";"T";3;4;"at_home";"other";"course";"mother";1;2;0;"no";"yes";"no";"no";"yes";"yes";"yes";"no";2;4;3;1;2;3;12;"5";"5";5 +"GP";"M";15;"U";"GT3";"T";2;3;"other";"services";"course";"father";1;1;0;"yes";"yes";"yes";"yes";"no";"yes";"yes";"yes";3;2;2;1;3;3;2;"10";"12";12 +"GP";"M";15;"U";"GT3";"T";2;3;"other";"other";"home";"mother";1;3;0;"yes";"no";"yes";"no";"no";"yes";"yes";"no";5;3;2;1;2;5;4;"11";"10";11 +"GP";"F";15;"U";"LE3";"T";3;2;"services";"other";"reputation";"mother";1;2;0;"no";"yes";"yes";"no";"yes";"yes";"yes";"no";4;4;4;1;1;5;10;"7";"6";6 +"GP";"M";15;"U";"LE3";"T";2;2;"services";"services";"home";"mother";2;2;0;"no";"no";"yes";"yes";"yes";"yes";"yes";"no";5;3;3;1;3;4;4;"15";"15";15 +"GP";"F";15;"U";"GT3";"T";1;1;"other";"other";"home";"father";1;2;0;"no";"yes";"no";"yes";"no";"yes";"yes";"no";4;3;2;2;3;4;2;"9";"10";10 +"GP";"F";15;"U";"GT3";"T";4;4;"services";"services";"reputation";"father";2;2;2;"no";"no";"yes";"no";"yes";"yes";"yes";"yes";4;4;4;2;3;5;6;"7";"9";8 +"GP";"F";16;"U";"LE3";"T";2;2;"at_home";"other";"course";"mother";1;2;0;"no";"yes";"no";"no";"yes";"yes";"no";"no";4;3;4;1;2;2;4;"8";"7";6 
+"GP";"F";15;"U";"GT3";"T";4;2;"other";"other";"reputation";"mother";1;3;0;"no";"yes";"no";"yes";"yes";"yes";"yes";"no";5;3;3;1;3;1;4;"13";"14";14 +"GP";"M";16;"U";"GT3";"T";2;2;"services";"other";"reputation";"father";2;2;1;"no";"no";"yes";"yes";"no";"yes";"yes";"no";4;4;2;1;1;3;12;"11";"10";10 +"GP";"M";16;"U";"LE3";"A";4;4;"teacher";"health";"reputation";"mother";1;2;0;"no";"yes";"no";"no";"yes";"yes";"no";"no";4;1;3;3;5;5;18;"8";"6";7 +"GP";"F";16;"U";"GT3";"T";3;3;"other";"other";"home";"mother";1;3;0;"no";"yes";"yes";"no";"yes";"yes";"yes";"yes";4;3;3;1;3;4;0;"7";"7";8 +"GP";"F";15;"U";"GT3";"T";4;3;"services";"other";"reputation";"mother";1;1;0;"no";"no";"yes";"yes";"yes";"yes";"yes";"no";4;5;5;1;3;1;4;"16";"17";18 +"GP";"F";16;"U";"LE3";"T";3;1;"other";"other";"home";"father";1;2;0;"yes";"yes";"no";"no";"yes";"yes";"no";"no";3;3;3;2;3;2;4;"7";"6";6 +"GP";"F";16;"U";"GT3";"T";4;2;"teacher";"services";"home";"mother";2;2;0;"no";"yes";"yes";"yes";"yes";"yes";"yes";"no";5;3;3;1;1;1;0;"11";"10";10 +"GP";"M";15;"U";"LE3";"T";2;2;"services";"health";"reputation";"mother";1;4;0;"no";"yes";"no";"yes";"yes";"yes";"yes";"no";4;3;4;1;1;4;6;"11";"13";14 +"GP";"F";15;"R";"GT3";"T";1;1;"at_home";"other";"home";"mother";2;4;1;"yes";"yes";"yes";"yes";"yes";"yes";"yes";"no";3;1;2;1;1;1;2;"7";"10";10 +"GP";"M";16;"R";"GT3";"T";4;3;"services";"other";"reputation";"mother";2;1;0;"yes";"yes";"no";"yes";"no";"yes";"yes";"no";3;3;3;1;1;4;2;"11";"15";15 +"GP";"F";16;"U";"GT3";"T";2;1;"other";"other";"course";"mother";1;2;0;"no";"yes";"yes";"no";"yes";"yes";"no";"yes";4;3;5;1;1;5;2;"8";"9";10 +"GP";"F";16;"U";"GT3";"T";4;4;"other";"other";"reputation";"mother";1;1;0;"no";"no";"no";"yes";"no";"yes";"yes";"no";5;3;4;1;2;1;6;"11";"14";14 +"GP";"F";16;"U";"GT3";"T";4;3;"other";"at_home";"course";"mother";1;3;0;"yes";"yes";"yes";"no";"yes";"yes";"yes";"no";5;3;5;1;1;3;0;"7";"9";8 
+"GP";"M";16;"U";"GT3";"T";4;4;"services";"services";"other";"mother";1;1;0;"yes";"yes";"yes";"yes";"yes";"yes";"yes";"no";4;5;5;5;5;4;14;"7";"7";5 +"GP";"M";16;"U";"GT3";"T";4;4;"services";"teacher";"other";"father";1;3;0;"no";"yes";"no";"yes";"yes";"yes";"yes";"yes";4;4;3;1;1;4;0;"16";"17";17 +"GP";"M";15;"U";"GT3";"T";4;4;"services";"other";"course";"mother";1;1;0;"no";"yes";"no";"yes";"no";"yes";"yes";"no";5;3;3;1;1;5;4;"10";"13";14 +"GP";"F";15;"U";"GT3";"T";3;2;"services";"other";"home";"mother";2;2;0;"yes";"yes";"yes";"no";"yes";"yes";"yes";"no";4;3;5;1;1;2;26;"7";"6";6 +"GP";"M";15;"U";"GT3";"A";3;4;"services";"other";"course";"mother";1;2;0;"no";"yes";"yes";"yes";"yes";"yes";"yes";"no";5;4;4;1;1;1;0;"16";"18";18 +"GP";"F";15;"U";"GT3";"A";3;3;"other";"health";"reputation";"father";1;4;0;"yes";"no";"no";"no";"yes";"yes";"no";"no";4;3;3;1;1;4;10;"10";"11";11 +"GP";"F";15;"U";"GT3";"T";2;2;"other";"other";"course";"mother";1;4;0;"yes";"yes";"yes";"no";"yes";"yes";"yes";"no";5;1;2;1;1;3;8;"7";"8";8 +"GP";"M";16;"U";"GT3";"T";3;3;"services";"other";"home";"father";1;3;0;"no";"yes";"no";"yes";"yes";"yes";"yes";"no";5;3;3;1;1;5;2;"16";"18";18 +"GP";"M";15;"R";"GT3";"T";4;4;"other";"other";"home";"father";4;4;0;"no";"yes";"yes";"yes";"yes";"yes";"yes";"yes";1;3;5;3;5;1;6;"10";"13";13 +"GP";"F";16;"U";"LE3";"T";4;4;"health";"health";"other";"mother";1;3;0;"no";"yes";"yes";"yes";"yes";"yes";"yes";"yes";5;4;5;1;1;4;4;"14";"15";16 +"GP";"M";15;"U";"LE3";"A";4;4;"teacher";"teacher";"course";"mother";1;1;0;"no";"no";"no";"yes";"yes";"yes";"yes";"no";5;5;3;1;1;4;6;"18";"19";19 +"GP";"F";16;"R";"GT3";"T";3;3;"services";"other";"reputation";"father";1;3;1;"yes";"yes";"no";"yes";"yes";"yes";"yes";"no";4;1;2;1;1;2;0;"7";"10";10 +"GP";"F";16;"U";"GT3";"T";2;2;"at_home";"other";"home";"mother";1;2;1;"yes";"no";"no";"yes";"yes";"yes";"yes";"no";3;1;2;1;1;5;6;"10";"13";13 
+"GP";"M";15;"U";"LE3";"T";4;2;"teacher";"other";"course";"mother";1;1;0;"no";"no";"no";"no";"yes";"yes";"yes";"no";3;5;2;1;1;3;10;"18";"19";19 +"GP";"M";15;"R";"GT3";"T";2;1;"health";"services";"reputation";"mother";1;2;0;"no";"no";"no";"yes";"yes";"yes";"yes";"yes";5;4;2;1;1;5;8;"9";"9";9 +"GP";"M";16;"U";"GT3";"T";4;4;"teacher";"teacher";"course";"father";1;2;0;"no";"yes";"no";"yes";"yes";"yes";"yes";"no";5;4;4;1;2;5;2;"15";"15";16 +"GP";"M";15;"U";"GT3";"T";4;4;"other";"teacher";"reputation";"father";2;2;0;"no";"yes";"no";"yes";"yes";"yes";"no";"no";4;4;3;1;1;2;2;"11";"13";14 +"GP";"M";16;"U";"GT3";"T";3;3;"other";"services";"home";"father";2;1;0;"no";"no";"no";"yes";"yes";"yes";"yes";"no";5;4;2;1;1;5;0;"13";"14";13 +"GP";"M";17;"R";"GT3";"T";1;3;"other";"other";"course";"father";3;2;1;"no";"yes";"no";"yes";"yes";"yes";"yes";"no";5;2;4;1;4;5;20;"9";"7";8 +"GP";"M";15;"U";"GT3";"T";3;4;"other";"other";"reputation";"father";1;1;0;"no";"no";"no";"no";"yes";"yes";"yes";"no";3;4;3;1;2;4;6;"14";"13";13 +"GP";"F";15;"U";"GT3";"T";1;2;"at_home";"services";"course";"mother";1;2;0;"no";"no";"no";"no";"no";"yes";"yes";"no";3;2;3;1;2;1;2;"16";"15";15 +"GP";"M";15;"U";"GT3";"T";2;2;"services";"services";"home";"father";1;4;0;"no";"yes";"yes";"yes";"yes";"yes";"yes";"no";5;5;4;1;2;5;6;"16";"14";15 +"GP";"F";16;"U";"LE3";"T";2;4;"other";"health";"course";"father";2;2;0;"no";"yes";"yes";"yes";"yes";"yes";"yes";"yes";4;2;2;1;2;5;2;"13";"13";13 +"GP";"M";16;"U";"GT3";"T";4;4;"health";"other";"course";"mother";1;1;0;"no";"yes";"no";"yes";"yes";"yes";"yes";"no";3;4;4;1;4;5;18;"14";"11";13 +"GP";"F";16;"U";"GT3";"T";2;2;"other";"other";"home";"mother";1;2;0;"no";"no";"yes";"no";"yes";"yes";"yes";"yes";5;4;4;1;1;5;0;"8";"7";8 +"GP";"M";15;"U";"GT3";"T";3;4;"services";"services";"home";"father";1;1;0;"yes";"no";"no";"no";"yes";"yes";"yes";"no";5;5;5;3;2;5;0;"13";"13";12 
+"GP";"F";15;"U";"LE3";"A";3;4;"other";"other";"home";"mother";1;2;0;"yes";"no";"no";"yes";"yes";"yes";"yes";"yes";5;3;2;1;1;1;0;"7";"10";11 +"GP";"F";19;"U";"GT3";"T";0;1;"at_home";"other";"course";"other";1;2;3;"no";"yes";"no";"no";"no";"no";"no";"no";3;4;2;1;1;5;2;"7";"8";9 +"GP";"M";18;"R";"GT3";"T";2;2;"services";"other";"reputation";"mother";1;1;2;"no";"yes";"no";"yes";"yes";"yes";"yes";"no";3;3;3;1;2;4;0;"7";"4";0 +"GP";"M";16;"R";"GT3";"T";4;4;"teacher";"teacher";"course";"mother";1;1;0;"no";"no";"yes";"yes";"yes";"yes";"yes";"no";3;5;5;2;5;4;8;"18";"18";18 +"GP";"F";15;"R";"GT3";"T";3;4;"services";"teacher";"course";"father";2;3;2;"no";"yes";"no";"no";"yes";"yes";"yes";"yes";4;2;2;2;2;5;0;"12";"0";0 +"GP";"F";15;"U";"GT3";"T";1;1;"at_home";"other";"course";"mother";3;1;0;"no";"yes";"no";"yes";"no";"yes";"yes";"yes";4;3;3;1;2;4;0;"8";"0";0 +"GP";"F";17;"U";"LE3";"T";2;2;"other";"other";"course";"father";1;1;0;"no";"yes";"no";"no";"yes";"yes";"yes";"yes";3;4;4;1;3;5;12;"10";"13";12 +"GP";"F";16;"U";"GT3";"A";3;4;"services";"other";"course";"father";1;1;0;"no";"no";"no";"no";"yes";"yes";"yes";"no";3;2;1;1;4;5;16;"12";"11";11 +"GP";"M";15;"R";"GT3";"T";3;4;"at_home";"teacher";"course";"mother";4;2;0;"no";"yes";"no";"no";"yes";"yes";"no";"yes";5;3;3;1;1;5;0;"9";"0";0 +"GP";"F";15;"U";"GT3";"T";4;4;"services";"at_home";"course";"mother";1;3;0;"no";"yes";"no";"yes";"yes";"yes";"yes";"yes";4;3;3;1;1;5;0;"11";"0";0 +"GP";"M";17;"R";"GT3";"T";3;4;"at_home";"other";"course";"mother";3;2;0;"no";"no";"no";"no";"yes";"yes";"no";"no";5;4;5;2;4;5;0;"10";"0";0 +"GP";"F";16;"U";"GT3";"A";3;3;"other";"other";"course";"other";2;1;2;"no";"yes";"no";"yes";"no";"yes";"yes";"yes";4;3;2;1;1;5;0;"4";"0";0 +"GP";"M";16;"U";"LE3";"T";1;1;"services";"other";"course";"mother";1;2;1;"no";"no";"no";"no";"yes";"yes";"no";"yes";4;4;4;1;3;5;0;"14";"12";12 
+"GP";"F";15;"U";"GT3";"T";4;4;"teacher";"teacher";"course";"mother";2;1;0;"no";"no";"no";"yes";"yes";"yes";"yes";"no";4;3;2;1;1;5;0;"16";"16";15 +"GP";"M";15;"U";"GT3";"T";4;3;"teacher";"services";"course";"father";2;4;0;"yes";"yes";"no";"no";"yes";"yes";"yes";"no";2;2;2;1;1;3;0;"7";"9";0 +"GP";"M";16;"U";"LE3";"T";2;2;"services";"services";"reputation";"father";2;1;2;"no";"yes";"no";"yes";"yes";"yes";"yes";"no";2;3;3;2;2;2;8;"9";"9";9 +"GP";"F";15;"U";"GT3";"T";4;4;"teacher";"services";"course";"mother";1;3;0;"no";"yes";"yes";"yes";"yes";"yes";"yes";"no";4;2;2;1;1;5;2;"9";"11";11 +"GP";"F";16;"U";"LE3";"T";1;1;"at_home";"at_home";"course";"mother";1;1;0;"no";"no";"no";"no";"yes";"yes";"yes";"no";3;4;4;3;3;1;2;"14";"14";13 +"GP";"M";17;"U";"GT3";"T";2;1;"other";"other";"home";"mother";1;1;3;"no";"yes";"no";"no";"yes";"yes";"yes";"no";5;4;5;1;2;5;0;"5";"0";0 +"GP";"F";15;"U";"GT3";"T";1;1;"other";"services";"course";"father";1;2;0;"no";"yes";"yes";"no";"yes";"yes";"yes";"no";4;4;2;1;2;5;0;"8";"11";11 +"GP";"F";15;"U";"GT3";"T";3;2;"health";"services";"home";"father";1;2;3;"no";"yes";"no";"no";"yes";"yes";"yes";"no";3;3;2;1;1;3;0;"6";"7";0 +"GP";"F";15;"U";"GT3";"T";1;2;"at_home";"other";"course";"mother";1;2;0;"no";"yes";"yes";"no";"no";"yes";"yes";"no";4;3;2;1;1;5;2;"10";"11";11 +"GP";"M";16;"U";"GT3";"T";4;4;"teacher";"teacher";"course";"mother";1;1;0;"no";"yes";"no";"no";"yes";"no";"yes";"yes";3;3;2;2;1;5;0;"7";"6";0 +"GP";"M";15;"U";"LE3";"A";2;1;"services";"other";"course";"mother";4;1;3;"no";"no";"no";"no";"yes";"yes";"yes";"no";4;5;5;2;5;5;0;"8";"9";10 +"GP";"M";18;"U";"LE3";"T";1;1;"other";"other";"course";"mother";1;1;3;"no";"no";"no";"no";"yes";"no";"yes";"yes";2;3;5;2;5;4;0;"6";"5";0 +"GP";"M";16;"U";"LE3";"T";2;1;"at_home";"other";"course";"mother";1;1;1;"no";"no";"no";"yes";"yes";"yes";"no";"yes";4;4;4;3;5;5;6;"12";"13";14 
+"GP";"F";15;"R";"GT3";"T";3;3;"services";"services";"reputation";"other";2;3;2;"no";"yes";"yes";"yes";"yes";"yes";"yes";"yes";4;2;1;2;3;3;8;"10";"10";10 +"GP";"M";19;"U";"GT3";"T";3;2;"services";"at_home";"home";"mother";1;1;3;"no";"yes";"no";"no";"yes";"no";"yes";"yes";4;5;4;1;1;4;0;"5";"0";0 +"GP";"F";17;"U";"GT3";"T";4;4;"other";"teacher";"course";"mother";1;1;0;"yes";"yes";"no";"no";"yes";"yes";"no";"yes";4;2;1;1;1;4;0;"11";"11";12 +"GP";"M";15;"R";"GT3";"T";2;3;"at_home";"services";"course";"mother";1;2;0;"yes";"no";"yes";"yes";"yes";"yes";"no";"no";4;4;4;1;1;1;2;"11";"8";8 +"GP";"M";17;"R";"LE3";"T";1;2;"other";"other";"reputation";"mother";1;1;0;"no";"no";"no";"no";"yes";"yes";"no";"no";2;2;2;3;3;5;8;"16";"12";13 +"GP";"F";18;"R";"GT3";"T";1;1;"at_home";"other";"course";"mother";3;1;3;"no";"yes";"no";"yes";"no";"yes";"no";"no";5;2;5;1;5;4;6;"9";"8";10 +"GP";"M";16;"R";"GT3";"T";2;2;"at_home";"other";"course";"mother";3;1;0;"no";"no";"no";"no";"no";"yes";"no";"no";4;2;2;1;2;3;2;"17";"15";15 +"GP";"M";16;"U";"GT3";"T";3;3;"other";"services";"course";"father";1;2;1;"no";"yes";"yes";"no";"yes";"yes";"yes";"yes";4;5;5;4;4;5;4;"10";"12";12 +"GP";"M";17;"R";"LE3";"T";2;1;"at_home";"other";"course";"mother";2;1;2;"no";"no";"no";"yes";"yes";"no";"yes";"yes";3;3;2;2;2;5;0;"7";"6";0 +"GP";"M";15;"R";"GT3";"T";3;2;"other";"other";"course";"mother";2;2;2;"yes";"yes";"no";"no";"yes";"yes";"yes";"yes";4;4;4;1;4;3;6;"5";"9";7 +"GP";"M";16;"U";"LE3";"T";1;2;"other";"other";"course";"mother";2;1;1;"no";"no";"no";"yes";"yes";"yes";"no";"no";4;4;4;2;4;5;0;"7";"0";0 +"GP";"M";17;"U";"GT3";"T";1;3;"at_home";"services";"course";"father";1;1;0;"no";"no";"no";"no";"yes";"no";"yes";"no";5;3;3;1;4;2;2;"10";"10";10 +"GP";"M";17;"R";"LE3";"T";1;1;"other";"services";"course";"mother";4;2;3;"no";"no";"no";"yes";"yes";"no";"no";"yes";5;3;5;1;5;5;0;"5";"8";7 
+"GP";"M";16;"U";"GT3";"T";3;2;"services";"services";"course";"mother";2;1;1;"no";"yes";"no";"yes";"no";"no";"no";"no";4;5;2;1;1;2;16;"12";"11";12 +"GP";"M";16;"U";"GT3";"T";2;2;"other";"other";"course";"father";1;2;0;"no";"no";"no";"no";"yes";"no";"yes";"no";4;3;5;2;4;4;4;"10";"10";10 +"GP";"F";16;"U";"GT3";"T";4;2;"health";"services";"home";"father";1;2;0;"no";"no";"yes";"no";"yes";"yes";"yes";"yes";4;2;3;1;1;3;0;"14";"15";16 +"GP";"F";16;"U";"GT3";"T";2;2;"other";"other";"home";"mother";1;2;0;"no";"yes";"yes";"no";"no";"yes";"yes";"no";5;1;5;1;1;4;0;"6";"7";0 +"GP";"F";16;"U";"GT3";"T";4;4;"health";"health";"reputation";"mother";1;2;0;"no";"yes";"yes";"no";"yes";"yes";"yes";"yes";4;4;2;1;1;3;0;"14";"14";14 +"GP";"M";16;"U";"GT3";"T";3;4;"other";"other";"course";"father";3;1;2;"no";"yes";"no";"yes";"no";"yes";"yes";"no";3;4;5;2;4;2;0;"6";"5";0 +"GP";"M";16;"U";"GT3";"T";1;0;"other";"other";"reputation";"mother";2;2;0;"no";"yes";"yes";"yes";"yes";"yes";"yes";"yes";4;3;2;1;1;3;2;"13";"15";16 +"GP";"M";17;"U";"LE3";"T";4;4;"teacher";"other";"reputation";"mother";1;2;0;"no";"yes";"yes";"yes";"yes";"yes";"yes";"no";4;4;4;1;3;5;0;"13";"11";10 +"GP";"F";16;"U";"GT3";"T";1;3;"at_home";"services";"home";"mother";1;2;3;"no";"no";"no";"yes";"no";"yes";"yes";"yes";4;3;5;1;1;3;0;"8";"7";0 +"GP";"F";16;"U";"LE3";"T";3;3;"other";"other";"reputation";"mother";2;2;0;"no";"yes";"yes";"yes";"yes";"yes";"yes";"no";4;4;5;1;1;4;4;"10";"11";9 +"GP";"M";17;"U";"LE3";"T";4;3;"teacher";"other";"course";"mother";2;2;0;"no";"no";"yes";"yes";"yes";"yes";"yes";"no";4;4;4;4;4;4;4;"10";"9";9 +"GP";"F";16;"U";"GT3";"T";2;2;"services";"other";"reputation";"mother";2;2;0;"no";"no";"yes";"yes";"no";"yes";"yes";"no";3;4;4;1;4;5;2;"13";"13";11 +"GP";"M";17;"U";"GT3";"T";3;3;"other";"other";"reputation";"father";1;2;0;"no";"no";"no";"yes";"no";"yes";"yes";"no";4;3;4;1;4;4;4;"6";"5";6 
+"GP";"M";16;"R";"GT3";"T";4;2;"teacher";"services";"other";"mother";1;1;0;"no";"yes";"no";"yes";"yes";"yes";"yes";"yes";4;3;3;3;4;3;10;"10";"8";9 +"GP";"M";17;"U";"GT3";"T";4;3;"other";"other";"course";"mother";1;2;0;"no";"yes";"no";"yes";"yes";"yes";"yes";"yes";5;2;3;1;1;2;4;"10";"10";11 +"GP";"M";16;"U";"GT3";"T";4;3;"teacher";"other";"home";"mother";1;2;0;"no";"yes";"yes";"yes";"yes";"yes";"yes";"no";3;4;3;2;3;3;10;"9";"8";8 +"GP";"M";16;"U";"GT3";"T";3;3;"services";"other";"home";"mother";1;2;0;"no";"no";"yes";"yes";"yes";"yes";"yes";"yes";4;2;3;1;2;3;2;"12";"13";12 +"GP";"F";17;"U";"GT3";"T";2;4;"services";"services";"reputation";"father";1;2;0;"no";"yes";"no";"yes";"yes";"yes";"no";"no";5;4;2;2;3;5;0;"16";"17";17 +"GP";"F";17;"U";"LE3";"T";3;3;"other";"other";"reputation";"mother";1;2;0;"no";"yes";"no";"yes";"yes";"yes";"yes";"yes";5;3;3;2;3;1;56;"9";"9";8 +"GP";"F";16;"U";"GT3";"T";3;2;"other";"other";"reputation";"mother";1;2;0;"no";"yes";"yes";"no";"yes";"yes";"yes";"no";1;2;2;1;2;1;14;"12";"13";12 +"GP";"M";17;"U";"GT3";"T";3;3;"services";"services";"other";"mother";1;2;0;"no";"yes";"no";"yes";"yes";"yes";"yes";"yes";4;3;4;2;3;4;12;"12";"12";11 +"GP";"M";16;"U";"GT3";"T";1;2;"services";"services";"other";"mother";1;1;0;"no";"yes";"yes";"yes";"yes";"yes";"yes";"yes";3;3;3;1;2;3;2;"11";"12";11 +"GP";"M";16;"U";"LE3";"T";2;1;"other";"other";"course";"mother";1;2;0;"no";"no";"yes";"yes";"yes";"yes";"yes";"yes";4;2;3;1;2;5;0;"15";"15";15 +"GP";"F";17;"U";"GT3";"A";3;3;"health";"other";"reputation";"mother";1;2;0;"no";"yes";"no";"no";"no";"yes";"yes";"yes";3;3;3;1;3;3;6;"8";"7";9 +"GP";"M";17;"R";"GT3";"T";1;2;"at_home";"other";"home";"mother";1;2;0;"no";"no";"no";"no";"yes";"yes";"no";"no";3;1;3;1;5;3;4;"8";"9";10 +"GP";"F";16;"U";"GT3";"T";2;3;"services";"services";"course";"mother";1;2;0;"no";"no";"no";"no";"yes";"yes";"yes";"no";4;3;3;1;1;2;10;"11";"12";13 
+"GP";"F";17;"U";"GT3";"T";1;1;"at_home";"services";"course";"mother";1;2;0;"no";"no";"no";"yes";"yes";"yes";"yes";"no";5;3;3;1;1;3;0;"8";"8";9 +"GP";"M";17;"U";"GT3";"T";1;2;"at_home";"services";"other";"other";2;2;0;"no";"no";"yes";"yes";"no";"yes";"yes";"no";4;4;4;4;5;5;12;"7";"8";8 +"GP";"M";16;"R";"GT3";"T";3;3;"services";"services";"reputation";"mother";1;1;0;"no";"yes";"no";"yes";"yes";"yes";"yes";"no";4;3;2;3;4;5;8;"8";"9";10 +"GP";"M";16;"U";"GT3";"T";2;3;"other";"other";"home";"father";2;1;0;"no";"no";"no";"no";"yes";"yes";"yes";"no";5;3;3;1;1;3;0;"13";"14";14 +"GP";"F";17;"U";"LE3";"T";2;4;"services";"services";"course";"father";1;2;0;"no";"no";"no";"yes";"yes";"yes";"yes";"yes";4;3;2;1;1;5;0;"14";"15";15 +"GP";"M";17;"U";"GT3";"T";4;4;"services";"teacher";"home";"mother";1;1;0;"no";"no";"no";"no";"yes";"yes";"yes";"no";5;2;3;1;2;5;4;"17";"15";16 +"GP";"M";16;"R";"LE3";"T";3;3;"teacher";"other";"home";"father";3;1;0;"no";"yes";"yes";"yes";"yes";"yes";"yes";"no";3;3;4;3;5;3;8;"9";"9";10 +"GP";"F";17;"U";"GT3";"T";4;4;"services";"teacher";"home";"mother";2;1;1;"no";"yes";"no";"no";"yes";"yes";"yes";"no";4;2;4;2;3;2;24;"18";"18";18 +"GP";"F";16;"U";"LE3";"T";4;4;"teacher";"teacher";"reputation";"mother";1;2;0;"no";"yes";"yes";"no";"yes";"yes";"yes";"no";4;5;2;1;2;3;0;"9";"9";10 +"GP";"F";16;"U";"GT3";"T";4;3;"health";"other";"home";"mother";1;2;0;"no";"yes";"no";"yes";"yes";"yes";"yes";"no";4;3;5;1;5;2;2;"16";"16";16 +"GP";"F";16;"U";"GT3";"T";2;3;"other";"other";"reputation";"mother";1;2;0;"yes";"yes";"yes";"yes";"yes";"yes";"no";"no";4;4;3;1;3;4;6;"8";"10";10 +"GP";"F";17;"U";"GT3";"T";1;1;"other";"other";"course";"mother";1;2;0;"no";"yes";"yes";"no";"no";"yes";"no";"no";4;4;4;1;3;1;4;"9";"9";10 +"GP";"F";17;"R";"GT3";"T";2;2;"other";"other";"reputation";"mother";1;1;0;"no";"yes";"no";"no";"yes";"yes";"yes";"no";5;3;2;1;2;3;18;"7";"6";6 
+"GP";"F";16;"R";"GT3";"T";2;2;"services";"services";"reputation";"mother";2;4;0;"no";"yes";"yes";"yes";"no";"yes";"yes";"no";5;3;5;1;1;5;6;"10";"10";11 +"GP";"F";17;"U";"GT3";"T";3;4;"at_home";"services";"home";"mother";1;3;1;"no";"yes";"yes";"no";"yes";"yes";"yes";"yes";4;4;3;3;4;5;28;"10";"9";9 +"GP";"F";16;"U";"GT3";"A";3;1;"services";"other";"course";"mother";1;2;3;"no";"yes";"yes";"no";"yes";"yes";"yes";"no";2;3;3;2;2;4;5;"7";"7";7 +"GP";"F";16;"U";"GT3";"T";4;3;"teacher";"other";"other";"mother";1;2;0;"no";"no";"yes";"yes";"yes";"yes";"yes";"yes";1;3;2;1;1;1;10;"11";"12";13 +"GP";"F";16;"U";"GT3";"T";1;1;"at_home";"other";"home";"mother";2;1;0;"no";"yes";"yes";"no";"yes";"yes";"no";"no";4;3;2;1;4;5;6;"9";"9";10 +"GP";"F";17;"R";"GT3";"T";4;3;"teacher";"other";"reputation";"mother";2;3;0;"no";"yes";"yes";"yes";"yes";"yes";"yes";"yes";4;4;2;1;1;4;6;"7";"7";7 +"GP";"F";19;"U";"GT3";"T";3;3;"other";"other";"reputation";"other";1;4;0;"no";"yes";"yes";"yes";"yes";"yes";"yes";"no";4;3;3;1;2;3;10;"8";"8";8 +"GP";"M";17;"U";"LE3";"T";4;4;"services";"other";"home";"mother";1;2;0;"no";"yes";"yes";"no";"yes";"yes";"yes";"yes";5;3;5;4;5;3;13;"12";"12";13 +"GP";"F";16;"U";"GT3";"A";2;2;"other";"other";"reputation";"mother";1;2;0;"yes";"yes";"yes";"no";"yes";"yes";"yes";"no";3;3;4;1;1;4;0;"12";"13";14 +"GP";"M";18;"U";"GT3";"T";2;2;"services";"other";"home";"mother";1;2;1;"no";"yes";"yes";"yes";"yes";"yes";"yes";"no";4;4;4;2;4;5;15;"6";"7";8 +"GP";"F";17;"R";"LE3";"T";4;4;"services";"other";"other";"mother";1;1;0;"no";"yes";"yes";"no";"yes";"yes";"no";"no";5;2;1;1;2;3;12;"8";"10";10 +"GP";"F";17;"U";"LE3";"T";3;2;"other";"other";"reputation";"mother";2;2;0;"no";"no";"yes";"no";"yes";"yes";"yes";"no";4;4;4;1;3;1;2;"14";"15";15 +"GP";"F";17;"U";"GT3";"T";4;3;"other";"other";"reputation";"mother";1;2;2;"no";"no";"yes";"no";"yes";"yes";"yes";"yes";3;4;5;2;4;1;22;"6";"6";4 
+"GP";"M";18;"U";"LE3";"T";3;3;"services";"health";"home";"father";1;2;1;"no";"yes";"yes";"no";"yes";"yes";"yes";"no";3;2;4;2;4;4;13;"6";"6";8 +"GP";"F";17;"U";"GT3";"T";2;3;"at_home";"other";"home";"father";2;1;0;"no";"yes";"yes";"no";"yes";"yes";"no";"no";3;3;3;1;4;3;3;"7";"7";8 +"GP";"F";17;"U";"GT3";"T";2;2;"at_home";"at_home";"course";"mother";1;3;0;"no";"yes";"yes";"yes";"yes";"yes";"yes";"no";4;3;3;1;1;4;4;"9";"10";10 +"GP";"F";17;"R";"GT3";"T";2;1;"at_home";"services";"reputation";"mother";2;2;0;"no";"yes";"no";"yes";"yes";"yes";"yes";"no";4;2;5;1;2;5;2;"6";"6";6 +"GP";"F";17;"U";"GT3";"T";1;1;"at_home";"other";"reputation";"mother";1;3;1;"no";"yes";"no";"yes";"yes";"yes";"no";"yes";4;3;4;1;1;5;0;"6";"5";0 +"GP";"F";16;"U";"GT3";"T";2;3;"services";"teacher";"other";"mother";1;2;0;"yes";"no";"no";"no";"yes";"yes";"yes";"no";2;3;1;1;1;3;2;"16";"16";17 +"GP";"M";18;"U";"GT3";"T";2;2;"other";"other";"home";"mother";2;2;0;"no";"yes";"yes";"no";"yes";"yes";"yes";"no";3;3;3;5;5;4;0;"12";"13";13 +"GP";"F";16;"U";"GT3";"T";4;4;"teacher";"services";"home";"mother";1;3;0;"no";"yes";"no";"yes";"no";"yes";"yes";"no";5;3;2;1;1;5;0;"13";"13";14 +"GP";"F";18;"R";"GT3";"T";3;1;"other";"other";"reputation";"mother";1;2;1;"no";"no";"no";"yes";"yes";"yes";"yes";"yes";5;3;3;1;1;4;16;"9";"8";7 +"GP";"F";17;"U";"GT3";"T";3;2;"other";"other";"course";"mother";1;2;0;"no";"no";"no";"yes";"no";"yes";"yes";"no";5;3;4;1;3;3;10;"16";"15";15 +"GP";"M";17;"U";"LE3";"T";2;3;"services";"services";"reputation";"father";1;2;0;"no";"yes";"yes";"no";"no";"yes";"yes";"no";5;3;3;1;3;3;2;"12";"11";12 +"GP";"M";18;"U";"LE3";"T";2;1;"at_home";"other";"course";"mother";4;2;0;"yes";"yes";"yes";"yes";"yes";"yes";"yes";"yes";4;3;2;4;5;3;14;"10";"8";9 +"GP";"F";17;"U";"GT3";"A";2;1;"other";"other";"course";"mother";2;3;0;"no";"no";"no";"yes";"yes";"yes";"yes";"yes";3;2;3;1;2;3;10;"12";"10";12 
+"GP";"F";17;"U";"LE3";"T";4;3;"health";"other";"reputation";"father";1;2;0;"no";"no";"no";"yes";"yes";"yes";"yes";"yes";3;2;3;1;2;3;14;"13";"13";14 +"GP";"M";17;"R";"GT3";"T";2;2;"other";"other";"course";"father";2;2;0;"no";"yes";"yes";"yes";"yes";"yes";"yes";"no";4;5;2;1;1;1;4;"11";"11";11 +"GP";"M";17;"U";"GT3";"T";4;4;"teacher";"teacher";"reputation";"mother";1;2;0;"yes";"yes";"no";"yes";"yes";"yes";"yes";"yes";4;5;5;1;3;2;14;"11";"9";9 +"GP";"M";16;"U";"GT3";"T";4;4;"health";"other";"reputation";"father";1;2;0;"no";"yes";"yes";"yes";"yes";"yes";"yes";"no";4;2;4;2;4;1;2;"14";"13";13 +"GP";"M";16;"U";"LE3";"T";1;1;"other";"other";"home";"mother";2;2;0;"no";"yes";"yes";"no";"yes";"yes";"yes";"no";3;4;2;1;1;5;18;"9";"7";6 +"GP";"M";16;"U";"GT3";"T";3;2;"at_home";"other";"reputation";"mother";2;3;0;"no";"no";"no";"yes";"yes";"yes";"yes";"yes";5;3;3;1;3;2;10;"11";"9";10 +"GP";"M";17;"U";"LE3";"T";2;2;"other";"other";"home";"father";1;2;0;"no";"no";"yes";"yes";"no";"yes";"yes";"yes";4;4;2;5;5;4;4;"14";"13";13 +"GP";"F";16;"U";"GT3";"T";2;1;"other";"other";"home";"mother";1;1;0;"no";"no";"no";"no";"yes";"yes";"yes";"yes";4;5;2;1;1;5;20;"13";"12";12 +"GP";"F";17;"R";"GT3";"T";2;1;"at_home";"services";"course";"mother";3;2;0;"no";"no";"no";"yes";"yes";"yes";"no";"no";2;1;1;1;1;3;2;"13";"11";11 +"GP";"M";18;"U";"GT3";"T";2;2;"other";"services";"reputation";"father";1;2;1;"no";"no";"no";"no";"yes";"no";"yes";"no";5;5;4;3;5;2;0;"7";"7";0 +"GP";"M";17;"U";"LE3";"T";4;3;"health";"other";"course";"mother";2;2;0;"no";"no";"no";"yes";"yes";"yes";"yes";"yes";2;5;5;1;4;5;14;"12";"12";12 +"GP";"M";17;"R";"LE3";"A";4;4;"teacher";"other";"course";"mother";2;2;0;"no";"yes";"yes";"no";"yes";"yes";"yes";"no";3;3;3;2;3;4;2;"10";"11";12 +"GP";"M";16;"U";"LE3";"T";4;3;"teacher";"other";"course";"mother";1;1;0;"no";"no";"no";"yes";"no";"yes";"yes";"no";5;4;5;1;1;3;0;"6";"0";0 
+"GP";"M";16;"U";"GT3";"T";4;4;"services";"services";"course";"mother";1;1;0;"no";"no";"no";"yes";"yes";"yes";"yes";"no";5;3;2;1;2;5;0;"13";"12";12 +"GP";"F";18;"U";"GT3";"T";2;1;"other";"other";"course";"other";2;3;0;"no";"yes";"yes";"no";"no";"yes";"yes";"yes";4;4;4;1;1;3;0;"7";"0";0 +"GP";"M";16;"U";"GT3";"T";2;1;"other";"other";"course";"mother";3;1;0;"no";"no";"no";"no";"yes";"yes";"yes";"no";4;3;3;1;1;4;6;"18";"18";18 +"GP";"M";17;"U";"GT3";"T";2;3;"other";"other";"course";"father";2;1;0;"no";"no";"no";"no";"yes";"yes";"yes";"no";5;2;2;1;1;2;4;"12";"12";13 +"GP";"M";22;"U";"GT3";"T";3;1;"services";"services";"other";"mother";1;1;3;"no";"no";"no";"no";"no";"no";"yes";"yes";5;4;5;5;5;1;16;"6";"8";8 +"GP";"M";18;"R";"LE3";"T";3;3;"other";"services";"course";"mother";1;2;1;"no";"yes";"no";"no";"yes";"yes";"yes";"yes";4;3;3;1;3;5;8;"3";"5";5 +"GP";"M";16;"U";"GT3";"T";0;2;"other";"other";"other";"mother";1;1;0;"no";"no";"yes";"no";"no";"yes";"yes";"no";4;3;2;2;4;5;0;"13";"15";15 +"GP";"M";18;"U";"GT3";"T";3;2;"services";"other";"course";"mother";2;1;1;"no";"no";"no";"no";"yes";"no";"yes";"no";4;4;5;2;4;5;0;"6";"8";8 +"GP";"M";16;"U";"GT3";"T";3;3;"at_home";"other";"reputation";"other";3;2;0;"yes";"yes";"no";"no";"no";"yes";"yes";"no";5;3;3;1;3;2;6;"7";"10";10 +"GP";"M";18;"U";"GT3";"T";2;1;"services";"services";"other";"mother";1;1;1;"no";"no";"no";"no";"no";"no";"yes";"no";3;2;5;2;5;5;4;"6";"9";8 +"GP";"M";16;"R";"GT3";"T";2;1;"other";"other";"course";"mother";2;1;0;"no";"no";"no";"yes";"no";"yes";"no";"no";3;3;2;1;3;3;0;"8";"9";8 +"GP";"M";17;"R";"GT3";"T";2;1;"other";"other";"course";"mother";1;1;0;"no";"no";"no";"no";"no";"yes";"yes";"no";4;4;2;2;4;5;0;"8";"12";12 +"GP";"M";17;"U";"LE3";"T";1;1;"health";"other";"course";"mother";2;1;1;"no";"yes";"no";"yes";"yes";"yes";"yes";"no";4;4;4;1;2;5;2;"7";"9";8 +"GP";"F";17;"U";"LE3";"T";4;2;"teacher";"services";"reputation";"mother";1;4;0;"no";"yes";"yes";"yes";"yes";"yes";"yes";"no";4;2;3;1;1;4;6;"14";"12";13 
+"GP";"M";19;"U";"LE3";"A";4;3;"services";"at_home";"reputation";"mother";1;2;0;"no";"yes";"no";"no";"yes";"yes";"yes";"no";4;3;1;1;1;1;12;"11";"11";11 +"GP";"M";18;"U";"GT3";"T";2;1;"other";"other";"home";"mother";1;2;0;"no";"no";"no";"yes";"yes";"yes";"yes";"no";5;2;4;1;2;4;8;"15";"14";14 +"GP";"F";17;"U";"LE3";"T";2;2;"services";"services";"course";"father";1;4;0;"no";"no";"yes";"yes";"yes";"yes";"yes";"yes";3;4;1;1;1;2;0;"10";"9";0 +"GP";"F";18;"U";"GT3";"T";4;3;"services";"other";"home";"father";1;2;0;"no";"yes";"yes";"no";"yes";"yes";"yes";"yes";3;1;2;1;3;2;21;"17";"18";18 +"GP";"M";18;"U";"GT3";"T";4;3;"teacher";"other";"course";"mother";1;2;0;"no";"yes";"yes";"no";"no";"yes";"yes";"no";4;3;2;1;1;3;2;"8";"8";8 +"GP";"M";18;"R";"GT3";"T";3;2;"other";"other";"course";"mother";1;3;0;"no";"no";"no";"yes";"no";"yes";"no";"no";5;3;2;1;1;3;1;"13";"12";12 +"GP";"F";17;"U";"GT3";"T";3;3;"other";"other";"home";"mother";1;3;0;"no";"no";"no";"yes";"no";"yes";"no";"no";3;2;3;1;1;4;4;"10";"9";9 +"GP";"F";18;"U";"GT3";"T";2;2;"at_home";"services";"home";"mother";1;3;0;"no";"yes";"yes";"yes";"yes";"yes";"yes";"yes";4;3;3;1;1;3;0;"9";"10";0 +"GP";"M";18;"R";"LE3";"A";3;4;"other";"other";"reputation";"mother";2;2;0;"no";"yes";"yes";"yes";"yes";"yes";"yes";"no";4;2;5;3;4;1;13;"17";"17";17 +"GP";"M";17;"U";"GT3";"T";3;1;"services";"other";"other";"mother";1;2;0;"no";"no";"yes";"yes";"yes";"yes";"yes";"yes";5;4;4;3;4;5;2;"9";"9";10 +"GP";"F";18;"R";"GT3";"T";4;4;"teacher";"other";"reputation";"mother";2;2;0;"no";"no";"yes";"yes";"yes";"yes";"yes";"no";4;3;4;2;2;4;8;"12";"10";11 +"GP";"M";18;"U";"GT3";"T";4;2;"health";"other";"reputation";"father";1;2;0;"no";"yes";"yes";"yes";"yes";"yes";"yes";"yes";5;4;5;1;3;5;10;"10";"9";10 +"GP";"F";18;"R";"GT3";"T";2;1;"other";"other";"reputation";"mother";2;2;0;"no";"yes";"no";"no";"yes";"no";"yes";"yes";4;3;5;1;2;3;0;"6";"0";0 
+"GP";"F";19;"U";"GT3";"T";3;3;"other";"services";"home";"other";1;2;2;"no";"yes";"yes";"yes";"yes";"yes";"yes";"no";4;3;5;3;3;5;15;"9";"9";9 +"GP";"F";18;"U";"GT3";"T";2;3;"other";"services";"reputation";"father";1;4;0;"no";"yes";"yes";"yes";"yes";"yes";"yes";"yes";4;5;5;1;3;2;4;"15";"14";14 +"GP";"F";18;"U";"LE3";"T";1;1;"other";"other";"home";"mother";2;2;0;"no";"yes";"yes";"no";"no";"yes";"no";"no";4;4;3;1;1;3;2;"11";"11";11 +"GP";"M";17;"R";"GT3";"T";1;2;"at_home";"at_home";"home";"mother";1;2;0;"no";"yes";"yes";"yes";"no";"yes";"no";"yes";3;5;2;2;2;1;2;"15";"14";14 +"GP";"F";17;"U";"GT3";"T";2;4;"at_home";"health";"reputation";"mother";2;2;0;"no";"yes";"yes";"no";"yes";"yes";"yes";"yes";4;3;3;1;1;1;2;"10";"10";10 +"GP";"F";17;"U";"LE3";"T";2;2;"services";"other";"course";"mother";2;2;0;"yes";"yes";"yes";"no";"yes";"yes";"yes";"yes";4;4;4;2;3;5;6;"12";"12";12 +"GP";"F";18;"R";"GT3";"A";3;2;"other";"services";"home";"mother";2;2;0;"no";"no";"no";"no";"no";"no";"yes";"yes";4;1;1;1;1;5;75;"10";"9";9 +"GP";"M";18;"U";"GT3";"T";4;4;"teacher";"services";"home";"mother";2;1;0;"no";"no";"yes";"yes";"yes";"yes";"yes";"no";3;2;4;1;4;3;22;"9";"9";9 +"GP";"F";18;"U";"GT3";"T";4;4;"health";"health";"reputation";"father";1;2;1;"yes";"yes";"no";"yes";"yes";"yes";"yes";"yes";2;4;4;1;1;4;15;"9";"8";8 +"GP";"M";18;"U";"LE3";"T";4;3;"teacher";"services";"course";"mother";2;1;0;"no";"no";"yes";"yes";"yes";"yes";"yes";"no";4;2;3;1;2;1;8;"10";"11";10 +"GP";"M";17;"U";"LE3";"A";4;1;"services";"other";"home";"mother";2;1;0;"no";"no";"yes";"yes";"yes";"yes";"yes";"yes";4;5;4;2;4;5;30;"8";"8";8 +"GP";"M";17;"U";"LE3";"A";3;2;"teacher";"services";"home";"mother";1;1;1;"no";"no";"no";"no";"yes";"yes";"yes";"no";4;4;4;3;4;3;19;"11";"9";10 +"GP";"F";18;"R";"LE3";"T";1;1;"at_home";"other";"reputation";"mother";2;4;0;"no";"yes";"yes";"yes";"yes";"yes";"no";"no";5;2;2;1;1;3;1;"12";"12";12 
+"GP";"F";18;"U";"GT3";"T";1;1;"other";"other";"home";"mother";2;2;0;"yes";"no";"no";"yes";"yes";"yes";"yes";"no";5;4;4;1;1;4;4;"8";"9";10 +"GP";"F";17;"U";"GT3";"T";2;2;"other";"other";"course";"mother";1;2;0;"no";"yes";"no";"no";"no";"yes";"yes";"no";5;4;5;1;2;5;4;"10";"9";11 +"GP";"M";17;"U";"GT3";"T";1;1;"other";"other";"reputation";"father";1;2;0;"no";"no";"yes";"no";"no";"yes";"yes";"no";4;3;3;1;2;4;2;"12";"10";11 +"GP";"F";18;"U";"GT3";"T";2;2;"at_home";"at_home";"other";"mother";1;3;0;"no";"yes";"yes";"no";"yes";"yes";"yes";"no";4;3;3;1;2;2;5;"18";"18";19 +"GP";"F";17;"U";"GT3";"T";1;1;"services";"teacher";"reputation";"mother";1;3;0;"no";"yes";"yes";"no";"yes";"yes";"yes";"no";4;3;3;1;1;3;6;"13";"12";12 +"GP";"M";18;"U";"GT3";"T";2;1;"services";"services";"reputation";"mother";1;3;0;"no";"no";"yes";"yes";"yes";"yes";"yes";"no";4;2;4;1;3;2;6;"15";"14";14 +"GP";"M";18;"U";"LE3";"A";4;4;"teacher";"teacher";"reputation";"mother";1;2;0;"no";"yes";"yes";"yes";"yes";"yes";"yes";"no";5;4;3;1;1;2;9;"15";"13";15 +"GP";"M";18;"U";"GT3";"T";4;2;"teacher";"other";"home";"mother";1;2;0;"no";"yes";"yes";"yes";"yes";"yes";"yes";"yes";4;3;2;1;4;5;11;"12";"11";11 +"GP";"F";17;"U";"GT3";"T";4;3;"health";"services";"reputation";"mother";1;3;0;"no";"yes";"yes";"no";"yes";"yes";"yes";"no";4;2;2;1;2;3;0;"15";"15";15 +"GP";"F";18;"U";"LE3";"T";2;1;"services";"at_home";"reputation";"mother";1;2;1;"no";"no";"no";"no";"yes";"yes";"yes";"yes";5;4;3;1;1;5;12;"12";"12";13 +"GP";"F";17;"R";"LE3";"T";3;1;"services";"other";"reputation";"mother";2;4;0;"no";"yes";"yes";"no";"yes";"yes";"no";"no";3;1;2;1;1;3;6;"18";"18";18 +"GP";"M";18;"R";"LE3";"T";3;2;"services";"other";"reputation";"mother";2;3;0;"no";"yes";"yes";"yes";"yes";"yes";"yes";"no";5;4;2;1;1;4;8;"14";"13";14 +"GP";"M";17;"U";"GT3";"T";3;3;"health";"other";"home";"mother";1;1;0;"no";"yes";"yes";"no";"yes";"yes";"yes";"no";4;4;3;1;3;5;4;"14";"12";11 
+"GP";"F";19;"U";"GT3";"T";4;4;"health";"other";"reputation";"other";2;2;0;"no";"yes";"yes";"yes";"yes";"yes";"yes";"no";2;3;4;2;3;2;0;"10";"9";0 +"GP";"F";18;"U";"LE3";"T";4;3;"other";"other";"home";"other";2;2;0;"no";"yes";"yes";"no";"yes";"yes";"yes";"yes";4;4;5;1;2;2;10;"10";"8";8 +"GP";"F";18;"U";"GT3";"T";4;3;"other";"other";"reputation";"father";1;4;0;"no";"yes";"yes";"no";"yes";"yes";"yes";"no";4;3;3;1;1;3;0;"14";"13";14 +"GP";"M";18;"U";"LE3";"T";4;4;"teacher";"teacher";"home";"mother";1;1;0;"no";"yes";"yes";"no";"yes";"yes";"yes";"yes";1;4;2;2;2;1;5;"16";"15";16 +"GP";"F";18;"U";"LE3";"A";4;4;"health";"other";"home";"mother";1;2;0;"no";"yes";"no";"no";"yes";"yes";"yes";"yes";4;2;4;1;1;4;14;"12";"10";11 +"GP";"M";17;"U";"LE3";"T";4;4;"other";"teacher";"home";"father";2;1;0;"no";"no";"yes";"no";"yes";"yes";"yes";"no";4;1;1;2;2;5;0;"11";"11";10 +"GP";"F";17;"U";"GT3";"T";4;2;"other";"other";"reputation";"mother";2;3;0;"no";"yes";"yes";"no";"yes";"yes";"yes";"no";4;3;3;1;1;3;0;"15";"12";14 +"GP";"F";17;"U";"GT3";"T";3;2;"health";"health";"reputation";"father";1;4;0;"no";"yes";"yes";"yes";"no";"yes";"yes";"no";5;2;2;1;2;5;0;"17";"17";18 +"GP";"M";19;"U";"GT3";"T";3;3;"other";"other";"home";"other";1;2;1;"no";"yes";"no";"yes";"yes";"yes";"yes";"yes";4;4;4;1;1;3;20;"15";"14";13 +"GP";"F";18;"U";"GT3";"T";2;4;"services";"at_home";"reputation";"other";1;2;1;"no";"yes";"yes";"yes";"yes";"yes";"yes";"no";4;4;3;1;1;3;8;"14";"12";12 +"GP";"M";20;"U";"GT3";"A";3;2;"services";"other";"course";"other";1;1;0;"no";"no";"no";"yes";"yes";"yes";"no";"no";5;5;3;1;1;5;0;"17";"18";18 +"GP";"M";19;"U";"GT3";"T";4;4;"teacher";"services";"reputation";"other";2;1;1;"no";"yes";"yes";"no";"yes";"yes";"yes";"yes";4;3;4;1;1;4;38;"8";"9";8 +"GP";"M";19;"R";"GT3";"T";3;3;"other";"services";"reputation";"father";1;2;1;"no";"no";"no";"yes";"yes";"yes";"no";"yes";4;5;3;1;2;5;0;"15";"12";12 
+"GP";"F";19;"U";"LE3";"T";1;1;"at_home";"other";"reputation";"other";1;2;1;"yes";"yes";"no";"yes";"no";"yes";"yes";"no";4;4;3;1;3;3;18;"12";"10";10 +"GP";"F";19;"U";"LE3";"T";1;2;"services";"services";"home";"other";1;2;1;"no";"no";"no";"yes";"no";"yes";"no";"yes";4;2;4;2;2;3;0;"9";"9";0 +"GP";"F";19;"U";"GT3";"T";2;1;"at_home";"other";"other";"other";3;2;0;"no";"yes";"no";"no";"yes";"no";"yes";"yes";3;4;1;1;1;2;20;"14";"12";13 +"GP";"M";19;"U";"GT3";"T";1;2;"other";"services";"course";"other";1;2;1;"no";"no";"no";"no";"no";"yes";"yes";"no";4;5;2;2;2;4;3;"13";"11";11 +"GP";"F";19;"U";"LE3";"T";3;2;"services";"other";"reputation";"other";2;2;1;"no";"yes";"yes";"no";"no";"yes";"yes";"yes";4;2;2;1;2;1;22;"13";"10";11 +"GP";"F";19;"U";"GT3";"T";1;1;"at_home";"health";"home";"other";1;3;2;"no";"no";"no";"no";"no";"yes";"yes";"yes";4;1;2;1;1;3;14;"15";"13";13 +"GP";"F";19;"R";"GT3";"T";2;3;"other";"other";"reputation";"other";1;3;1;"no";"no";"no";"no";"yes";"yes";"yes";"yes";4;1;2;1;1;3;40;"13";"11";11 +"GP";"F";18;"U";"GT3";"T";2;1;"services";"other";"course";"mother";2;2;0;"no";"yes";"yes";"yes";"yes";"yes";"yes";"no";5;3;3;1;2;1;0;"8";"8";0 +"GP";"F";18;"U";"GT3";"T";4;3;"other";"other";"course";"mother";1;3;0;"no";"yes";"yes";"yes";"yes";"yes";"yes";"yes";4;3;4;1;1;5;9;"9";"10";9 +"GP";"F";17;"R";"GT3";"T";3;4;"at_home";"services";"course";"father";1;3;0;"no";"yes";"yes";"yes";"no";"yes";"yes";"no";4;3;4;2;5;5;0;"11";"11";10 +"GP";"F";18;"U";"GT3";"T";4;4;"teacher";"other";"course";"mother";1;2;0;"no";"yes";"yes";"no";"yes";"yes";"yes";"no";4;4;4;3;3;5;2;"11";"11";11 +"GP";"F";17;"U";"GT3";"A";4;3;"services";"services";"course";"mother";1;2;0;"no";"yes";"yes";"no";"yes";"yes";"yes";"yes";5;2;2;1;2;5;23;"13";"13";13 +"GP";"F";17;"U";"GT3";"T";2;2;"other";"other";"course";"mother";1;2;0;"no";"yes";"no";"no";"yes";"yes";"no";"yes";4;2;2;1;1;3;12;"11";"9";9 
+"GP";"F";17;"R";"LE3";"T";2;2;"services";"services";"course";"mother";1;3;0;"no";"yes";"yes";"yes";"yes";"yes";"yes";"no";3;3;2;2;2;3;3;"11";"11";11 +"GP";"F";17;"U";"GT3";"T";3;1;"services";"services";"course";"father";1;3;0;"no";"yes";"no";"no";"no";"yes";"yes";"no";3;4;3;2;3;5;1;"12";"14";15 +"GP";"F";17;"U";"LE3";"T";0;2;"at_home";"at_home";"home";"father";2;3;0;"no";"no";"no";"no";"yes";"yes";"yes";"no";3;3;3;2;3;2;0;"16";"15";15 +"GP";"M";18;"U";"GT3";"T";4;4;"other";"other";"course";"mother";1;3;0;"no";"no";"no";"yes";"yes";"yes";"yes";"no";4;3;3;2;2;3;3;"9";"12";11 +"GP";"M";17;"U";"GT3";"T";3;3;"other";"services";"reputation";"mother";1;1;0;"no";"no";"no";"yes";"no";"yes";"yes";"no";4;3;5;3;5;5;3;"14";"15";16 +"GP";"M";17;"R";"GT3";"T";2;2;"services";"other";"course";"mother";4;1;0;"no";"yes";"no";"no";"yes";"yes";"yes";"no";4;4;5;5;5;4;8;"11";"10";10 +"GP";"F";17;"U";"GT3";"T";4;4;"teacher";"services";"course";"mother";1;3;0;"no";"yes";"yes";"yes";"yes";"yes";"yes";"no";5;4;4;1;3;4;7;"10";"9";9 +"GP";"F";17;"U";"GT3";"T";4;4;"teacher";"teacher";"course";"mother";2;3;0;"no";"yes";"yes";"no";"no";"yes";"yes";"yes";4;3;3;1;2;4;4;"14";"14";14 +"GP";"M";18;"U";"LE3";"T";2;2;"other";"other";"course";"mother";1;4;0;"no";"yes";"no";"yes";"yes";"yes";"yes";"no";4;5;5;2;4;5;2;"9";"8";8 +"GP";"F";17;"R";"GT3";"T";2;4;"at_home";"other";"course";"father";1;3;0;"no";"yes";"no";"no";"yes";"yes";"yes";"yes";4;4;3;1;1;5;7;"12";"14";14 +"GP";"F";18;"U";"GT3";"T";3;3;"services";"services";"home";"mother";1;2;0;"no";"no";"no";"yes";"yes";"yes";"yes";"no";5;3;4;1;1;4;0;"7";"0";0 +"GP";"F";18;"U";"LE3";"T";2;2;"other";"other";"home";"other";1;2;0;"no";"no";"no";"yes";"no";"yes";"yes";"yes";4;3;3;1;1;2;0;"8";"8";0 +"GP";"F";18;"R";"GT3";"T";2;2;"at_home";"other";"course";"mother";2;4;0;"no";"no";"no";"yes";"yes";"yes";"no";"no";4;4;4;1;1;4;0;"10";"9";0 
+"GP";"F";17;"U";"GT3";"T";3;4;"services";"other";"course";"mother";1;3;0;"no";"no";"no";"no";"yes";"yes";"yes";"no";4;4;5;1;3;5;16;"16";"15";15 +"GP";"F";19;"R";"GT3";"A";3;1;"services";"at_home";"home";"other";1;3;1;"no";"no";"yes";"no";"yes";"yes";"no";"no";5;4;3;1;2;5;12;"14";"13";13 +"GP";"F";17;"U";"GT3";"T";3;2;"other";"other";"home";"mother";1;2;0;"no";"yes";"yes";"no";"yes";"yes";"yes";"yes";4;3;2;2;3;2;0;"7";"8";0 +"GP";"F";18;"U";"LE3";"T";3;3;"services";"services";"home";"mother";1;4;0;"no";"yes";"no";"no";"yes";"yes";"yes";"no";5;3;3;1;1;1;7;"16";"15";17 +"GP";"F";17;"R";"GT3";"A";3;2;"other";"other";"home";"mother";1;2;0;"no";"yes";"yes";"no";"yes";"yes";"yes";"no";4;3;3;2;3;2;4;"9";"10";10 +"GP";"F";19;"U";"GT3";"T";2;1;"services";"services";"home";"other";1;3;1;"no";"no";"yes";"yes";"yes";"yes";"yes";"yes";4;3;4;1;3;3;4;"11";"12";11 +"GP";"M";18;"U";"GT3";"T";4;4;"teacher";"services";"home";"father";1;2;1;"no";"yes";"no";"yes";"yes";"yes";"yes";"no";4;3;3;2;2;2;0;"10";"10";0 +"GP";"M";18;"U";"LE3";"T";3;4;"services";"other";"home";"mother";1;2;0;"no";"no";"no";"yes";"yes";"yes";"yes";"yes";4;3;3;1;3;5;11;"16";"15";15 +"GP";"F";17;"U";"GT3";"A";2;2;"at_home";"at_home";"home";"father";1;2;1;"no";"yes";"no";"no";"yes";"yes";"yes";"yes";3;3;1;1;2;4;0;"9";"8";0 +"GP";"F";18;"U";"GT3";"T";2;3;"at_home";"other";"course";"mother";1;3;0;"no";"yes";"no";"no";"yes";"yes";"yes";"no";4;3;3;1;2;3;4;"11";"10";10 +"GP";"F";18;"U";"GT3";"T";3;2;"other";"services";"other";"mother";1;3;0;"no";"no";"no";"no";"yes";"yes";"yes";"yes";5;4;3;2;3;1;7;"13";"13";14 +"GP";"M";18;"R";"GT3";"T";4;3;"teacher";"services";"course";"mother";1;3;0;"no";"no";"no";"no";"yes";"yes";"yes";"yes";5;3;2;1;2;4;9;"16";"15";16 +"GP";"M";18;"U";"GT3";"T";4;3;"teacher";"other";"course";"mother";1;3;0;"no";"yes";"yes";"no";"yes";"yes";"yes";"yes";5;4;5;2;3;5;0;"10";"10";9 
+"GP";"F";17;"U";"GT3";"T";4;3;"health";"other";"reputation";"mother";1;3;0;"no";"yes";"yes";"yes";"yes";"yes";"yes";"yes";4;4;3;1;3;4;0;"13";"15";15 +"MS";"M";18;"R";"GT3";"T";3;2;"other";"other";"course";"mother";2;1;1;"no";"yes";"no";"no";"no";"yes";"yes";"no";2;5;5;5;5;5;10;"11";"13";13 +"MS";"M";19;"R";"GT3";"T";1;1;"other";"services";"home";"other";3;2;3;"no";"no";"no";"no";"yes";"yes";"yes";"no";5;4;4;3;3;2;8;"8";"7";8 +"MS";"M";17;"U";"GT3";"T";3;3;"health";"other";"course";"mother";2;2;0;"no";"yes";"yes";"no";"yes";"yes";"yes";"no";4;5;4;2;3;3;2;"13";"13";13 +"MS";"M";18;"U";"LE3";"T";1;3;"at_home";"services";"course";"mother";1;1;1;"no";"no";"no";"no";"yes";"no";"yes";"yes";4;3;3;2;3;3;7;"8";"7";8 +"MS";"M";19;"R";"GT3";"T";1;1;"other";"other";"home";"other";3;1;1;"no";"yes";"no";"no";"yes";"yes";"yes";"no";4;4;4;3;3;5;4;"8";"8";8 +"MS";"M";17;"R";"GT3";"T";4;3;"services";"other";"home";"mother";2;2;0;"no";"yes";"yes";"yes";"no";"yes";"yes";"yes";4;5;5;1;3;2;4;"13";"11";11 +"MS";"F";18;"U";"GT3";"T";3;3;"services";"services";"course";"father";1;2;0;"no";"yes";"no";"no";"yes";"yes";"no";"yes";5;3;4;1;1;5;0;"10";"9";9 +"MS";"F";17;"R";"GT3";"T";4;4;"teacher";"services";"other";"father";2;2;0;"no";"yes";"yes";"yes";"yes";"yes";"yes";"no";4;3;3;1;2;5;4;"12";"13";13 +"MS";"F";17;"U";"LE3";"A";3;2;"services";"other";"reputation";"mother";2;2;0;"no";"no";"no";"no";"yes";"yes";"no";"yes";1;2;3;1;2;5;2;"12";"12";11 +"MS";"M";18;"U";"LE3";"T";1;1;"other";"services";"home";"father";2;1;0;"no";"no";"no";"no";"no";"yes";"yes";"yes";3;3;2;1;2;3;4;"10";"10";10 +"MS";"F";18;"U";"LE3";"T";1;1;"at_home";"services";"course";"father";2;3;0;"no";"no";"no";"no";"yes";"yes";"yes";"no";5;3;2;1;1;4;0;"18";"16";16 +"MS";"F";18;"R";"LE3";"A";1;4;"at_home";"other";"course";"mother";3;2;0;"no";"no";"no";"no";"yes";"yes";"no";"yes";4;3;4;1;4;5;0;"13";"13";13 
+"MS";"M";18;"R";"LE3";"T";1;1;"at_home";"other";"other";"mother";2;2;1;"no";"no";"no";"yes";"no";"no";"no";"no";4;4;3;2;3;5;2;"13";"12";12 +"MS";"F";18;"U";"GT3";"T";3;3;"services";"services";"other";"mother";2;2;0;"no";"yes";"no";"no";"yes";"yes";"yes";"yes";4;3;2;1;3;3;0;"11";"11";10 +"MS";"F";17;"U";"LE3";"T";4;4;"at_home";"at_home";"course";"mother";1;2;0;"no";"yes";"yes";"yes";"yes";"yes";"yes";"yes";2;3;4;1;1;1;0;"16";"15";15 +"MS";"F";17;"R";"GT3";"T";1;2;"other";"services";"course";"father";2;2;0;"no";"no";"no";"no";"no";"yes";"no";"no";3;2;2;1;2;3;0;"12";"11";12 +"MS";"M";18;"R";"GT3";"T";1;3;"at_home";"other";"course";"mother";2;2;0;"no";"yes";"yes";"no";"yes";"yes";"no";"no";3;3;4;2;4;3;4;"10";"10";10 +"MS";"M";18;"U";"LE3";"T";4;4;"teacher";"services";"other";"mother";2;3;0;"no";"no";"yes";"no";"yes";"yes";"yes";"yes";4;2;2;2;2;5;0;"13";"13";13 +"MS";"F";17;"R";"GT3";"T";1;1;"other";"services";"reputation";"mother";3;1;1;"no";"yes";"yes";"no";"yes";"yes";"yes";"yes";5;2;1;1;2;1;0;"7";"6";0 +"MS";"F";18;"U";"GT3";"T";2;3;"at_home";"services";"course";"father";2;1;0;"no";"yes";"yes";"no";"yes";"yes";"yes";"yes";5;2;3;1;2;4;0;"11";"10";10 +"MS";"F";18;"R";"GT3";"T";4;4;"other";"teacher";"other";"father";3;2;0;"no";"yes";"yes";"no";"no";"yes";"yes";"yes";3;2;2;4;2;5;10;"14";"12";11 +"MS";"F";19;"U";"LE3";"T";3;2;"services";"services";"home";"other";2;2;2;"no";"no";"no";"yes";"yes";"yes";"no";"yes";3;2;2;1;1;3;4;"7";"7";9 +"MS";"M";18;"R";"LE3";"T";1;2;"at_home";"services";"other";"father";3;1;0;"no";"yes";"yes";"yes";"yes";"no";"yes";"yes";4;3;3;2;3;3;3;"14";"12";12 +"MS";"F";17;"U";"GT3";"T";2;2;"other";"at_home";"home";"mother";1;3;0;"no";"no";"no";"yes";"yes";"yes";"no";"yes";3;4;3;1;1;3;8;"13";"11";11 +"MS";"F";17;"R";"GT3";"T";1;2;"other";"other";"course";"mother";1;1;0;"no";"no";"no";"yes";"yes";"yes";"yes";"no";3;5;5;1;3;1;14;"6";"5";5 
+"MS";"F";18;"R";"LE3";"T";4;4;"other";"other";"reputation";"mother";2;3;0;"no";"no";"no";"no";"yes";"yes";"yes";"no";5;4;4;1;1;1;0;"19";"18";19 +"MS";"F";18;"R";"GT3";"T";1;1;"other";"other";"home";"mother";4;3;0;"no";"no";"no";"no";"yes";"yes";"yes";"no";4;3;2;1;2;4;2;"8";"8";10 +"MS";"F";20;"U";"GT3";"T";4;2;"health";"other";"course";"other";2;3;2;"no";"yes";"yes";"no";"no";"yes";"yes";"yes";5;4;3;1;1;3;4;"15";"14";15 +"MS";"F";18;"R";"LE3";"T";4;4;"teacher";"services";"course";"mother";1;2;0;"no";"no";"yes";"yes";"yes";"yes";"yes";"no";5;4;3;3;4;2;4;"8";"9";10 +"MS";"F";18;"U";"GT3";"T";3;3;"other";"other";"home";"mother";1;2;0;"no";"no";"yes";"no";"yes";"yes";"yes";"yes";4;1;3;1;2;1;0;"15";"15";15 +"MS";"F";17;"R";"GT3";"T";3;1;"at_home";"other";"reputation";"mother";1;2;0;"no";"yes";"yes";"yes";"no";"yes";"yes";"no";4;5;4;2;3;1;17;"10";"10";10 +"MS";"M";18;"U";"GT3";"T";4;4;"teacher";"teacher";"home";"father";1;2;0;"no";"no";"yes";"yes";"no";"yes";"yes";"no";3;2;4;1;4;2;4;"15";"14";14 +"MS";"M";18;"R";"GT3";"T";2;1;"other";"other";"other";"mother";2;1;0;"no";"no";"no";"yes";"no";"yes";"yes";"yes";4;4;3;1;3;5;5;"7";"6";7 +"MS";"M";17;"U";"GT3";"T";2;3;"other";"services";"home";"father";2;2;0;"no";"no";"no";"yes";"yes";"yes";"yes";"no";4;4;3;1;1;3;2;"11";"11";10 +"MS";"M";19;"R";"GT3";"T";1;1;"other";"services";"other";"mother";2;1;1;"no";"no";"no";"no";"yes";"yes";"no";"no";4;3;2;1;3;5;0;"6";"5";0 +"MS";"M";18;"R";"GT3";"T";4;2;"other";"other";"home";"father";2;1;1;"no";"no";"yes";"no";"yes";"yes";"no";"no";5;4;3;4;3;3;14;"6";"5";5 +"MS";"F";18;"R";"GT3";"T";2;2;"at_home";"other";"other";"mother";2;3;0;"no";"no";"yes";"no";"yes";"yes";"no";"no";5;3;3;1;3;4;2;"10";"9";10 +"MS";"F";18;"R";"GT3";"T";4;4;"teacher";"at_home";"reputation";"mother";3;1;0;"no";"yes";"yes";"yes";"yes";"yes";"yes";"yes";4;4;3;2;2;5;7;"6";"5";6 +"MS";"F";19;"R";"GT3";"T";2;3;"services";"other";"course";"mother";1;3;1;"no";"no";"no";"yes";"no";"yes";"yes";"no";5;4;2;1;2;5;0;"7";"5";0 
+"MS";"F";18;"U";"LE3";"T";3;1;"teacher";"services";"course";"mother";1;2;0;"no";"yes";"yes";"no";"yes";"yes";"yes";"no";4;3;4;1;1;1;0;"7";"9";8 +"MS";"F";18;"U";"GT3";"T";1;1;"other";"other";"course";"mother";2;2;1;"no";"no";"no";"yes";"yes";"yes";"no";"no";1;1;1;1;1;5;0;"6";"5";0 +"MS";"M";20;"U";"LE3";"A";2;2;"services";"services";"course";"other";1;2;2;"no";"yes";"yes";"no";"yes";"yes";"no";"no";5;5;4;4;5;4;11;"9";"9";9 +"MS";"M";17;"U";"LE3";"T";3;1;"services";"services";"course";"mother";2;1;0;"no";"no";"no";"no";"no";"yes";"yes";"no";2;4;5;3;4;2;3;"14";"16";16 +"MS";"M";21;"R";"GT3";"T";1;1;"other";"other";"course";"other";1;1;3;"no";"no";"no";"no";"no";"yes";"no";"no";5;5;3;3;3;3;3;"10";"8";7 +"MS";"M";18;"R";"LE3";"T";3;2;"services";"other";"course";"mother";3;1;0;"no";"no";"no";"no";"no";"yes";"yes";"no";4;4;1;3;4;5;0;"11";"12";10 +"MS";"M";19;"U";"LE3";"T";1;1;"other";"at_home";"course";"father";1;1;0;"no";"no";"no";"no";"yes";"yes";"yes";"no";3;2;3;3;3;5;5;"8";"9";9 diff --git a/student-por.csv b/student-por.csv new file mode 100755 index 000000000..2e461960b --- /dev/null +++ b/student-por.csv @@ -0,0 +1,650 @@ +school;sex;age;address;famsize;Pstatus;Medu;Fedu;Mjob;Fjob;reason;guardian;traveltime;studytime;failures;schoolsup;famsup;paid;activities;nursery;higher;internet;romantic;famrel;freetime;goout;Dalc;Walc;health;absences;G1;G2;G3 +"GP";"F";18;"U";"GT3";"A";4;4;"at_home";"teacher";"course";"mother";2;2;0;"yes";"no";"no";"no";"yes";"yes";"no";"no";4;3;4;1;1;3;4;"0";"11";11 +"GP";"F";17;"U";"GT3";"T";1;1;"at_home";"other";"course";"father";1;2;0;"no";"yes";"no";"no";"no";"yes";"yes";"no";5;3;3;1;1;3;2;"9";"11";11 +"GP";"F";15;"U";"LE3";"T";1;1;"at_home";"other";"other";"mother";1;2;0;"yes";"no";"no";"no";"yes";"yes";"yes";"no";4;3;2;2;3;3;6;"12";"13";12 +"GP";"F";15;"U";"GT3";"T";4;2;"health";"services";"home";"mother";1;3;0;"no";"yes";"no";"yes";"yes";"yes";"yes";"yes";3;2;2;1;1;5;0;"14";"14";14 
+"GP";"F";16;"U";"GT3";"T";3;3;"other";"other";"home";"father";1;2;0;"no";"yes";"no";"no";"yes";"yes";"no";"no";4;3;2;1;2;5;0;"11";"13";13 +"GP";"M";16;"U";"LE3";"T";4;3;"services";"other";"reputation";"mother";1;2;0;"no";"yes";"no";"yes";"yes";"yes";"yes";"no";5;4;2;1;2;5;6;"12";"12";13 +"GP";"M";16;"U";"LE3";"T";2;2;"other";"other";"home";"mother";1;2;0;"no";"no";"no";"no";"yes";"yes";"yes";"no";4;4;4;1;1;3;0;"13";"12";13 +"GP";"F";17;"U";"GT3";"A";4;4;"other";"teacher";"home";"mother";2;2;0;"yes";"yes";"no";"no";"yes";"yes";"no";"no";4;1;4;1;1;1;2;"10";"13";13 +"GP";"M";15;"U";"LE3";"A";3;2;"services";"other";"home";"mother";1;2;0;"no";"yes";"no";"no";"yes";"yes";"yes";"no";4;2;2;1;1;1;0;"15";"16";17 +"GP";"M";15;"U";"GT3";"T";3;4;"other";"other";"home";"mother";1;2;0;"no";"yes";"no";"yes";"yes";"yes";"yes";"no";5;5;1;1;1;5;0;"12";"12";13 +"GP";"F";15;"U";"GT3";"T";4;4;"teacher";"health";"reputation";"mother";1;2;0;"no";"yes";"no";"no";"yes";"yes";"yes";"no";3;3;3;1;2;2;2;"14";"14";14 +"GP";"F";15;"U";"GT3";"T";2;1;"services";"other";"reputation";"father";3;3;0;"no";"yes";"no";"yes";"yes";"yes";"yes";"no";5;2;2;1;1;4;0;"10";"12";13 +"GP";"M";15;"U";"LE3";"T";4;4;"health";"services";"course";"father";1;1;0;"no";"yes";"no";"yes";"yes";"yes";"yes";"no";4;3;3;1;3;5;0;"12";"13";12 +"GP";"M";15;"U";"GT3";"T";4;3;"teacher";"other";"course";"mother";2;2;0;"no";"yes";"no";"no";"yes";"yes";"yes";"no";5;4;3;1;2;3;0;"12";"12";13 +"GP";"M";15;"U";"GT3";"A";2;2;"other";"other";"home";"other";1;3;0;"no";"yes";"no";"no";"yes";"yes";"yes";"yes";4;5;2;1;1;3;0;"14";"14";15 +"GP";"F";16;"U";"GT3";"T";4;4;"health";"other";"home";"mother";1;1;0;"no";"yes";"no";"no";"yes";"yes";"yes";"no";4;4;4;1;2;2;6;"17";"17";17 +"GP";"F";16;"U";"GT3";"T";4;4;"services";"services";"reputation";"mother";1;3;0;"no";"yes";"no";"yes";"yes";"yes";"yes";"no";3;2;3;1;2;2;10;"13";"13";14 
+"GP";"F";16;"U";"GT3";"T";3;3;"other";"other";"reputation";"mother";3;2;0;"yes";"yes";"no";"yes";"yes";"yes";"no";"no";5;3;2;1;1;4;2;"13";"14";14 +"GP";"M";17;"U";"GT3";"T";3;2;"services";"services";"course";"mother";1;1;3;"no";"yes";"yes";"yes";"yes";"yes";"yes";"no";5;5;5;2;4;5;2;"8";"8";7 +"GP";"M";16;"U";"LE3";"T";4;3;"health";"other";"home";"father";1;1;0;"no";"no";"no";"yes";"yes";"yes";"yes";"no";3;1;3;1;3;5;6;"12";"12";12 +"GP";"M";15;"U";"GT3";"T";4;3;"teacher";"other";"reputation";"mother";1;2;0;"no";"no";"no";"no";"yes";"yes";"yes";"no";4;4;1;1;1;1;0;"12";"13";14 +"GP";"M";15;"U";"GT3";"T";4;4;"health";"health";"other";"father";1;1;0;"no";"yes";"yes";"no";"yes";"yes";"yes";"no";5;4;2;1;1;5;0;"11";"12";12 +"GP";"M";16;"U";"LE3";"T";4;2;"teacher";"other";"course";"mother";1;2;0;"no";"no";"no";"yes";"yes";"yes";"yes";"no";4;5;1;1;3;5;0;"12";"13";14 +"GP";"M";16;"U";"LE3";"T";2;2;"other";"other";"reputation";"mother";2;2;0;"no";"yes";"no";"yes";"yes";"yes";"yes";"no";5;4;4;2;4;5;2;"10";"10";10 +"GP";"F";15;"R";"GT3";"T";2;4;"services";"health";"course";"mother";1;3;0;"yes";"yes";"no";"yes";"yes";"yes";"yes";"no";4;3;2;1;1;5;2;"10";"11";10 +"GP";"F";16;"U";"GT3";"T";2;2;"services";"services";"home";"mother";1;1;0;"no";"yes";"no";"no";"no";"yes";"yes";"no";1;2;2;1;3;5;6;"10";"11";12 +"GP";"M";15;"U";"GT3";"T";2;2;"other";"other";"home";"mother";1;1;0;"no";"yes";"no";"no";"yes";"yes";"yes";"no";4;2;2;1;2;5;8;"11";"12";12 +"GP";"M";15;"U";"GT3";"T";4;2;"health";"services";"other";"mother";1;1;0;"no";"no";"no";"no";"yes";"yes";"yes";"no";2;2;4;2;4;1;0;"11";"11";11 +"GP";"M";16;"U";"LE3";"A";3;4;"services";"other";"home";"mother";1;2;0;"yes";"yes";"yes";"yes";"yes";"yes";"yes";"no";5;3;3;1;1;5;2;"12";"12";13 +"GP";"M";16;"U";"GT3";"T";4;4;"teacher";"teacher";"home";"mother";1;2;0;"no";"yes";"yes";"yes";"yes";"yes";"yes";"yes";4;4;5;5;5;5;4;"12";"11";12 
+"GP";"M";15;"U";"GT3";"T";4;4;"health";"services";"home";"mother";1;2;0;"no";"yes";"yes";"no";"no";"yes";"yes";"no";5;4;2;3;4;5;0;"10";"11";11 +"GP";"M";15;"U";"GT3";"T";4;4;"services";"services";"reputation";"mother";2;2;0;"no";"yes";"no";"yes";"yes";"yes";"yes";"no";4;3;1;1;1;5;2;"15";"15";15 +"GP";"M";15;"R";"GT3";"T";4;3;"teacher";"at_home";"course";"mother";1;2;0;"no";"yes";"no";"yes";"yes";"yes";"yes";"yes";4;5;2;1;1;5;0;"13";"14";15 +"GP";"M";15;"U";"LE3";"T";3;3;"other";"other";"course";"mother";1;2;0;"no";"no";"no";"yes";"no";"yes";"yes";"no";5;3;2;1;1;2;0;"13";"12";12 +"GP";"M";16;"U";"GT3";"T";3;2;"other";"other";"home";"mother";1;1;0;"no";"yes";"no";"no";"no";"yes";"yes";"no";5;4;3;1;1;5;4;"12";"12";12 +"GP";"F";15;"U";"GT3";"T";2;3;"other";"other";"other";"father";2;1;0;"no";"yes";"no";"yes";"yes";"yes";"no";"no";3;5;1;1;1;5;4;"11";"11";11 +"GP";"M";15;"U";"LE3";"T";4;3;"teacher";"services";"home";"mother";1;3;0;"no";"yes";"no";"yes";"yes";"yes";"yes";"no";5;4;3;1;1;4;0;"14";"14";14 +"GP";"M";16;"R";"GT3";"A";4;4;"other";"teacher";"reputation";"mother";2;3;0;"no";"yes";"no";"yes";"yes";"yes";"yes";"yes";2;4;3;1;1;5;4;"13";"13";13 +"GP";"F";15;"R";"GT3";"T";3;4;"services";"health";"course";"mother";1;3;0;"yes";"yes";"no";"yes";"yes";"yes";"yes";"no";4;3;2;1;1;5;2;"11";"12";12 +"GP";"F";15;"R";"GT3";"T";2;2;"at_home";"other";"reputation";"mother";1;1;0;"yes";"yes";"no";"yes";"yes";"yes";"no";"no";4;3;1;1;1;2;8;"14";"13";12 +"GP";"F";16;"U";"LE3";"T";2;2;"other";"other";"home";"mother";2;2;0;"no";"yes";"no";"yes";"no";"yes";"yes";"yes";3;3;3;1;2;3;16;"11";"11";10 +"GP";"M";15;"U";"LE3";"T";4;4;"teacher";"other";"home";"other";1;1;0;"no";"yes";"no";"no";"no";"yes";"yes";"yes";5;4;3;2;4;5;8;"10";"11";11 +"GP";"M";15;"U";"GT3";"T";4;4;"services";"teacher";"course";"father";1;2;0;"no";"yes";"no";"yes";"yes";"yes";"yes";"no";4;3;3;1;1;5;0;"14";"15";15 
+"GP";"M";15;"U";"GT3";"T";2;2;"services";"services";"course";"father";1;1;0;"yes";"yes";"no";"no";"yes";"yes";"yes";"no";5;4;1;1;1;1;0;"9";"10";10 +"GP";"F";16;"U";"LE3";"T";2;2;"other";"at_home";"course";"father";2;2;1;"yes";"no";"no";"yes";"yes";"yes";"yes";"no";4;3;3;2;2;5;14;"10";"11";11 +"GP";"F";15;"U";"LE3";"A";4;3;"other";"other";"course";"mother";1;2;0;"yes";"yes";"yes";"yes";"yes";"yes";"yes";"yes";5;2;2;1;1;5;4;"10";"11";11 +"GP";"F";16;"U";"LE3";"A";3;3;"other";"services";"home";"mother";1;2;0;"no";"yes";"no";"no";"yes";"yes";"yes";"no";2;3;5;1;4;3;6;"13";"12";13 +"GP";"M";16;"U";"GT3";"T";4;3;"health";"services";"reputation";"mother";1;4;0;"no";"no";"no";"yes";"yes";"yes";"yes";"no";4;2;2;1;1;2;2;"17";"17";17 +"GP";"M";15;"U";"GT3";"T";4;2;"teacher";"other";"home";"mother";1;2;0;"no";"yes";"no";"no";"yes";"yes";"no";"no";4;3;3;2;2;5;4;"11";"12";13 +"GP";"F";15;"U";"GT3";"T";4;4;"services";"teacher";"other";"father";1;2;0;"yes";"yes";"no";"yes";"no";"yes";"yes";"no";4;4;4;1;1;3;2;"13";"12";12 +"GP";"F";16;"U";"LE3";"T";2;2;"services";"services";"course";"mother";3;2;0;"no";"yes";"no";"no";"yes";"yes";"yes";"no";4;3;3;2;3;4;0;"14";"13";13 +"GP";"F";15;"U";"LE3";"T";4;2;"health";"other";"other";"mother";1;2;0;"no";"yes";"no";"no";"yes";"yes";"yes";"no";4;3;3;1;1;5;0;"16";"14";16 +"GP";"M";15;"U";"LE3";"A";4;2;"health";"health";"other";"father";2;1;0;"no";"no";"no";"no";"yes";"yes";"no";"no";5;5;5;3;4;5;4;"10";"9";9 +"GP";"F";15;"U";"GT3";"T";4;4;"services";"services";"course";"mother";1;1;0;"yes";"yes";"no";"no";"yes";"yes";"yes";"no";3;3;4;2;3;5;0;"13";"12";12 +"GP";"F";15;"U";"LE3";"A";3;3;"other";"other";"other";"mother";1;1;0;"no";"no";"no";"no";"yes";"yes";"yes";"no";5;3;4;4;4;1;0;"13";"12";13 +"GP";"F";16;"U";"GT3";"A";2;1;"other";"other";"other";"mother";1;2;0;"no";"no";"no";"yes";"yes";"yes";"yes";"yes";5;3;4;1;1;2;2;"12";"13";12 
+"GP";"F";15;"U";"GT3";"A";4;3;"services";"services";"reputation";"mother";1;2;0;"no";"yes";"no";"yes";"yes";"yes";"yes";"no";4;3;2;1;1;1;0;"15";"14";15 +"GP";"M";15;"U";"GT3";"T";4;4;"teacher";"health";"reputation";"mother";1;2;0;"no";"yes";"no";"yes";"yes";"yes";"no";"no";3;2;2;1;1;5;8;"15";"15";16 +"GP";"M";15;"U";"LE3";"T";1;2;"other";"at_home";"home";"father";1;2;0;"yes";"yes";"no";"yes";"yes";"yes";"yes";"no";4;3;2;1;1;5;0;"14";"13";14 +"GP";"F";16;"U";"GT3";"T";4;2;"services";"other";"course";"mother";1;2;0;"no";"yes";"yes";"no";"yes";"yes";"yes";"no";4;2;3;1;1;5;2;"16";"15";16 +"GP";"F";16;"R";"GT3";"T";4;4;"health";"teacher";"other";"mother";1;2;0;"no";"yes";"no";"yes";"yes";"yes";"no";"no";2;4;4;2;3;4;0;"17";"16";16 +"GP";"F";16;"U";"GT3";"T";1;1;"services";"services";"course";"father";4;1;0;"yes";"yes";"no";"yes";"no";"yes";"yes";"yes";5;5;5;5;5;5;0;"10";"10";16 +"GP";"F";16;"U";"LE3";"T";1;2;"other";"services";"reputation";"father";1;2;0;"yes";"no";"no";"yes";"yes";"yes";"yes";"no";4;4;3;1;1;1;0;"13";"13";10 +"GP";"F";16;"U";"GT3";"T";4;3;"teacher";"health";"home";"mother";1;3;0;"yes";"yes";"no";"yes";"yes";"yes";"yes";"no";3;4;4;2;4;4;0;"14";"13";13 +"GP";"F";15;"U";"LE3";"T";4;3;"services";"services";"reputation";"father";1;2;0;"yes";"no";"no";"yes";"yes";"yes";"yes";"yes";4;4;4;2;4;2;0;"13";"12";12 +"GP";"F";16;"U";"LE3";"T";4;3;"teacher";"services";"course";"mother";3;2;0;"no";"yes";"no";"yes";"yes";"yes";"yes";"no";5;4;3;1;2;1;2;"16";"15";16 +"GP";"M";15;"U";"GT3";"A";4;4;"other";"services";"reputation";"mother";1;4;0;"no";"yes";"no";"yes";"no";"yes";"yes";"yes";1;3;3;5;5;3;0;"11";"12";12 +"GP";"F";16;"U";"GT3";"T";3;1;"services";"other";"course";"mother";1;4;0;"yes";"yes";"no";"no";"yes";"yes";"yes";"no";4;3;3;1;2;5;0;"10";"9";10 +"GP";"F";15;"R";"LE3";"T";2;2;"health";"services";"reputation";"mother";2;2;0;"yes";"yes";"no";"no";"yes";"yes";"yes";"no";4;1;3;1;3;4;0;"11";"10";11 
+"GP";"F";15;"R";"LE3";"T";3;1;"other";"other";"reputation";"father";2;4;0;"no";"yes";"no";"no";"no";"yes";"yes";"no";4;4;2;2;3;3;6;"15";"15";15 +"GP";"M";16;"U";"GT3";"T";3;1;"other";"other";"reputation";"father";2;4;0;"no";"yes";"no";"no";"yes";"yes";"yes";"no";4;3;2;1;1;5;2;"13";"11";11 +"GP";"M";15;"U";"GT3";"T";4;2;"other";"other";"course";"mother";1;4;0;"no";"no";"no";"no";"yes";"yes";"yes";"no";3;3;3;1;1;3;0;"11";"9";10 +"GP";"F";15;"R";"GT3";"T";1;1;"other";"other";"reputation";"mother";1;2;0;"yes";"yes";"no";"no";"no";"yes";"yes";"yes";3;3;4;2;4;5;2;"13";"11";11 +"GP";"M";16;"U";"GT3";"T";3;1;"other";"other";"reputation";"mother";1;1;0;"no";"no";"no";"yes";"yes";"yes";"no";"no";5;3;2;2;2;5;0;"13";"13";14 +"GP";"F";16;"U";"GT3";"T";3;3;"other";"services";"home";"mother";1;2;0;"yes";"yes";"no";"yes";"yes";"yes";"yes";"no";4;3;3;2;4;5;4;"11";"11";11 +"GP";"M";15;"U";"GT3";"T";4;3;"teacher";"other";"home";"mother";1;2;0;"no";"yes";"no";"yes";"yes";"yes";"yes";"no";4;3;3;2;3;5;0;"11";"11";11 +"GP";"M";15;"U";"GT3";"T";4;0;"teacher";"other";"course";"mother";2;4;0;"no";"no";"no";"yes";"yes";"yes";"yes";"no";3;4;3;1;1;1;0;"12";"11";11 +"GP";"F";16;"U";"GT3";"T";2;2;"other";"other";"reputation";"mother";1;4;0;"no";"no";"no";"no";"yes";"yes";"yes";"yes";5;2;3;1;3;3;1;"13";"13";13 +"GP";"M";17;"U";"GT3";"T";2;1;"other";"other";"home";"mother";2;1;3;"yes";"yes";"no";"yes";"yes";"no";"yes";"no";4;5;1;1;1;3;0;"9";"9";10 +"GP";"F";16;"U";"GT3";"T";3;4;"at_home";"other";"course";"mother";1;2;0;"no";"yes";"no";"no";"yes";"yes";"yes";"no";2;4;3;1;2;3;14;"12";"11";11 +"GP";"M";15;"U";"GT3";"T";2;3;"other";"services";"course";"father";1;1;0;"yes";"yes";"no";"yes";"no";"yes";"yes";"yes";3;2;2;1;3;3;0;"11";"11";12 +"GP";"M";15;"U";"GT3";"T";2;3;"other";"other";"home";"mother";1;3;0;"yes";"no";"no";"no";"no";"yes";"yes";"no";5;3;2;1;2;5;2;"10";"9";9 
+"GP";"F";15;"U";"LE3";"T";3;2;"services";"other";"reputation";"mother";1;2;0;"no";"yes";"no";"no";"yes";"yes";"yes";"no";4;4;4;1;1;5;4;"12";"11";11 +"GP";"M";15;"U";"LE3";"T";2;2;"services";"services";"home";"mother";2;2;0;"no";"no";"yes";"yes";"yes";"yes";"yes";"no";5;3;3;1;3;4;2;"13";"12";13 +"GP";"F";15;"U";"GT3";"T";1;1;"other";"other";"home";"father";1;2;0;"no";"yes";"no";"yes";"no";"yes";"yes";"no";4;3;2;2;3;4;2;"13";"12";12 +"GP";"F";15;"U";"GT3";"T";4;4;"services";"services";"reputation";"father";2;2;0;"no";"no";"no";"no";"yes";"yes";"yes";"yes";4;4;4;2;3;5;4;"12";"11";12 +"GP";"F";16;"U";"LE3";"T";2;2;"at_home";"other";"course";"mother";1;2;0;"no";"yes";"no";"no";"yes";"yes";"no";"no";4;3;4;1;2;2;6;"13";"11";11 +"GP";"F";15;"U";"GT3";"T";4;2;"other";"other";"reputation";"mother";1;3;0;"no";"yes";"no";"yes";"yes";"yes";"yes";"no";5;3;3;1;3;1;4;"15";"15";15 +"GP";"M";16;"U";"GT3";"T";2;2;"services";"other";"reputation";"father";2;2;0;"no";"no";"no";"yes";"no";"yes";"yes";"no";4;4;2;1;1;3;6;"12";"10";11 +"GP";"M";16;"U";"LE3";"A";4;4;"teacher";"health";"reputation";"mother";1;2;0;"no";"yes";"no";"no";"yes";"yes";"no";"no";4;1;3;3;5;5;6;"9";"9";10 +"GP";"F";16;"U";"GT3";"T";3;3;"other";"other";"home";"mother";1;3;0;"no";"yes";"no";"no";"yes";"yes";"yes";"yes";4;3;3;1;3;4;2;"9";"11";11 +"GP";"F";15;"U";"GT3";"T";4;3;"services";"other";"reputation";"mother";1;1;0;"no";"no";"no";"yes";"yes";"yes";"yes";"no";4;5;5;1;3;1;6;"14";"13";13 +"GP";"F";16;"U";"LE3";"T";3;1;"other";"other";"home";"father";1;2;0;"yes";"yes";"no";"no";"yes";"yes";"no";"no";3;3;3;2;3;2;0;"12";"13";12 +"GP";"F";16;"U";"GT3";"T";4;2;"teacher";"services";"home";"mother";2;2;0;"no";"yes";"no";"yes";"yes";"yes";"yes";"no";5;3;3;1;1;1;2;"13";"14";14 +"GP";"M";15;"U";"LE3";"T";2;2;"services";"health";"reputation";"mother";1;4;0;"no";"yes";"no";"yes";"yes";"yes";"yes";"no";4;3;4;1;1;4;2;"11";"12";12 
+"GP";"F";15;"R";"GT3";"T";1;1;"at_home";"other";"home";"mother";2;4;0;"yes";"yes";"yes";"yes";"yes";"yes";"yes";"no";3;1;2;1;1;1;4;"13";"13";13 +"GP";"M";16;"R";"GT3";"T";4;3;"services";"other";"reputation";"mother";2;1;0;"yes";"yes";"yes";"yes";"no";"yes";"yes";"no";3;3;3;1;1;4;6;"9";"11";11 +"GP";"F";16;"U";"GT3";"T";2;1;"other";"other";"course";"mother";1;2;0;"no";"yes";"no";"no";"yes";"yes";"no";"yes";4;3;5;1;1;5;0;"13";"12";12 +"GP";"F";16;"U";"GT3";"T";4;4;"other";"other";"reputation";"mother";1;1;0;"no";"no";"no";"yes";"no";"yes";"yes";"no";5;3;4;1;2;1;4;"12";"13";13 +"GP";"F";16;"U";"GT3";"T";4;3;"other";"at_home";"course";"mother";1;3;0;"yes";"yes";"no";"no";"yes";"yes";"yes";"no";5;3;5;1;1;3;2;"12";"13";13 +"GP";"M";16;"U";"GT3";"T";4;4;"services";"services";"other";"mother";1;1;0;"yes";"yes";"no";"yes";"yes";"yes";"yes";"no";4;5;5;5;5;4;12;"9";"9";8 +"GP";"M";16;"U";"GT3";"T";4;4;"services";"teacher";"other";"father";1;3;0;"no";"yes";"no";"yes";"yes";"yes";"yes";"yes";4;4;3;1;1;4;0;"16";"16";16 +"GP";"M";15;"U";"GT3";"T";4;4;"services";"other";"course";"mother";1;1;0;"no";"yes";"yes";"yes";"no";"yes";"yes";"no";5;3;3;1;1;5;2;"12";"13";12 +"GP";"F";15;"U";"GT3";"T";3;2;"services";"other";"home";"mother";2;2;0;"yes";"yes";"no";"no";"yes";"yes";"yes";"no";4;3;5;1;1;2;16;"11";"10";10 +"GP";"M";15;"U";"GT3";"A";3;4;"services";"other";"course";"mother";1;2;0;"no";"yes";"no";"yes";"yes";"yes";"yes";"no";5;4;4;1;1;1;0;"16";"16";16 +"GP";"F";15;"U";"GT3";"A";3;3;"other";"health";"reputation";"father";1;4;0;"yes";"no";"no";"no";"yes";"yes";"no";"no";4;3;3;1;1;4;10;"10";"10";10 +"GP";"F";15;"U";"GT3";"T";2;2;"other";"other";"course";"mother";1;4;0;"yes";"yes";"no";"no";"yes";"yes";"yes";"no";5;1;2;1;1;3;4;"10";"10";10 +"GP";"M";16;"U";"GT3";"T";3;3;"services";"other";"home";"father";1;3;0;"no";"yes";"no";"yes";"yes";"yes";"yes";"no";5;3;3;1;1;5;4;"13";"14";14 
+"GP";"M";15;"R";"GT3";"T";4;4;"other";"other";"home";"father";4;4;0;"no";"yes";"no";"yes";"yes";"yes";"yes";"yes";1;3;5;3;5;1;8;"12";"10";11 +"GP";"F";16;"U";"LE3";"T";4;4;"health";"health";"other";"mother";1;3;0;"no";"yes";"no";"yes";"yes";"yes";"yes";"yes";5;4;5;1;1;4;2;"15";"15";14 +"GP";"M";15;"U";"LE3";"A";4;4;"teacher";"teacher";"course";"mother";1;1;0;"no";"no";"no";"yes";"yes";"yes";"yes";"no";5;5;3;1;1;4;4;"13";"14";14 +"GP";"F";16;"R";"GT3";"T";3;3;"services";"other";"reputation";"father";1;3;0;"yes";"yes";"no";"yes";"yes";"yes";"yes";"no";4;1;2;1;1;2;4;"11";"11";11 +"GP";"F";16;"U";"GT3";"T";2;2;"at_home";"other";"home";"mother";1;2;1;"yes";"no";"no";"yes";"yes";"yes";"yes";"no";3;1;2;1;1;5;12;"8";"10";10 +"GP";"M";15;"U";"LE3";"T";4;2;"teacher";"other";"course";"mother";1;1;0;"no";"no";"no";"no";"yes";"yes";"yes";"no";3;5;2;1;1;3;10;"18";"17";18 +"GP";"M";15;"R";"GT3";"T";2;1;"health";"services";"reputation";"mother";1;2;0;"no";"no";"no";"yes";"yes";"yes";"yes";"yes";5;4;2;1;1;5;4;"10";"9";10 +"GP";"M";16;"U";"GT3";"T";4;4;"teacher";"teacher";"course";"father";1;2;0;"no";"yes";"no";"yes";"yes";"yes";"yes";"no";5;4;4;1;2;5;6;"16";"14";14 +"GP";"M";15;"U";"GT3";"T";4;4;"other";"teacher";"reputation";"father";2;2;0;"no";"yes";"no";"yes";"yes";"yes";"no";"no";4;4;3;1;1;2;4;"16";"15";16 +"GP";"M";16;"U";"GT3";"T";3;3;"other";"services";"home";"father";2;1;0;"no";"no";"no";"yes";"yes";"yes";"yes";"no";5;4;2;1;1;5;6;"14";"14";15 +"GP";"M";17;"R";"GT3";"T";1;3;"other";"other";"course";"father";3;2;1;"no";"yes";"no";"yes";"yes";"yes";"yes";"no";5;2;4;1;4;5;14;"12";"11";11 +"GP";"M";15;"U";"GT3";"T";3;4;"other";"other";"reputation";"father";1;1;0;"no";"no";"no";"no";"yes";"yes";"yes";"no";3;4;3;1;2;4;2;"14";"13";14 +"GP";"F";15;"U";"GT3";"T";1;2;"at_home";"services";"course";"mother";1;2;0;"no";"no";"no";"no";"no";"yes";"yes";"no";3;2;3;1;2;1;0;"14";"14";14 
+"GP";"M";15;"U";"GT3";"T";2;2;"services";"services";"home";"father";1;4;0;"no";"yes";"no";"yes";"yes";"yes";"yes";"no";5;5;4;1;2;5;6;"14";"13";13 +"GP";"F";16;"U";"LE3";"T";2;4;"other";"health";"course";"father";2;2;0;"no";"yes";"no";"yes";"yes";"yes";"yes";"yes";4;2;2;1;2;5;2;"14";"12";13 +"GP";"M";16;"U";"GT3";"T";4;4;"health";"other";"course";"mother";1;1;0;"no";"yes";"no";"yes";"yes";"yes";"yes";"no";3;4;4;1;4;5;4;"12";"13";13 +"GP";"F";16;"U";"GT3";"T";2;2;"other";"other";"home";"mother";1;2;0;"no";"no";"no";"no";"yes";"yes";"yes";"yes";5;4;4;1;1;5;0;"12";"11";11 +"GP";"M";15;"U";"GT3";"T";3;4;"services";"services";"home";"father";1;1;0;"yes";"no";"no";"no";"yes";"yes";"yes";"no";5;5;5;3;2;5;2;"9";"9";9 +"GP";"F";15;"U";"LE3";"A";3;4;"other";"other";"home";"mother";1;2;0;"yes";"no";"no";"yes";"yes";"yes";"yes";"yes";5;3;2;1;1;1;0;"10";"11";11 +"GP";"F";19;"U";"GT3";"T";0;1;"at_home";"other";"course";"other";1;2;2;"no";"yes";"no";"no";"no";"no";"no";"no";3;4;2;1;1;5;0;"9";"10";11 +"GP";"M";16;"R";"GT3";"T";4;4;"teacher";"teacher";"course";"mother";1;1;0;"no";"no";"no";"yes";"yes";"yes";"yes";"no";3;5;5;2;5;4;8;"14";"14";15 +"GP";"M";16;"U";"GT3";"T";2;3;"other";"other";"course";"mother";2;3;0;"no";"yes";"no";"no";"no";"yes";"yes";"yes";3;2;3;2;2;1;4;"13";"12";13 +"GP";"F";15;"R";"GT3";"T";3;4;"services";"teacher";"course";"father";2;3;0;"no";"yes";"no";"no";"yes";"yes";"yes";"yes";4;2;2;2;2;5;0;"10";"11";12 +"GP";"F";18;"U";"GT3";"T";2;1;"services";"other";"reputation";"mother";1;2;3;"no";"yes";"no";"yes";"yes";"no";"yes";"yes";5;4;5;1;3;5;10;"10";"9";8 +"GP";"F";17;"U";"LE3";"A";2;1;"other";"other";"course";"mother";3;1;0;"no";"yes";"no";"no";"yes";"yes";"yes";"no";3;2;2;1;2;5;8;"11";"10";11 +"GP";"F";15;"U";"GT3";"T";1;1;"at_home";"other";"course";"mother";3;1;0;"no";"yes";"no";"yes";"no";"yes";"yes";"yes";4;3;3;1;2;4;6;"11";"12";13 
+"GP";"F";17;"U";"LE3";"T";2;2;"other";"other";"course";"father";1;1;0;"no";"yes";"no";"no";"yes";"yes";"yes";"yes";3;4;4;1;3;5;2;"13";"12";12 +"GP";"F";16;"U";"GT3";"A";3;4;"services";"other";"course";"father";1;1;0;"no";"no";"no";"no";"yes";"yes";"yes";"no";3;2;1;1;4;5;12;"15";"13";14 +"GP";"M";16;"U";"GT3";"T";2;1;"at_home";"other";"course";"mother";4;1;0;"no";"no";"no";"no";"yes";"yes";"no";"no";3;2;1;1;1;2;4;"9";"9";11 +"GP";"F";16;"U";"GT3";"A";2;2;"other";"other";"home";"mother";1;1;1;"no";"no";"no";"no";"yes";"yes";"no";"no";5;3;4;1;1;5;12;"13";"11";11 +"GP";"M";15;"R";"GT3";"T";3;4;"at_home";"teacher";"course";"mother";4;2;0;"no";"yes";"no";"no";"yes";"yes";"no";"yes";5;3;3;1;1;5;2;"12";"11";11 +"GP";"F";15;"U";"GT3";"T";4;4;"services";"at_home";"course";"mother";1;3;0;"no";"yes";"no";"yes";"yes";"yes";"yes";"yes";4;3;3;1;1;5;4;"13";"14";15 +"GP";"M";17;"R";"GT3";"T";3;4;"at_home";"other";"course";"mother";3;2;0;"no";"no";"no";"no";"yes";"yes";"no";"no";5;4;5;2;4;5;2;"10";"9";10 +"GP";"F";16;"R";"GT3";"T";1;1;"at_home";"other";"course";"mother";4;2;0;"no";"yes";"no";"no";"yes";"yes";"no";"no";5;1;3;1;1;3;0;"14";"13";13 +"GP";"M";18;"U";"LE3";"T";3;1;"services";"services";"course";"mother";2;1;0;"no";"no";"no";"yes";"yes";"yes";"yes";"yes";3;3;4;4;5;4;2;"11";"11";12 +"GP";"F";18;"U";"GT3";"A";3;2;"other";"services";"course";"other";1;3;0;"no";"yes";"no";"yes";"no";"yes";"yes";"yes";4;3;3;5;1;5;10;"12";"11";11 +"GP";"F";16;"R";"GT3";"T";1;1;"other";"services";"reputation";"mother";2;1;0;"no";"yes";"no";"yes";"yes";"yes";"no";"yes";3;3;3;1;2;1;8;"12";"11";11 +"GP";"F";16;"U";"GT3";"A";3;3;"other";"other";"course";"other";2;1;0;"no";"yes";"no";"yes";"no";"yes";"yes";"yes";4;3;2;1;1;5;4;"9";"9";10 +"GP";"M";16;"U";"LE3";"T";1;1;"services";"other";"course";"mother";1;2;2;"no";"no";"no";"no";"yes";"yes";"no";"yes";4;4;4;1;3;5;0;"10";"10";10 
+"GP";"F";15;"U";"GT3";"T";4;4;"teacher";"teacher";"course";"mother";2;1;0;"no";"no";"no";"yes";"yes";"yes";"yes";"no";4;3;2;1;1;5;6;"13";"14";14 +"GP";"F";15;"R";"GT3";"T";1;1;"other";"other";"course";"mother";3;1;1;"no";"no";"no";"yes";"yes";"yes";"yes";"yes";5;5;5;1;1;1;2;"8";"9";9 +"GP";"M";15;"U";"GT3";"T";4;3;"teacher";"services";"course";"father";2;4;0;"yes";"yes";"no";"no";"yes";"yes";"yes";"no";2;2;2;1;1;3;6;"9";"11";11 +"GP";"F";15;"U";"GT3";"A";3;3;"services";"services";"home";"mother";1;2;0;"no";"no";"no";"no";"no";"yes";"no";"yes";1;3;2;2;3;1;24;"9";"8";9 +"GP";"M";16;"U";"GT3";"T";4;4;"services";"services";"course";"mother";1;3;0;"no";"yes";"no";"yes";"yes";"yes";"yes";"no";5;3;3;1;3;5;0;"15";"13";13 +"GP";"M";16;"U";"LE3";"T";2;2;"services";"services";"reputation";"father";2;1;0;"no";"yes";"no";"yes";"yes";"yes";"yes";"no";2;3;3;2;2;2;4;"12";"11";11 +"GP";"F";15;"U";"GT3";"T";4;4;"teacher";"services";"course";"mother";1;3;0;"no";"yes";"no";"yes";"yes";"yes";"yes";"no";4;2;2;1;1;5;2;"13";"13";13 +"GP";"F";16;"U";"LE3";"T";1;1;"at_home";"at_home";"course";"mother";1;1;0;"no";"no";"no";"no";"yes";"yes";"yes";"no";3;4;4;3;3;1;4;"10";"11";11 +"GP";"M";17;"U";"GT3";"T";2;1;"other";"other";"home";"mother";1;1;0;"no";"yes";"no";"no";"yes";"yes";"yes";"no";5;4;5;1;2;5;22;"9";"7";6 +"GP";"F";15;"U";"GT3";"T";1;1;"other";"services";"course";"father";1;2;0;"no";"yes";"no";"no";"yes";"yes";"yes";"no";4;4;2;1;2;5;0;"12";"12";12 +"GP";"F";15;"U";"LE3";"A";2;1;"at_home";"other";"home";"mother";2;1;0;"no";"yes";"no";"yes";"yes";"no";"yes";"yes";4;4;2;1;1;5;0;"11";"10";10 +"GP";"F";15;"U";"GT3";"T";3;2;"health";"services";"home";"father";1;2;1;"no";"yes";"no";"no";"yes";"yes";"yes";"no";3;3;2;1;1;3;2;"11";"11";11 +"GP";"F";15;"U";"GT3";"T";1;2;"at_home";"other";"course";"mother";1;2;0;"no";"yes";"no";"no";"no";"yes";"yes";"no";4;3;2;1;1;5;6;"13";"12";13 
+"GP";"F";15;"U";"GT3";"T";1;2;"at_home";"services";"course";"father";1;2;0;"no";"no";"no";"no";"no";"yes";"no";"yes";2;3;4;2;4;1;6;"11";"11";11 +"GP";"M";16;"U";"GT3";"T";4;4;"teacher";"teacher";"course";"mother";1;1;0;"no";"yes";"no";"no";"yes";"no";"yes";"yes";3;3;2;2;1;5;16;"9";"9";8 +"GP";"M";15;"U";"LE3";"A";2;1;"services";"other";"course";"mother";4;1;0;"no";"no";"no";"no";"yes";"yes";"yes";"no";4;5;5;2;5;5;0;"12";"11";11 +"GP";"M";18;"U";"LE3";"T";1;1;"other";"other";"course";"mother";1;1;2;"no";"no";"no";"no";"yes";"no";"yes";"yes";2;3;5;2;5;4;0;"11";"9";0 +"GP";"M";16;"U";"LE3";"T";2;1;"at_home";"other";"course";"mother";1;1;1;"no";"no";"no";"yes";"yes";"yes";"no";"yes";4;4;4;3;5;5;6;"9";"10";10 +"GP";"F";15;"R";"GT3";"T";3;3;"services";"services";"reputation";"other";2;3;0;"no";"yes";"yes";"yes";"yes";"yes";"yes";"yes";4;2;1;2;3;3;2;"13";"13";13 +"GP";"M";19;"U";"GT3";"T";3;2;"services";"at_home";"home";"mother";1;1;0;"no";"yes";"no";"no";"yes";"no";"yes";"yes";4;5;4;1;1;4;6;"11";"9";11 +"GP";"F";17;"U";"GT3";"T";4;4;"other";"teacher";"course";"mother";1;1;0;"yes";"yes";"no";"no";"yes";"yes";"no";"yes";4;2;1;1;1;4;0;"13";"13";13 +"GP";"M";15;"R";"GT3";"T";2;3;"at_home";"services";"course";"mother";1;2;0;"yes";"no";"yes";"yes";"yes";"yes";"no";"no";4;4;4;1;1;1;0;"7";"8";8 +"GP";"M";17;"R";"LE3";"T";1;2;"other";"other";"reputation";"mother";1;1;3;"no";"no";"no";"no";"yes";"yes";"no";"no";2;2;2;3;3;5;14;"9";"8";10 +"GP";"F";18;"R";"GT3";"T";1;1;"at_home";"other";"course";"mother";3;1;3;"no";"yes";"no";"yes";"no";"yes";"no";"no";5;2;5;1;5;4;6;"11";"10";11 +"GP";"M";16;"R";"GT3";"T";2;2;"at_home";"other";"course";"mother";3;1;0;"no";"no";"no";"no";"no";"yes";"no";"no";4;2;2;1;2;3;4;"12";"10";11 +"GP";"M";16;"U";"GT3";"T";3;3;"other";"services";"course";"father";1;2;1;"no";"yes";"no";"no";"yes";"yes";"yes";"yes";4;5;5;4;4;5;0;"10";"10";1 
+"GP";"M";16;"U";"LE3";"T";1;2;"health";"services";"course";"mother";2;1;2;"no";"no";"no";"no";"no";"yes";"yes";"no";4;4;5;3;5;5;0;"9";"8";10 +"GP";"M";17;"R";"LE3";"T";2;1;"at_home";"other";"course";"mother";2;1;1;"no";"no";"yes";"yes";"yes";"no";"yes";"yes";3;3;2;2;2;5;8;"8";"8";9 +"GP";"M";17;"R";"GT3";"T";3;2;"other";"other";"course";"mother";2;2;2;"yes";"yes";"no";"no";"yes";"yes";"yes";"yes";4;4;4;1;4;3;4;"7";"6";8 +"GP";"M";15;"U";"LE3";"T";1;2;"other";"other";"course";"mother";2;1;0;"no";"no";"no";"yes";"yes";"yes";"no";"no";4;4;4;2;4;5;2;"8";"9";10 +"GP";"M";16;"U";"GT3";"T";1;3;"at_home";"services";"course";"father";1;1;1;"no";"no";"no";"no";"yes";"no";"yes";"no";5;3;3;1;4;2;2;"9";"8";8 +"GP";"M";17;"R";"LE3";"T";1;1;"other";"services";"course";"mother";4;2;0;"no";"no";"no";"yes";"yes";"no";"no";"yes";5;3;5;1;5;5;0;"8";"8";8 +"GP";"M";17;"U";"GT3";"T";3;2;"services";"services";"course";"mother";2;1;3;"no";"yes";"no";"yes";"no";"no";"no";"no";4;5;2;1;1;2;10;"8";"7";8 +"GP";"M";16;"U";"GT3";"T";2;2;"other";"other";"course";"father";1;2;0;"no";"no";"no";"no";"yes";"no";"yes";"no";4;3;5;2;4;4;0;"9";"10";11 +"GP";"F";16;"U";"GT3";"T";4;2;"health";"services";"home";"father";1;2;0;"no";"no";"no";"no";"yes";"yes";"yes";"yes";4;2;3;1;1;3;0;"17";"17";18 +"GP";"F";16;"U";"GT3";"T";2;2;"other";"other";"home";"mother";1;2;0;"no";"yes";"no";"no";"no";"yes";"yes";"no";5;1;5;1;1;4;0;"12";"12";13 +"GP";"F";16;"U";"GT3";"T";4;4;"health";"health";"reputation";"mother";1;2;0;"no";"yes";"no";"no";"yes";"yes";"yes";"yes";4;4;2;1;1;3;0;"16";"16";17 +"GP";"M";16;"U";"GT3";"T";3;4;"other";"other";"course";"father";3;1;1;"no";"yes";"no";"yes";"no";"yes";"yes";"no";3;4;5;2;4;2;4;"9";"9";10 +"GP";"M";16;"U";"GT3";"T";1;0;"other";"other";"reputation";"mother";2;2;0;"no";"yes";"no";"yes";"yes";"yes";"yes";"yes";4;3;2;1;1;3;0;"16";"17";18 +"GP";"M";17;"U";"LE3";"T";4;4;"teacher";"other";"reputation";"mother";1;2;0;"no";"yes";"no";"yes";"yes";"yes";"yes";"no";4;4;4;1;3;5;0;"11";"9";10 
+"GP";"F";16;"U";"GT3";"T";1;3;"at_home";"services";"home";"mother";1;2;0;"no";"no";"no";"yes";"no";"yes";"yes";"yes";4;3;5;1;1;3;0;"14";"13";13 +"GP";"F";16;"U";"LE3";"T";3;3;"other";"other";"reputation";"mother";2;2;0;"no";"yes";"no";"yes";"yes";"yes";"yes";"no";4;4;5;1;1;4;0;"14";"14";15 +"GP";"M";17;"U";"LE3";"T";4;3;"teacher";"other";"course";"mother";2;2;0;"no";"no";"no";"yes";"yes";"yes";"yes";"no";4;4;4;4;4;4;0;"10";"11";11 +"GP";"F";16;"U";"GT3";"T";2;2;"services";"other";"reputation";"mother";2;2;0;"no";"no";"no";"yes";"no";"yes";"yes";"no";3;4;4;1;4;5;0;"13";"12";14 +"GP";"M";17;"U";"GT3";"T";3;3;"other";"other";"reputation";"father";1;2;0;"no";"no";"no";"yes";"no";"yes";"yes";"no";4;3;4;1;4;4;4;"11";"9";10 +"GP";"M";16;"R";"GT3";"T";4;2;"teacher";"services";"other";"mother";1;1;0;"no";"yes";"no";"yes";"yes";"yes";"yes";"yes";4;3;3;3;4;3;8;"10";"9";11 +"GP";"M";17;"U";"GT3";"T";4;3;"other";"other";"course";"mother";1;2;0;"no";"yes";"yes";"yes";"yes";"yes";"yes";"yes";5;2;3;1;1;2;4;"11";"11";13 +"GP";"M";16;"U";"GT3";"T";4;3;"teacher";"other";"home";"mother";1;2;0;"no";"yes";"yes";"yes";"yes";"yes";"yes";"no";3;4;3;2;3;3;4;"11";"10";11 +"GP";"M";16;"U";"GT3";"T";3;3;"services";"other";"home";"mother";1;2;0;"no";"no";"no";"yes";"yes";"yes";"yes";"yes";4;2;3;1;2;3;0;"11";"12";13 +"GP";"F";17;"U";"GT3";"T";2;4;"services";"services";"reputation";"father";1;2;0;"no";"yes";"no";"yes";"yes";"yes";"no";"no";5;4;2;2;3;5;0;"17";"18";17 +"GP";"F";17;"U";"LE3";"T";3;3;"other";"other";"reputation";"mother";1;2;0;"no";"yes";"no";"yes";"yes";"yes";"yes";"yes";5;3;3;2;3;1;32;"14";"13";14 +"GP";"F";16;"U";"GT3";"T";3;2;"other";"other";"reputation";"mother";1;2;0;"no";"yes";"no";"no";"yes";"yes";"yes";"no";1;2;2;1;2;1;8;"14";"15";16 +"GP";"M";17;"U";"GT3";"T";3;3;"services";"services";"other";"mother";1;2;0;"no";"yes";"no";"yes";"yes";"yes";"yes";"yes";4;3;4;2;3;4;6;"11";"13";14 
+"GP";"M";16;"U";"GT3";"T";1;2;"services";"services";"other";"mother";1;1;0;"no";"yes";"no";"yes";"yes";"yes";"yes";"yes";3;3;3;1;2;3;0;"10";"9";11 +"GP";"M";16;"U";"LE3";"T";2;1;"other";"other";"course";"mother";1;2;0;"no";"no";"no";"yes";"yes";"yes";"yes";"yes";4;2;3;1;2;5;0;"13";"14";16 +"GP";"F";17;"U";"GT3";"A";3;3;"health";"other";"reputation";"mother";1;2;0;"no";"yes";"no";"no";"no";"yes";"yes";"yes";3;3;3;1;3;3;10;"12";"13";14 +"GP";"M";17;"R";"GT3";"T";1;2;"at_home";"other";"home";"mother";1;2;0;"no";"no";"no";"no";"yes";"yes";"no";"no";3;1;3;1;5;3;6;"9";"9";10 +"GP";"F";16;"U";"GT3";"T";2;3;"services";"services";"course";"mother";1;2;0;"no";"no";"no";"no";"yes";"yes";"yes";"no";4;3;3;1;1;2;6;"12";"12";13 +"GP";"F";17;"U";"GT3";"T";1;1;"at_home";"services";"course";"mother";1;2;0;"no";"no";"no";"yes";"yes";"yes";"yes";"no";5;3;3;1;1;3;0;"12";"11";12 +"GP";"M";17;"U";"GT3";"T";1;2;"at_home";"services";"other";"other";2;2;0;"no";"no";"no";"yes";"no";"yes";"yes";"no";4;4;4;4;5;5;16;"10";"11";12 +"GP";"M";16;"R";"GT3";"T";3;3;"services";"services";"reputation";"mother";1;1;0;"no";"yes";"no";"yes";"yes";"yes";"yes";"no";4;3;2;3;4;5;0;"11";"10";10 +"GP";"M";16;"U";"GT3";"T";2;3;"other";"other";"home";"father";2;1;0;"no";"no";"no";"no";"yes";"yes";"yes";"no";5;3;3;1;1;3;0;"13";"12";12 +"GP";"F";17;"U";"LE3";"T";2;4;"services";"services";"course";"father";1;2;0;"no";"no";"no";"yes";"yes";"yes";"yes";"yes";4;3;2;1;1;5;8;"14";"15";16 +"GP";"M";17;"U";"GT3";"T";4;4;"services";"teacher";"home";"mother";1;1;0;"no";"no";"no";"no";"yes";"yes";"yes";"no";5;2;3;1;2;5;4;"13";"13";14 +"GP";"M";16;"R";"LE3";"T";3;3;"teacher";"other";"home";"father";3;1;0;"no";"yes";"no";"yes";"yes";"yes";"yes";"no";3;3;4;3;5;3;16;"10";"11";12 +"GP";"F";17;"U";"GT3";"T";4;4;"services";"teacher";"home";"mother";2;1;1;"no";"yes";"no";"no";"yes";"yes";"yes";"no";4;2;4;2;3;2;30;"14";"15";16 
+"GP";"F";16;"U";"LE3";"T";4;4;"teacher";"teacher";"reputation";"mother";1;2;0;"no";"yes";"no";"no";"yes";"yes";"yes";"no";4;5;2;1;2;3;0;"11";"10";11 +"GP";"F";16;"U";"GT3";"T";4;3;"health";"other";"home";"mother";1;2;0;"no";"yes";"no";"yes";"yes";"yes";"yes";"no";4;3;5;1;5;2;2;"14";"14";15 +"GP";"F";16;"U";"GT3";"T";2;3;"other";"other";"reputation";"mother";1;2;0;"yes";"yes";"no";"yes";"yes";"yes";"no";"no";4;4;3;1;3;4;4;"11";"12";12 +"GP";"F";17;"U";"GT3";"T";1;1;"other";"other";"course";"mother";1;2;0;"no";"yes";"no";"no";"no";"yes";"no";"no";4;4;4;1;3;1;0;"14";"15";15 +"GP";"F";17;"R";"GT3";"T";2;2;"other";"other";"reputation";"mother";1;1;0;"no";"yes";"no";"no";"yes";"yes";"yes";"no";5;3;2;1;2;3;21;"13";"13";13 +"GP";"F";16;"R";"GT3";"T";2;2;"services";"services";"reputation";"mother";2;4;0;"no";"yes";"no";"yes";"no";"yes";"yes";"no";5;3;5;1;1;5;6;"13";"13";13 +"GP";"F";17;"U";"GT3";"T";3;4;"at_home";"services";"home";"mother";1;3;1;"no";"yes";"yes";"no";"yes";"yes";"yes";"yes";4;4;3;3;4;5;14;"8";"9";8 +"GP";"F";16;"U";"GT3";"A";3;1;"services";"other";"course";"mother";1;2;0;"no";"yes";"no";"no";"yes";"yes";"yes";"no";2;3;3;2;2;4;2;"11";"11";12 +"GP";"F";16;"U";"GT3";"T";4;3;"teacher";"other";"other";"mother";1;2;0;"no";"no";"no";"yes";"yes";"yes";"yes";"yes";1;3;2;1;1;1;4;"14";"15";15 +"GP";"F";16;"U";"GT3";"T";1;1;"at_home";"other";"home";"mother";2;1;0;"no";"yes";"no";"no";"yes";"yes";"no";"no";4;3;2;1;4;5;2;"12";"13";13 +"GP";"F";17;"R";"GT3";"T";4;3;"teacher";"other";"reputation";"mother";2;3;0;"no";"yes";"no";"yes";"yes";"yes";"yes";"yes";4;4;2;1;1;4;0;"11";"12";12 +"GP";"F";19;"U";"GT3";"T";3;3;"other";"other";"reputation";"other";1;4;0;"no";"yes";"no";"yes";"yes";"yes";"yes";"no";4;3;3;1;2;3;4;"12";"12";12 +"GP";"M";17;"U";"LE3";"T";4;4;"services";"other";"home";"mother";1;2;0;"no";"yes";"no";"no";"yes";"yes";"yes";"yes";5;3;5;4;5;3;15;"13";"12";12 
+"GP";"F";16;"U";"GT3";"A";2;2;"other";"other";"reputation";"mother";1;2;0;"yes";"yes";"no";"no";"yes";"yes";"yes";"no";3;3;4;1;1;4;0;"13";"13";13 +"GP";"M";18;"U";"GT3";"T";2;2;"services";"other";"home";"mother";1;2;0;"no";"yes";"no";"yes";"yes";"yes";"yes";"no";4;4;4;2;4;5;10;"12";"11";11 +"GP";"F";17;"R";"LE3";"T";4;4;"services";"other";"other";"mother";1;1;0;"no";"yes";"no";"no";"yes";"yes";"no";"no";5;2;1;1;2;3;6;"12";"11";11 +"GP";"F";17;"U";"LE3";"T";3;2;"other";"other";"reputation";"mother";2;2;0;"no";"no";"no";"no";"yes";"yes";"yes";"no";4;4;4;1;3;1;2;"14";"16";15 +"GP";"F";17;"U";"GT3";"T";4;3;"other";"other";"reputation";"mother";1;2;0;"no";"no";"no";"no";"yes";"yes";"yes";"yes";3;4;5;2;4;1;16;"11";"9";10 +"GP";"M";18;"U";"LE3";"T";3;3;"services";"health";"home";"father";1;2;0;"no";"yes";"no";"no";"yes";"yes";"yes";"no";3;2;4;2;4;4;10;"10";"10";10 +"GP";"F";17;"U";"GT3";"T";2;3;"at_home";"other";"home";"father";2;1;0;"no";"yes";"no";"no";"yes";"yes";"no";"no";3;3;3;1;4;3;4;"12";"13";13 +"GP";"F";17;"U";"GT3";"T";2;2;"at_home";"at_home";"course";"mother";1;3;0;"no";"yes";"no";"yes";"yes";"yes";"yes";"no";4;3;3;1;1;4;0;"12";"12";13 +"GP";"F";17;"R";"GT3";"T";2;1;"at_home";"services";"reputation";"mother";2;2;0;"no";"yes";"no";"yes";"yes";"yes";"yes";"no";4;2;5;1;2;5;0;"11";"10";11 +"GP";"F";17;"U";"GT3";"T";1;1;"at_home";"other";"reputation";"mother";1;3;0;"no";"yes";"no";"yes";"yes";"yes";"no";"yes";4;3;4;1;1;5;12;"12";"12";12 +"GP";"F";16;"U";"GT3";"T";2;3;"services";"teacher";"other";"mother";1;2;0;"yes";"no";"no";"no";"yes";"yes";"yes";"no";2;3;1;1;1;3;0;"13";"13";14 +"GP";"M";18;"U";"GT3";"T";2;2;"other";"other";"home";"mother";2;2;3;"no";"yes";"yes";"no";"yes";"yes";"yes";"no";3;3;3;5;5;4;9;"10";"9";10 +"GP";"F";16;"U";"GT3";"T";4;4;"teacher";"services";"home";"mother";1;3;0;"no";"yes";"no";"yes";"no";"yes";"yes";"no";5;3;2;1;1;5;4;"15";"16";16 
+"GP";"F";18;"R";"GT3";"T";3;1;"other";"other";"reputation";"mother";1;2;0;"no";"no";"no";"yes";"yes";"yes";"yes";"yes";5;3;3;1;1;4;4;"8";"8";8 +"GP";"F";17;"U";"GT3";"T";3;2;"other";"other";"course";"mother";1;2;0;"no";"no";"no";"yes";"no";"yes";"yes";"no";5;3;4;1;3;3;2;"17";"18";17 +"GP";"M";17;"U";"LE3";"T";2;3;"services";"services";"reputation";"father";1;2;0;"no";"yes";"no";"no";"no";"yes";"yes";"no";5;3;3;1;3;3;0;"10";"11";11 +"GP";"M";18;"U";"LE3";"T";2;1;"at_home";"other";"course";"mother";4;2;0;"yes";"yes";"no";"yes";"yes";"yes";"yes";"yes";4;3;2;4;5;3;2;"9";"10";11 +"GP";"F";17;"U";"GT3";"A";2;1;"other";"other";"course";"mother";2;3;0;"no";"no";"no";"yes";"yes";"yes";"yes";"yes";3;2;3;1;2;3;0;"15";"15";16 +"GP";"F";17;"U";"LE3";"T";4;3;"health";"other";"reputation";"father";1;2;0;"no";"no";"no";"yes";"yes";"yes";"yes";"yes";3;2;3;1;2;3;0;"14";"12";12 +"GP";"M";17;"R";"GT3";"T";2;2;"other";"other";"course";"father";2;2;0;"no";"yes";"no";"yes";"yes";"yes";"yes";"no";4;5;2;1;1;1;0;"12";"13";13 +"GP";"M";17;"U";"GT3";"T";4;4;"teacher";"teacher";"reputation";"mother";1;2;0;"yes";"yes";"no";"yes";"yes";"yes";"yes";"yes";4;5;5;1;3;2;0;"13";"13";13 +"GP";"M";16;"U";"GT3";"T";4;4;"health";"other";"reputation";"father";1;2;0;"no";"yes";"no";"yes";"yes";"yes";"yes";"no";4;2;4;2;4;1;0;"13";"13";14 +"GP";"M";16;"U";"LE3";"T";1;1;"other";"other";"home";"mother";2;2;0;"no";"yes";"no";"no";"yes";"yes";"yes";"no";3;4;2;1;1;5;2;"9";"9";9 +"GP";"M";16;"U";"GT3";"T";3;2;"at_home";"other";"reputation";"mother";2;3;0;"no";"no";"no";"yes";"yes";"yes";"yes";"yes";5;3;3;1;3;2;0;"12";"12";12 +"GP";"M";17;"U";"LE3";"T";2;2;"other";"other";"home";"father";1;2;0;"no";"no";"no";"yes";"no";"yes";"yes";"yes";4;4;2;5;5;4;0;"16";"16";16 +"GP";"F";16;"U";"GT3";"T";2;1;"other";"other";"home";"mother";1;1;0;"no";"no";"no";"no";"yes";"yes";"yes";"yes";4;5;2;1;1;5;4;"9";"10";10 
+"GP";"F";16;"U";"GT3";"A";4;1;"other";"other";"home";"mother";1;2;0;"no";"no";"no";"yes";"yes";"yes";"yes";"yes";5;3;3;1;2;5;0;"14";"13";13 +"GP";"F";18;"U";"LE3";"A";2;4;"services";"other";"course";"mother";2;2;1;"no";"yes";"no";"no";"yes";"yes";"yes";"no";4;3;3;1;1;3;18;"10";"10";10 +"GP";"F";18;"U";"LE3";"T";2;2;"at_home";"services";"course";"mother";1;2;1;"no";"yes";"no";"no";"no";"yes";"yes";"yes";5;3;1;1;1;5;16;"9";"8";10 +"GP";"F";18;"U";"GT3";"T";3;3;"other";"other";"course";"mother";2;1;1;"no";"no";"no";"no";"yes";"no";"yes";"no";4;1;1;1;1;3;14;"8";"7";7 +"GP";"M";18;"U";"GT3";"T";2;2;"other";"at_home";"course";"other";1;1;1;"no";"yes";"no";"yes";"no";"no";"yes";"yes";4;4;3;2;2;1;26;"7";"8";8 +"GP";"M";17;"U";"GT3";"T";4;4;"teacher";"teacher";"course";"mother";1;1;0;"no";"yes";"no";"yes";"no";"yes";"yes";"no";4;2;1;1;2;5;6;"10";"8";9 +"GP";"F";17;"U";"GT3";"T";3;2;"other";"other";"course";"father";1;2;0;"no";"no";"no";"yes";"yes";"yes";"no";"no";5;4;2;1;1;3;4;"14";"14";15 +"GP";"F";17;"U";"LE3";"T";1;1;"at_home";"at_home";"course";"mother";1;3;0;"no";"yes";"no";"yes";"yes";"yes";"yes";"yes";4;3;2;1;2;4;10;"11";"10";10 +"GP";"F";16;"U";"GT3";"T";1;2;"other";"other";"course";"mother";1;1;0;"no";"no";"no";"no";"yes";"no";"yes";"no";5;3;5;1;2;5;4;"12";"11";11 +"GP";"F";17;"R";"GT3";"T";2;1;"at_home";"services";"course";"mother";3;2;0;"no";"no";"no";"yes";"yes";"yes";"no";"no";2;1;1;1;1;3;2;"13";"13";13 +"GP";"F";17;"R";"LE3";"A";1;4;"other";"other";"course";"other";4;1;1;"no";"yes";"no";"no";"yes";"yes";"yes";"no";5;5;4;1;1;5;14;"9";"9";8 +"GP";"M";18;"U";"GT3";"T";2;2;"other";"services";"reputation";"father";1;2;0;"no";"no";"no";"no";"yes";"no";"yes";"no";5;5;4;3;5;2;16;"8";"7";8 +"GP";"F";17;"U";"LE3";"A";2;2;"other";"other";"home";"mother";1;1;1;"no";"yes";"no";"no";"no";"no";"yes";"no";3;1;2;1;1;1;8;"11";"9";10 +"GP";"F";17;"R";"LE3";"T";1;1;"at_home";"other";"course";"mother";2;3;0;"no";"no";"no";"yes";"yes";"yes";"yes";"no";4;3;3;1;3;5;4;"15";"14";15 
+"GP";"F";17;"U";"LE3";"A";4;2;"teacher";"other";"course";"mother";1;2;0;"no";"yes";"no";"yes";"yes";"yes";"no";"no";4;3;2;1;1;4;4;"15";"14";14 +"GP";"M";17;"U";"LE3";"T";4;3;"health";"other";"course";"mother";2;2;0;"no";"no";"no";"yes";"yes";"yes";"yes";"yes";2;5;5;1;4;5;8;"15";"15";15 +"GP";"M";17;"R";"LE3";"A";4;4;"teacher";"other";"course";"mother";2;2;0;"no";"yes";"no";"no";"yes";"yes";"yes";"no";3;3;3;2;3;4;0;"12";"12";12 +"GP";"M";16;"U";"LE3";"T";4;3;"teacher";"other";"course";"mother";1;1;0;"no";"no";"no";"yes";"no";"yes";"yes";"no";5;4;5;1;1;3;7;"14";"14";15 +"GP";"M";16;"U";"GT3";"T";4;4;"services";"services";"course";"mother";1;1;0;"no";"no";"yes";"yes";"yes";"yes";"yes";"no";5;3;2;1;2;5;4;"14";"15";15 +"GP";"F";17;"U";"GT3";"T";4;4;"teacher";"services";"course";"mother";1;2;0;"no";"yes";"yes";"no";"yes";"yes";"yes";"yes";5;3;1;1;4;5;2;"11";"11";12 +"GP";"M";17;"R";"GT3";"T";1;1;"other";"other";"home";"father";2;3;0;"no";"no";"no";"no";"no";"yes";"yes";"yes";4;3;3;1;1;1;2;"13";"14";15 +"GP";"F";17;"U";"GT3";"T";3;3;"services";"other";"home";"mother";2;3;0;"no";"yes";"no";"no";"yes";"yes";"yes";"yes";4;2;2;2;3;5;10;"11";"11";11 +"GP";"F";17;"U";"GT3";"T";1;1;"at_home";"other";"course";"mother";1;2;0;"yes";"no";"no";"no";"no";"yes";"no";"yes";4;3;2;1;1;4;10;"10";"9";10 +"GP";"F";18;"U";"GT3";"T";2;1;"other";"other";"course";"other";2;3;0;"no";"yes";"no";"no";"no";"yes";"yes";"yes";4;4;4;1;1;3;10;"12";"10";11 +"GP";"M";16;"U";"GT3";"T";2;1;"other";"other";"course";"mother";3;1;0;"no";"no";"no";"no";"yes";"yes";"yes";"no";4;3;3;1;1;4;7;"15";"16";16 +"GP";"F";17;"U";"GT3";"T";1;1;"other";"services";"course";"father";1;2;0;"no";"yes";"no";"no";"yes";"yes";"no";"no";4;3;4;1;2;5;4;"11";"10";11 +"GP";"M";17;"U";"GT3";"T";2;3;"other";"other";"course";"father";2;1;0;"no";"no";"no";"no";"yes";"yes";"yes";"no";5;2;2;1;1;2;2;"9";"12";13 
+"GP";"M";22;"U";"GT3";"T";3;1;"services";"services";"other";"mother";1;1;3;"no";"no";"no";"no";"no";"no";"yes";"yes";5;4;5;5;5;1;12;"7";"8";5 +"GP";"M";18;"R";"LE3";"T";3;3;"other";"services";"course";"mother";1;2;0;"no";"yes";"no";"no";"yes";"yes";"yes";"yes";4;3;3;1;3;5;8;"10";"9";10 +"GP";"M";16;"U";"GT3";"T";0;2;"other";"other";"other";"mother";1;1;0;"no";"no";"no";"no";"no";"yes";"yes";"no";4;3;2;2;4;5;0;"11";"12";11 +"GP";"M";18;"U";"GT3";"T";3;2;"services";"other";"course";"mother";2;1;0;"no";"no";"no";"no";"yes";"no";"yes";"no";4;4;5;2;4;5;8;"7";"8";7 +"GP";"M";16;"U";"GT3";"T";3;3;"at_home";"other";"reputation";"other";3;2;1;"yes";"yes";"no";"no";"no";"yes";"yes";"no";5;3;3;1;3;2;4;"9";"11";10 +"GP";"M";18;"U";"GT3";"T";2;1;"services";"services";"other";"mother";1;1;2;"no";"no";"no";"no";"no";"no";"yes";"no";3;2;5;2;5;5;4;"7";"8";6 +"GP";"M";16;"R";"GT3";"T";2;1;"other";"other";"course";"mother";2;1;0;"no";"no";"no";"yes";"no";"yes";"no";"no";3;3;2;1;3;3;2;"14";"13";12 +"GP";"M";17;"R";"GT3";"T";2;1;"other";"other";"course";"mother";1;1;0;"no";"no";"no";"no";"no";"yes";"yes";"no";4;4;2;2;4;5;0;"12";"12";13 +"GP";"M";17;"U";"LE3";"T";1;1;"health";"other";"course";"mother";2;1;1;"no";"yes";"no";"yes";"yes";"yes";"yes";"no";4;4;4;1;2;5;0;"9";"10";10 +"GP";"F";18;"U";"LE3";"A";2;1;"other";"other";"course";"mother";1;2;0;"no";"yes";"no";"no";"no";"yes";"yes";"yes";4;3;4;1;3;5;2;"12";"12";13 +"GP";"F";17;"U";"LE3";"T";4;2;"teacher";"services";"reputation";"mother";1;4;0;"no";"yes";"no";"yes";"yes";"yes";"yes";"no";4;2;3;1;1;4;2;"14";"15";17 +"GP";"F";19;"U";"GT3";"T";2;2;"services";"services";"home";"mother";1;2;0;"no";"yes";"no";"no";"yes";"yes";"yes";"no";4;3;3;1;1;5;0;"10";"10";11 +"GP";"M";18;"U";"LE3";"T";2;1;"services";"other";"course";"mother";3;2;1;"no";"no";"no";"yes";"no";"no";"yes";"no";4;4;5;4;4;5;4;"11";"10";11 
+"GP";"F";17;"R";"GT3";"T";4;2;"other";"other";"course";"mother";1;3;0;"no";"yes";"no";"yes";"yes";"yes";"yes";"no";4;3;4;1;3;5;2;"11";"12";14 +"GP";"F";18;"U";"LE3";"T";1;1;"other";"at_home";"home";"mother";1;3;0;"no";"yes";"no";"no";"no";"yes";"no";"no";4;4;3;2;3;3;4;"11";"12";14 +"GP";"F";18;"R";"GT3";"T";2;2;"other";"other";"home";"mother";1;2;0;"yes";"no";"no";"no";"yes";"yes";"no";"no";3;2;3;1;1;5;4;"11";"11";13 +"GP";"M";19;"U";"LE3";"A";4;3;"services";"at_home";"reputation";"mother";1;2;0;"no";"yes";"no";"no";"yes";"yes";"yes";"no";4;3;1;1;1;1;4;"11";"13";14 +"GP";"M";18;"U";"GT3";"T";2;1;"other";"other";"home";"mother";1;2;0;"no";"no";"no";"yes";"yes";"yes";"yes";"no";5;2;4;1;2;4;2;"16";"16";16 +"GP";"M";17;"R";"GT3";"T";2;2;"other";"services";"other";"mother";2;1;0;"no";"no";"no";"no";"no";"no";"no";"no";5;2;2;1;1;4;0;"9";"10";10 +"GP";"F";17;"U";"LE3";"T";2;2;"services";"services";"course";"father";1;4;0;"no";"no";"no";"yes";"yes";"yes";"yes";"yes";3;4;1;1;1;2;2;"10";"11";12 +"GP";"F";20;"R";"GT3";"T";2;1;"other";"other";"course";"other";2;2;0;"no";"yes";"yes";"yes";"yes";"no";"yes";"yes";1;2;3;1;2;2;8;"10";"12";12 +"GP";"F";18;"U";"GT3";"T";4;3;"services";"other";"home";"father";1;2;0;"no";"yes";"no";"no";"yes";"yes";"yes";"yes";3;1;2;1;3;2;2;"15";"15";15 +"GP";"M";18;"U";"GT3";"T";4;3;"teacher";"other";"course";"mother";1;2;0;"no";"yes";"no";"no";"no";"yes";"yes";"no";4;3;2;1;1;3;2;"10";"10";11 +"GP";"M";18;"R";"GT3";"T";3;2;"other";"other";"course";"mother";1;3;0;"no";"no";"no";"yes";"no";"yes";"no";"no";5;3;2;1;1;3;2;"10";"11";12 +"GP";"F";17;"U";"GT3";"T";3;3;"other";"other";"home";"mother";1;3;0;"no";"no";"no";"yes";"no";"yes";"no";"no";3;2;3;1;1;4;2;"15";"12";13 +"GP";"F";18;"U";"GT3";"T";2;2;"at_home";"services";"home";"mother";1;3;0;"no";"yes";"no";"yes";"yes";"yes";"yes";"yes";4;3;3;1;1;3;0;"11";"12";13 +"GP";"M";17;"U";"GT3";"T";2;2;"other";"other";"home";"father";2;1;0;"no";"no";"no";"no";"yes";"no";"yes";"no";4;4;4;2;3;4;8;"8";"8";9 
+"GP";"M";18;"R";"LE3";"A";3;4;"other";"other";"reputation";"mother";2;2;0;"no";"yes";"no";"yes";"yes";"yes";"yes";"no";4;2;5;3;4;1;6;"15";"16";16 +"GP";"M";17;"U";"GT3";"T";3;1;"services";"other";"other";"mother";1;2;0;"no";"no";"no";"yes";"yes";"yes";"yes";"yes";5;4;4;3;4;5;0;"11";"11";14 +"GP";"F";18;"R";"GT3";"T";4;4;"teacher";"other";"reputation";"mother";2;2;0;"no";"no";"no";"yes";"yes";"yes";"yes";"no";4;3;4;2;2;4;8;"10";"11";12 +"GP";"M";18;"U";"GT3";"T";4;2;"health";"other";"reputation";"father";1;2;0;"no";"yes";"no";"yes";"yes";"yes";"yes";"yes";5;4;5;1;3;5;4;"10";"12";14 +"GP";"F";18;"R";"GT3";"T";2;1;"other";"other";"reputation";"mother";2;2;0;"no";"yes";"no";"no";"yes";"no";"yes";"yes";4;3;5;1;2;3;12;"8";"9";10 +"GP";"F";19;"U";"GT3";"T";3;3;"other";"services";"home";"other";1;2;0;"no";"yes";"no";"yes";"yes";"yes";"yes";"no";4;3;5;3;3;5;16;"11";"12";12 +"GP";"F";18;"U";"GT3";"T";2;3;"other";"services";"reputation";"father";1;4;0;"no";"yes";"no";"yes";"yes";"yes";"yes";"yes";4;5;5;1;3;2;10;"16";"16";16 +"GP";"F";18;"U";"LE3";"T";1;1;"other";"other";"home";"mother";2;2;0;"no";"yes";"no";"no";"no";"yes";"no";"no";4;4;3;1;1;3;2;"13";"13";13 +"GP";"M";17;"R";"GT3";"T";1;2;"at_home";"at_home";"home";"mother";1;2;0;"no";"yes";"no";"yes";"no";"yes";"no";"yes";3;5;2;2;2;1;2;"16";"17";18 +"GP";"F";18;"U";"GT3";"T";2;1;"other";"other";"home";"mother";1;2;0;"no";"yes";"no";"no";"yes";"yes";"yes";"yes";4;2;5;1;2;1;8;"14";"14";15 +"GP";"F";17;"U";"GT3";"T";2;4;"at_home";"health";"reputation";"mother";2;2;0;"no";"yes";"no";"no";"yes";"yes";"yes";"yes";4;3;3;1;1;1;6;"15";"16";16 +"GP";"F";17;"U";"LE3";"T";2;2;"services";"other";"course";"mother";2;2;0;"yes";"yes";"no";"no";"yes";"yes";"yes";"yes";4;4;4;2;3;5;6;"12";"12";12 +"GP";"F";18;"R";"GT3";"A";3;2;"other";"services";"home";"mother";2;2;0;"no";"no";"no";"no";"no";"no";"yes";"yes";4;1;1;1;1;5;15;"12";"9";10 
+"GP";"M";18;"U";"GT3";"T";4;4;"teacher";"services";"home";"mother";2;1;0;"no";"no";"no";"yes";"yes";"yes";"yes";"no";3;2;4;1;4;3;6;"11";"12";12 +"GP";"F";18;"U";"GT3";"T";4;4;"health";"health";"reputation";"father";1;2;1;"yes";"yes";"no";"yes";"yes";"yes";"yes";"yes";2;4;4;1;1;4;2;"14";"12";13 +"GP";"F";17;"U";"GT3";"T";2;2;"other";"services";"reputation";"father";3;3;0;"no";"yes";"no";"no";"yes";"yes";"yes";"yes";4;2;3;1;1;1;8;"13";"15";15 +"GP";"F";19;"R";"GT3";"T";3;2;"services";"services";"reputation";"father";1;2;1;"yes";"yes";"no";"no";"yes";"no";"yes";"no";3;3;3;4;3;3;0;"9";"8";10 +"GP";"M";18;"U";"LE3";"T";4;3;"teacher";"services";"course";"mother";2;1;0;"no";"no";"no";"yes";"yes";"yes";"yes";"no";4;2;3;1;2;1;0;"10";"10";10 +"GP";"M";18;"U";"GT3";"T";1;2;"at_home";"other";"home";"other";2;1;0;"no";"no";"no";"no";"no";"no";"yes";"no";3;4;4;2;4;4;10;"10";"10";11 +"GP";"M";17;"U";"LE3";"A";4;1;"services";"other";"home";"mother";2;1;0;"no";"no";"no";"yes";"yes";"yes";"yes";"yes";4;5;4;2;4;5;22;"11";"11";10 +"GP";"M";17;"U";"LE3";"A";3;2;"teacher";"services";"home";"mother";1;1;0;"no";"no";"no";"no";"yes";"yes";"yes";"no";4;4;4;3;4;3;18;"13";"13";13 +"GP";"F";18;"R";"LE3";"T";1;1;"at_home";"other";"reputation";"mother";2;4;0;"no";"yes";"no";"yes";"yes";"yes";"no";"no";5;2;2;1;1;3;2;"17";"17";18 +"GP";"F";18;"U";"GT3";"T";1;1;"other";"other";"home";"mother";2;2;0;"yes";"no";"no";"yes";"yes";"yes";"yes";"no";5;4;4;1;1;4;0;"12";"13";13 +"GP";"F";17;"U";"GT3";"T";2;2;"other";"other";"course";"mother";1;2;0;"no";"yes";"no";"no";"no";"yes";"yes";"no";5;4;5;1;2;5;12;"12";"12";14 +"GP";"F";18;"U";"GT3";"T";2;1;"other";"other";"reputation";"mother";2;2;0;"no";"no";"no";"yes";"yes";"yes";"yes";"yes";4;3;1;1;1;5;10;"12";"13";14 +"GP";"M";17;"U";"GT3";"T";1;1;"other";"other";"reputation";"father";1;2;0;"no";"no";"no";"no";"no";"yes";"yes";"no";4;3;3;1;2;4;0;"12";"12";12 
+"GP";"F";18;"U";"GT3";"T";2;2;"at_home";"at_home";"other";"mother";1;3;0;"no";"yes";"no";"no";"yes";"yes";"yes";"no";4;3;3;1;2;2;0;"18";"18";18 +"GP";"F";17;"U";"GT3";"T";1;1;"services";"teacher";"reputation";"mother";1;3;0;"no";"yes";"no";"no";"yes";"yes";"yes";"no";4;3;3;1;1;3;0;"13";"13";14 +"GP";"M";18;"U";"GT3";"T";2;1;"services";"services";"reputation";"mother";1;3;0;"no";"no";"no";"yes";"yes";"yes";"yes";"no";4;2;4;1;3;2;0;"14";"15";15 +"GP";"M";18;"U";"LE3";"A";4;4;"teacher";"teacher";"reputation";"mother";1;2;0;"no";"yes";"no";"yes";"yes";"yes";"yes";"no";5;4;3;1;1;2;0;"17";"17";17 +"GP";"M";18;"U";"GT3";"T";4;2;"teacher";"other";"home";"mother";1;2;0;"no";"yes";"no";"yes";"yes";"yes";"yes";"yes";4;3;2;1;4;5;2;"15";"16";16 +"GP";"F";17;"U";"GT3";"T";4;3;"health";"services";"reputation";"mother";1;3;0;"no";"yes";"no";"no";"yes";"yes";"yes";"no";4;2;2;1;2;3;0;"17";"18";18 +"GP";"F";17;"R";"LE3";"T";3;1;"services";"other";"reputation";"mother";2;4;0;"no";"yes";"no";"no";"yes";"yes";"no";"no";3;1;2;1;1;3;0;"18";"19";19 +"GP";"M";18;"R";"LE3";"T";3;2;"services";"other";"reputation";"mother";2;3;0;"no";"yes";"no";"yes";"yes";"yes";"yes";"no";5;4;2;1;1;4;0;"14";"15";15 +"GP";"M";17;"U";"GT3";"T";3;3;"health";"other";"home";"mother";1;1;0;"no";"yes";"no";"no";"yes";"yes";"yes";"no";4;4;3;1;3;5;0;"14";"15";15 +"GP";"F";19;"U";"GT3";"T";4;4;"health";"other";"reputation";"other";2;2;0;"no";"yes";"no";"yes";"yes";"yes";"yes";"no";2;3;4;2;3;2;2;"14";"13";13 +"GP";"F";18;"U";"LE3";"T";4;3;"other";"other";"home";"other";2;2;0;"no";"yes";"no";"no";"yes";"yes";"yes";"yes";4;4;5;1;2;2;0;"13";"14";14 +"GP";"F";18;"U";"GT3";"T";4;3;"other";"other";"reputation";"father";1;4;0;"no";"yes";"no";"no";"yes";"yes";"yes";"no";4;3;3;1;1;3;0;"16";"17";17 +"GP";"M";18;"U";"LE3";"T";4;4;"teacher";"teacher";"home";"mother";1;1;0;"no";"yes";"no";"no";"yes";"yes";"yes";"yes";1;4;2;2;2;1;0;"18";"18";17 
+"GP";"F";18;"U";"LE3";"A";4;4;"health";"other";"home";"mother";1;2;0;"no";"yes";"yes";"no";"yes";"yes";"yes";"yes";4;2;4;1;1;4;0;"14";"15";15 +"GP";"M";17;"U";"LE3";"T";4;4;"other";"teacher";"home";"father";2;1;0;"no";"no";"no";"no";"yes";"yes";"yes";"no";4;1;1;2;2;5;0;"12";"13";13 +"GP";"F";17;"R";"GT3";"T";4;4;"services";"services";"reputation";"mother";2;3;0;"no";"yes";"no";"yes";"yes";"yes";"yes";"no";5;3;4;1;1;5;0;"7";"7";8 +"GP";"F";17;"U";"GT3";"T";4;2;"other";"other";"reputation";"mother";2;3;0;"no";"yes";"no";"no";"yes";"yes";"yes";"no";4;3;3;1;1;3;0;"16";"16";16 +"GP";"F";17;"U";"GT3";"T";3;2;"health";"health";"reputation";"father";1;4;0;"no";"yes";"no";"yes";"no";"yes";"yes";"no";5;2;2;1;2;5;0;"18";"18";18 +"GP";"M";19;"R";"LE3";"T";2;1;"at_home";"services";"course";"mother";2;3;1;"no";"no";"no";"yes";"yes";"yes";"yes";"yes";4;3;1;1;1;5;0;"9";"10";11 +"GP";"M";20;"U";"GT3";"A";3;2;"services";"other";"course";"other";1;1;2;"no";"no";"no";"yes";"yes";"yes";"no";"no";5;5;3;1;1;5;0;"14";"15";15 +"GP";"M";19;"R";"GT3";"T";3;3;"other";"services";"reputation";"father";1;2;0;"no";"no";"no";"yes";"yes";"yes";"no";"yes";4;5;3;1;2;5;0;"10";"10";11 +"GP";"F";18;"U";"GT3";"T";1;4;"other";"teacher";"home";"mother";1;2;0;"yes";"yes";"no";"no";"no";"yes";"no";"yes";3;4;4;1;2;5;2;"10";"10";11 +"GP";"F";18;"U";"GT3";"T";2;1;"services";"other";"course";"mother";2;2;0;"no";"yes";"no";"yes";"yes";"yes";"yes";"no";5;3;3;1;2;1;2;"12";"12";15 +"GP";"F";17;"U";"GT3";"T";2;3;"other";"other";"course";"father";2;2;0;"no";"no";"no";"yes";"yes";"yes";"yes";"yes";4;2;1;1;1;3;2;"11";"12";14 +"GP";"F";17;"R";"GT3";"T";4;4;"teacher";"teacher";"course";"mother";1;1;0;"no";"no";"no";"yes";"yes";"yes";"yes";"no";4;4;4;1;1;5;2;"15";"16";17 +"GP";"F";18;"U";"GT3";"T";4;3;"other";"other";"course";"mother";1;3;0;"no";"yes";"no";"yes";"yes";"yes";"yes";"yes";4;3;4;1;1;5;2;"14";"15";17 
+"GP";"F";18;"U";"LE3";"T";4;3;"health";"services";"course";"mother";2;1;0;"no";"yes";"no";"no";"yes";"yes";"yes";"no";3;2;4;1;4;1;8;"12";"12";15 +"GP";"F";17;"R";"GT3";"T";3;4;"at_home";"services";"course";"father";1;3;0;"no";"yes";"no";"yes";"no";"yes";"yes";"no";4;3;4;2;5;5;2;"15";"15";17 +"GP";"F";18;"U";"GT3";"T";3;3;"at_home";"other";"course";"father";1;2;0;"no";"yes";"no";"no";"yes";"yes";"yes";"no";4;1;4;1;1;3;8;"11";"12";14 +"GP";"M";19;"U";"GT3";"T";4;2;"health";"other";"course";"mother";2;2;0;"no";"yes";"no";"yes";"yes";"yes";"yes";"yes";5;4;4;1;1;1;9;"11";"10";10 +"GP";"F";18;"U";"GT3";"T";4;4;"teacher";"other";"course";"mother";1;2;0;"no";"yes";"no";"no";"yes";"yes";"yes";"no";4;4;4;3;3;5;0;"12";"11";13 +"GP";"F";18;"U";"GT3";"T";3;4;"other";"other";"course";"mother";1;1;0;"no";"yes";"no";"yes";"yes";"yes";"yes";"yes";5;4;4;1;1;1;4;"11";"12";14 +"GP";"F";17;"U";"GT3";"T";4;4;"health";"health";"course";"mother";1;1;0;"no";"no";"no";"yes";"yes";"yes";"yes";"no";5;3;4;1;2;5;2;"14";"15";17 +"GP";"F";17;"U";"GT3";"A";4;3;"services";"services";"course";"mother";1;2;0;"no";"yes";"no";"no";"yes";"yes";"yes";"yes";5;2;2;1;2;5;14;"15";"14";17 +"GP";"F";17;"U";"LE3";"A";3;3;"services";"other";"home";"mother";1;2;0;"yes";"yes";"no";"no";"yes";"yes";"yes";"no";5;3;3;1;1;5;0;"12";"12";13 +"GP";"F";17;"U";"LE3";"T";2;1;"other";"other";"home";"father";1;2;0;"no";"no";"no";"yes";"yes";"yes";"yes";"no";4;2;3;2;2;2;2;"11";"12";14 +"GP";"M";18;"U";"LE3";"T";4;4;"other";"other";"reputation";"father";1;1;0;"no";"yes";"no";"no";"yes";"yes";"yes";"no";4;2;5;3;4;5;2;"8";"9";11 +"GP";"F";19;"U";"GT3";"T";1;1;"other";"other";"course";"other";3;3;0;"no";"no";"no";"yes";"yes";"no";"no";"yes";1;5;5;4;3;5;12;"10";"10";11 +"GP";"F";19;"U";"LE3";"A";1;1;"other";"other";"course";"other";3;2;2;"no";"yes";"no";"no";"no";"yes";"yes";"yes";5;3;4;1;1;4;2;"8";"8";9 
+"GP";"F";18;"U";"GT3";"T";2;2;"other";"other";"course";"mother";1;1;0;"no";"yes";"no";"yes";"yes";"yes";"yes";"yes";4;3;5;2;4;5;2;"10";"10";10 +"GP";"F";17;"U";"GT3";"T";2;2;"other";"other";"course";"mother";1;2;0;"no";"yes";"no";"no";"yes";"yes";"no";"yes";4;2;2;1;1;3;4;"14";"13";13 +"GP";"F";17;"R";"LE3";"T";2;2;"services";"services";"course";"mother";1;3;0;"no";"yes";"no";"yes";"yes";"yes";"yes";"no";3;3;2;2;2;3;0;"11";"11";10 +"GP";"F";17;"U";"GT3";"T";3;1;"services";"services";"course";"father";1;3;0;"no";"yes";"no";"no";"no";"yes";"yes";"no";3;4;3;2;3;5;0;"17";"18";17 +"GP";"F";17;"U";"LE3";"T";0;2;"at_home";"at_home";"home";"father";2;3;0;"no";"no";"no";"no";"yes";"yes";"yes";"no";3;3;3;2;3;2;0;"14";"14";15 +"GP";"F";18;"U";"GT3";"T";1;1;"other";"other";"home";"mother";2;3;0;"no";"no";"no";"yes";"yes";"yes";"yes";"no";4;5;5;1;2;2;0;"14";"14";14 +"GP";"M";18;"U";"GT3";"T";4;4;"other";"other";"course";"mother";1;3;0;"no";"no";"no";"yes";"yes";"yes";"yes";"no";4;3;3;2;2;3;0;"13";"14";13 +"GP";"M";17;"U";"GT3";"T";3;3;"other";"services";"reputation";"mother";1;1;0;"no";"no";"no";"yes";"no";"yes";"yes";"no";4;3;5;3;5;5;0;"17";"18";17 +"GP";"M";17;"R";"GT3";"T";2;2;"services";"other";"course";"mother";4;1;0;"no";"yes";"no";"no";"yes";"yes";"yes";"no";4;4;5;5;5;4;2;"11";"10";10 +"GP";"F";17;"U";"GT3";"T";4;4;"teacher";"services";"course";"mother";1;3;0;"no";"yes";"no";"yes";"yes";"yes";"yes";"no";5;4;4;1;3;4;0;"13";"12";13 +"GP";"F";17;"U";"GT3";"T";4;4;"teacher";"teacher";"course";"mother";2;3;0;"no";"yes";"no";"no";"no";"yes";"yes";"yes";4;3;3;1;2;4;4;"15";"14";15 +"GP";"F";17;"U";"GT3";"T";3;3;"at_home";"other";"course";"mother";1;1;0;"no";"yes";"yes";"yes";"yes";"yes";"yes";"no";4;2;5;2;5;5;2;"11";"12";11 +"GP";"M";18;"U";"LE3";"T";2;2;"other";"other";"course";"mother";1;4;0;"no";"yes";"yes";"yes";"yes";"yes";"yes";"no";4;5;5;2;4;5;0;"11";"11";12 
+"GP";"M";19;"R";"GT3";"T";3;2;"at_home";"services";"home";"other";1;1;0;"no";"yes";"no";"no";"no";"yes";"no";"yes";5;3;4;2;2;5;0;"11";"10";10 +"GP";"F";18;"U";"GT3";"T";2;2;"at_home";"other";"course";"mother";4;2;0;"no";"no";"no";"yes";"yes";"yes";"no";"yes";4;2;5;1;1;2;2;"10";"9";10 +"GP";"F";17;"R";"GT3";"T";2;4;"at_home";"other";"course";"father";1;3;0;"no";"yes";"no";"no";"yes";"yes";"yes";"yes";4;4;3;1;1;5;0;"15";"15";15 +"GP";"M";18;"U";"GT3";"T";2;2;"other";"other";"reputation";"mother";1;1;0;"no";"no";"no";"no";"no";"yes";"yes";"no";5;4;2;1;2;5;6;"15";"14";15 +"GP";"F";18;"U";"GT3";"T";3;3;"services";"services";"home";"mother";1;2;0;"no";"no";"no";"yes";"yes";"yes";"yes";"no";5;3;4;1;1;4;8;"10";"11";12 +"GP";"F";18;"U";"LE3";"T";2;2;"other";"other";"home";"other";1;2;0;"no";"no";"no";"yes";"no";"yes";"yes";"yes";4;3;3;1;1;2;0;"10";"9";12 +"GP";"F";18;"R";"GT3";"T";2;2;"at_home";"other";"course";"mother";2;4;0;"no";"no";"no";"yes";"yes";"yes";"no";"no";4;4;4;1;1;4;6;"14";"13";14 +"GP";"F";17;"U";"GT3";"T";3;4;"services";"other";"course";"mother";1;3;0;"no";"no";"no";"no";"yes";"yes";"yes";"no";4;4;5;1;3;5;8;"11";"13";14 +"GP";"F";17;"U";"GT3";"T";3;2;"other";"other";"home";"mother";1;2;0;"no";"yes";"no";"no";"yes";"yes";"yes";"yes";4;3;2;2;3;2;0;"12";"13";15 +"GP";"F";18;"U";"LE3";"T";3;3;"services";"services";"home";"mother";1;4;0;"no";"yes";"no";"no";"yes";"yes";"yes";"no";5;3;3;1;1;1;4;"14";"14";15 +"GP";"F";17;"R";"GT3";"A";3;2;"other";"other";"home";"mother";1;2;0;"no";"yes";"no";"no";"yes";"yes";"yes";"no";4;3;3;2;3;2;0;"14";"14";16 +"GP";"M";18;"U";"GT3";"T";4;4;"teacher";"services";"home";"father";1;2;0;"no";"yes";"no";"yes";"yes";"yes";"yes";"no";4;3;3;2;2;2;0;"12";"12";13 +"GP";"M";18;"U";"LE3";"T";3;4;"services";"other";"home";"mother";1;2;0;"no";"no";"no";"yes";"yes";"yes";"yes";"yes";4;3;3;1;3;5;6;"16";"16";17 
+"GP";"F";17;"U";"GT3";"A";2;2;"at_home";"at_home";"home";"father";1;2;0;"no";"yes";"no";"no";"yes";"yes";"yes";"yes";3;3;1;1;2;4;18;"10";"12";14 +"GP";"F";18;"U";"GT3";"T";2;3;"at_home";"other";"course";"mother";1;3;0;"no";"yes";"no";"no";"yes";"yes";"yes";"no";4;3;3;1;2;3;0;"11";"12";14 +"GP";"F";18;"U";"GT3";"T";3;2;"other";"services";"other";"mother";1;3;0;"no";"no";"no";"no";"yes";"yes";"yes";"yes";5;4;3;2;3;1;4;"14";"16";17 +"GP";"M";18;"R";"GT3";"T";4;3;"teacher";"services";"course";"mother";1;3;0;"no";"no";"no";"no";"yes";"yes";"yes";"yes";5;3;2;1;2;4;4;"15";"14";17 +"GP";"M";18;"U";"GT3";"T";4;3;"teacher";"other";"course";"mother";1;3;0;"no";"yes";"no";"no";"yes";"yes";"yes";"yes";5;4;5;2;3;5;0;"14";"13";14 +"GP";"F";17;"U";"GT3";"T";4;3;"health";"other";"reputation";"mother";1;3;0;"no";"yes";"no";"yes";"yes";"yes";"yes";"yes";4;4;3;1;3;4;0;"11";"12";13 +"GP";"F";17;"U";"GT3";"T";2;1;"services";"other";"course";"mother";2;2;0;"no";"yes";"no";"yes";"yes";"yes";"yes";"yes";4;3;4;2;2;1;10;"12";"15";15 +"GP";"F";17;"U";"GT3";"T";2;1;"services";"other";"reputation";"mother";1;2;0;"no";"yes";"no";"yes";"yes";"yes";"yes";"no";4;3;5;2;4;4;4;"12";"16";16 +"GP";"F";19;"U";"LE3";"A";2;3;"at_home";"other";"home";"other";2;1;1;"no";"no";"no";"no";"yes";"no";"yes";"no";2;2;3;3;4;5;16;"10";"11";11 +"GP";"F";17;"U";"GT3";"T";3;1;"other";"at_home";"home";"mother";1;1;1;"no";"yes";"yes";"no";"yes";"yes";"yes";"yes";4;1;2;1;1;3;6;"10";"13";13 +"GP";"F";21;"U";"LE3";"T";4;4;"other";"other";"reputation";"other";1;3;2;"no";"no";"yes";"yes";"yes";"yes";"yes";"no";3;3;2;1;1;5;0;"9";"12";12 +"GP";"M";18;"U";"LE3";"T";2;2;"services";"services";"reputation";"mother";1;2;0;"no";"yes";"no";"yes";"no";"no";"yes";"no";4;4;4;1;3;3;11;"9";"11";12 +"GP";"M";18;"U";"LE3";"A";3;4;"other";"other";"reputation";"other";1;2;0;"no";"no";"no";"yes";"yes";"yes";"yes";"yes";4;3;5;1;4;2;9;"13";"14";15 
+"GP";"F";17;"U";"GT3";"T";2;2;"services";"services";"reputation";"mother";1;2;0;"no";"yes";"no";"yes";"yes";"yes";"yes";"no";4;3;4;1;3;4;0;"13";"17";17 +"GP";"M";17;"U";"LE3";"A";4;4;"health";"other";"reputation";"mother";1;3;0;"no";"yes";"no";"no";"yes";"yes";"yes";"no";4;4;2;1;2;4;2;"12";"15";15 +"GP";"F";18;"U";"LE3";"T";4;2;"teacher";"other";"course";"mother";1;2;0;"no";"yes";"no";"yes";"yes";"yes";"yes";"no";4;2;2;1;1;3;0;"14";"17";17 +"GP";"M";21;"R";"LE3";"T";1;1;"at_home";"other";"course";"other";2;2;2;"no";"yes";"no";"yes";"yes";"no";"yes";"yes";5;3;3;5;2;4;21;"9";"10";10 +"GP";"F";20;"R";"GT3";"T";1;1;"other";"other";"reputation";"other";2;3;0;"no";"no";"no";"no";"yes";"yes";"yes";"yes";3;2;2;1;3;3;8;"11";"15";15 +"GP";"F";19;"U";"GT3";"T";4;4;"teacher";"other";"home";"other";1;1;1;"no";"yes";"no";"no";"yes";"yes";"yes";"yes";3;2;5;4;4;5;5;"9";"10";11 +"GP";"M";17;"U";"LE3";"A";3;2;"other";"other";"reputation";"mother";1;2;0;"no";"yes";"no";"no";"yes";"yes";"yes";"no";4;4;4;1;2;5;10;"16";"18";18 +"GP";"F";18;"U";"GT3";"T";3;2;"at_home";"other";"reputation";"father";1;3;0;"no";"yes";"no";"no";"yes";"yes";"yes";"yes";4;3;4;1;2;2;5;"14";"17";17 +"GP";"M";18;"R";"GT3";"T";2;3;"other";"services";"reputation";"father";1;1;0;"no";"no";"no";"no";"yes";"yes";"yes";"no";3;1;3;4;5;4;13;"13";"14";14 +"GP";"M";19;"U";"GT3";"T";2;1;"other";"other";"reputation";"mother";1;1;0;"no";"no";"no";"no";"yes";"yes";"yes";"no";5;3;4;1;4;4;10;"7";"11";11 +"GP";"F";18;"U";"LE3";"A";2;2;"services";"other";"reputation";"mother";2;2;0;"no";"yes";"no";"no";"yes";"yes";"yes";"no";4;1;4;1;3;4;10;"14";"17";17 +"GP";"F";20;"U";"GT3";"T";1;0;"other";"other";"reputation";"mother";2;1;1;"yes";"no";"no";"no";"yes";"yes";"yes";"yes";5;3;1;1;1;5;5;"8";"10";10 +"GP";"F";18;"U";"GT3";"T";3;2;"services";"other";"home";"mother";1;2;0;"no";"yes";"no";"yes";"no";"yes";"yes";"yes";3;1;2;1;2;1;4;"10";"13";13 
+"MS";"F";16;"U";"GT3";"T";1;3;"at_home";"other";"other";"father";2;1;0;"no";"yes";"no";"no";"yes";"no";"yes";"yes";4;3;3;1;3;5;11;"10";"11";11 +"MS";"F";16;"R";"GT3";"T";2;2;"other";"other";"course";"mother";2;2;0;"no";"yes";"no";"yes";"yes";"yes";"yes";"no";4;4;4;1;1;5;0;"12";"12";12 +"MS";"F";15;"R";"GT3";"T";1;1;"at_home";"services";"other";"mother";1;1;1;"no";"yes";"no";"no";"yes";"yes";"no";"yes";4;1;3;1;1;2;6;"10";"10";10 +"MS";"F";15;"R";"GT3";"T";3;3;"at_home";"other";"course";"mother";2;1;0;"no";"yes";"no";"no";"yes";"yes";"yes";"no";5;4;4;2;3;5;4;"10";"10";11 +"MS";"F";16;"R";"GT3";"T";2;3;"at_home";"services";"course";"mother";2;2;0;"no";"no";"no";"no";"yes";"yes";"no";"no";4;5;2;1;2;5;0;"16";"17";17 +"MS";"F";15;"R";"LE3";"T";2;1;"at_home";"other";"home";"mother";2;1;0;"no";"no";"no";"no";"yes";"no";"no";"no";1;3;4;1;1;1;0;"6";"8";9 +"MS";"M";16;"R";"LE3";"A";4;4;"at_home";"other";"home";"mother";1;2;0;"no";"yes";"no";"no";"yes";"yes";"no";"no";5;3;2;1;3;2;5;"10";"11";11 +"MS";"M";16;"U";"GT3";"A";1;2;"other";"other";"other";"mother";1;3;0;"yes";"no";"no";"no";"yes";"yes";"yes";"no";4;4;3;1;1;5;0;"10";"11";11 +"MS";"F";17;"R";"GT3";"T";3;2;"at_home";"other";"course";"father";1;2;1;"no";"no";"no";"no";"yes";"yes";"no";"yes";4;5;4;1;2;5;0;"10";"10";10 +"MS";"F";17;"R";"GT3";"T";1;1;"other";"other";"other";"father";1;1;1;"no";"yes";"no";"no";"no";"no";"yes";"no";5;4;4;2;2;5;0;"6";"6";7 +"MS";"F";15;"R";"GT3";"T";4;4;"teacher";"other";"course";"mother";2;1;0;"no";"no";"no";"no";"yes";"yes";"yes";"yes";1;5;1;3;5;5;0;"13";"14";14 +"MS";"F";16;"U";"LE3";"A";2;2;"at_home";"other";"reputation";"mother";2;4;0;"no";"no";"no";"yes";"no";"no";"no";"yes";1;2;1;1;1;1;4;"10";"9";11 +"MS";"F";15;"R";"LE3";"T";1;1;"at_home";"services";"reputation";"father";2;2;0;"no";"yes";"no";"no";"yes";"yes";"yes";"no";5;4;3;1;2;4;0;"10";"10";10 
+"MS";"F";15;"R";"LE3";"T";1;1;"other";"services";"course";"mother";2;1;1;"no";"yes";"no";"no";"yes";"yes";"yes";"yes";4;4;3;1;2;2;4;"6";"7";8 +"MS";"F";16;"R";"GT3";"T";0;2;"other";"other";"other";"mother";2;1;0;"no";"yes";"no";"yes";"yes";"yes";"no";"no";3;2;3;1;2;2;0;"12";"11";12 +"MS";"F";17;"R";"GT3";"T";2;3;"other";"other";"course";"mother";2;1;0;"no";"yes";"no";"no";"yes";"yes";"yes";"no";5;5;5;1;3;3;2;"10";"11";12 +"MS";"F";15;"R";"GT3";"T";3;3;"other";"services";"course";"father";2;1;0;"no";"no";"no";"no";"no";"yes";"yes";"no";4;1;3;1;1;4;0;"14";"16";16 +"MS";"M";16;"U";"GT3";"T";1;1;"at_home";"services";"home";"mother";2;2;0;"no";"yes";"no";"yes";"yes";"yes";"no";"yes";5;4;5;4;5;3;0;"7";"0";0 +"MS";"M";17;"U";"GT3";"T";1;1;"other";"other";"home";"mother";1;2;0;"no";"no";"yes";"no";"no";"yes";"yes";"no";4;4;3;2;4;5;4;"8";"9";9 +"MS";"M";15;"R";"LE3";"T";4;1;"health";"services";"reputation";"father";1;2;0;"no";"yes";"no";"yes";"yes";"yes";"yes";"no";5;3;4;1;2;2;0;"12";"13";14 +"MS";"M";15;"R";"LE3";"T";4;1;"health";"services";"reputation";"father";1;2;0;"no";"yes";"no";"yes";"yes";"yes";"yes";"no";5;3;4;1;2;2;7;"7";"9";8 +"MS";"M";16;"R";"GT3";"T";3;4;"other";"health";"other";"mother";3;2;0;"no";"no";"no";"no";"no";"yes";"no";"no";3;4;5;1;2;5;4;"9";"10";11 +"MS";"M";15;"R";"GT3";"T";1;1;"other";"other";"course";"mother";4;2;0;"no";"yes";"no";"yes";"yes";"yes";"no";"yes";5;4;5;2;4;4;8;"7";"9";9 +"MS";"M";15;"U";"LE3";"T";3;3;"at_home";"at_home";"reputation";"father";1;2;0;"no";"yes";"no";"yes";"yes";"yes";"yes";"no";5;3;3;1;1;5;0;"11";"11";11 +"MS";"M";17;"R";"GT3";"T";2;1;"other";"other";"other";"mother";3;1;0;"no";"no";"no";"yes";"yes";"no";"no";"yes";5;5;5;5;5;3;8;"8";"10";9 +"MS";"F";16;"R";"GT3";"T";4;4;"teacher";"teacher";"course";"mother";2;3;0;"no";"no";"no";"yes";"yes";"yes";"yes";"yes";4;2;2;1;1;4;6;"16";"16";17 
+"MS";"F";15;"R";"GT3";"T";1;2;"other";"services";"course";"mother";2;1;0;"no";"no";"no";"no";"yes";"yes";"no";"no";5;1;2;1;1;1;3;"11";"13";13 +"MS";"F";16;"R";"GT3";"T";2;3;"other";"services";"course";"mother";3;2;0;"no";"yes";"no";"no";"yes";"yes";"yes";"no";4;5;4;1;2;1;2;"15";"15";15 +"MS";"M";16;"R";"GT3";"T";1;2;"other";"other";"course";"father";2;2;0;"no";"no";"no";"no";"yes";"yes";"no";"no";4;3;3;1;1;5;0;"10";"11";11 +"MS";"F";16;"R";"GT3";"T";2;2;"other";"other";"course";"mother";3;2;0;"no";"yes";"no";"no";"yes";"yes";"yes";"no";4;4;5;1;1;4;4;"9";"10";11 +"MS";"F";16;"U";"GT3";"T";1;2;"other";"services";"course";"mother";1;3;1;"no";"yes";"no";"no";"yes";"yes";"no";"no";1;3;2;1;2;4;0;"10";"8";8 +"MS";"F";16;"U";"GT3";"T";1;2;"other";"services";"course";"mother";1;3;1;"no";"yes";"no";"no";"yes";"yes";"no";"no";1;3;2;1;2;4;3;"9";"8";8 +"MS";"F";15;"U";"GT3";"T";2;1;"at_home";"other";"home";"mother";1;2;0;"yes";"yes";"no";"no";"no";"yes";"yes";"no";4;4;2;3;3;2;0;"9";"10";9 +"MS";"F";16;"U";"GT3";"T";1;1;"at_home";"other";"course";"father";1;2;0;"no";"yes";"no";"no";"no";"yes";"no";"yes";5;4;3;2;1;2;0;"13";"14";15 +"MS";"M";17;"R";"LE3";"T";1;2;"at_home";"services";"reputation";"mother";1;1;0;"no";"yes";"no";"yes";"yes";"yes";"yes";"no";5;5;5;5;5;3;4;"10";"11";11 +"MS";"F";16;"R";"GT3";"T";1;1;"other";"other";"home";"father";4;4;0;"no";"yes";"no";"no";"no";"yes";"yes";"no";4;3;2;1;1;1;0;"13";"10";13 +"MS";"F";16;"R";"GT3";"T";1;1;"at_home";"other";"other";"father";4;3;0;"yes";"yes";"no";"no";"yes";"yes";"no";"no";4;4;3;1;1;5;2;"10";"9";10 +"MS";"F";15;"R";"GT3";"T";1;1;"at_home";"other";"home";"father";2;2;0;"no";"yes";"no";"yes";"yes";"yes";"yes";"no";4;3;3;1;1;2;1;"11";"10";11 +"MS";"F";16;"R";"GT3";"T";1;1;"at_home";"other";"other";"mother";2;1;0;"no";"no";"no";"yes";"yes";"yes";"yes";"yes";4;2;2;4;3;2;0;"13";"12";14 +"MS";"F";15;"R";"GT3";"T";1;1;"at_home";"at_home";"course";"father";3;2;0;"no";"yes";"no";"no";"yes";"yes";"no";"no";4;2;1;1;2;2;0;"13";"14";14 
+"MS";"F";15;"R";"LE3";"T";2;2;"other";"other";"other";"father";1;3;0;"yes";"yes";"no";"no";"yes";"yes";"no";"no";4;4;3;2;2;5;2;"14";"11";12 +"MS";"M";16;"R";"GT3";"T";1;1;"at_home";"other";"other";"father";2;1;0;"no";"no";"no";"yes";"yes";"yes";"no";"no";3;4;4;3;4;5;6;"11";"11";11 +"MS";"F";18;"U";"GT3";"T";1;2;"other";"other";"course";"father";1;2;1;"no";"yes";"no";"yes";"yes";"yes";"yes";"yes";3;4;4;2;3;5;9;"9";"8";8 +"MS";"M";15;"U";"GT3";"T";3;1;"other";"services";"home";"mother";2;1;0;"no";"yes";"no";"no";"yes";"yes";"no";"no";3;2;3;1;3;4;0;"10";"9";11 +"MS";"F";16;"R";"GT3";"T";2;2;"other";"services";"course";"father";3;2;0;"no";"yes";"no";"yes";"yes";"yes";"yes";"no";5;3;4;1;1;2;1;"14";"13";14 +"MS";"M";15;"U";"GT3";"T";2;2;"health";"other";"reputation";"mother";3;1;0;"no";"no";"no";"no";"yes";"yes";"no";"no";4;3;3;1;2;4;1;"13";"12";13 +"MS";"M";16;"U";"GT3";"T";4;4;"other";"teacher";"course";"father";1;2;0;"no";"yes";"no";"yes";"yes";"yes";"no";"yes";4;3;1;1;1;3;0;"13";"12";13 +"MS";"F";15;"R";"GT3";"T";3;3;"services";"other";"reputation";"mother";1;2;0;"no";"yes";"no";"no";"yes";"yes";"yes";"yes";4;5;4;1;1;1;4;"13";"12";12 +"MS";"F";16;"R";"GT3";"T";2;2;"at_home";"other";"course";"mother";2;2;1;"no";"yes";"no";"yes";"no";"yes";"no";"no";4;4;4;2;3;5;2;"12";"11";12 +"MS";"F";16;"R";"LE3";"T";2;2;"other";"other";"home";"father";3;1;0;"no";"yes";"no";"yes";"yes";"yes";"no";"yes";4;3;2;1;1;4;0;"14";"14";16 +"MS";"M";16;"U";"LE3";"T";2;1;"at_home";"services";"course";"mother";2;1;0;"no";"yes";"no";"yes";"yes";"yes";"yes";"no";2;4;3;2;3;4;4;"10";"8";10 +"MS";"M";15;"R";"LE3";"T";1;3;"at_home";"other";"reputation";"father";3;1;0;"no";"yes";"no";"yes";"yes";"yes";"yes";"no";4;2;4;3;5;3;2;"10";"11";11 +"MS";"F";15;"U";"GT3";"T";2;2;"other";"services";"course";"mother";2;3;0;"no";"yes";"no";"yes";"yes";"yes";"no";"no";5;3;2;1;1;4;0;"12";"13";14 
+"MS";"F";16;"R";"LE3";"T";2;1;"other";"other";"home";"mother";1;1;0;"no";"yes";"no";"no";"yes";"yes";"yes";"yes";5;4;3;1;1;5;2;"10";"8";8 +"MS";"M";15;"U";"GT3";"T";3;3;"services";"services";"course";"father";2;1;0;"no";"yes";"no";"yes";"no";"yes";"yes";"no";4;3;3;2;4;3;11;"12";"10";11 +"MS";"F";16;"R";"GT3";"T";1;1;"at_home";"other";"course";"father";2;2;3;"yes";"yes";"no";"no";"yes";"yes";"no";"no";3;4;3;1;1;1;0;"7";"7";8 +"MS";"F";17;"U";"GT3";"T";2;2;"other";"at_home";"course";"mother";1;1;0;"no";"yes";"no";"yes";"yes";"no";"no";"no";4;5;3;1;1;5;4;"9";"9";10 +"MS";"F";19;"U";"GT3";"T";2;3;"at_home";"services";"course";"other";1;1;1;"no";"no";"no";"no";"yes";"no";"yes";"yes";4;4;4;1;1;2;0;"9";"9";10 +"MS";"F";17;"R";"GT3";"T";2;1;"at_home";"other";"course";"mother";3;1;0;"no";"yes";"no";"yes";"yes";"no";"no";"yes";5;5;3;1;1;3;2;"9";"10";11 +"MS";"F";15;"R";"LE3";"T";1;1;"at_home";"other";"course";"mother";2;1;0;"no";"yes";"no";"no";"yes";"no";"no";"yes";5;2;1;1;3;4;0;"9";"10";9 +"MS";"F";16;"R";"GT3";"T";2;2;"other";"other";"course";"father";3;2;0;"no";"yes";"no";"no";"yes";"no";"yes";"no";3;4;5;1;2;1;1;"9";"10";11 +"MS";"F";16;"U";"LE3";"A";2;2;"other";"other";"home";"mother";1;1;0;"no";"yes";"no";"no";"yes";"no";"no";"no";4;3;4;1;2;1;6;"7";"7";8 +"MS";"F";17;"R";"GT3";"T";2;2;"at_home";"other";"course";"mother";2;2;0;"no";"yes";"no";"no";"yes";"yes";"yes";"no";4;3;5;1;2;4;0;"11";"10";11 +"MS";"F";16;"U";"GT3";"T";2;2;"other";"services";"course";"father";1;1;1;"no";"yes";"yes";"yes";"no";"yes";"yes";"no";4;4;3;1;4;3;1;"9";"10";10 +"MS";"F";18;"R";"LE3";"A";3;2;"other";"other";"course";"other";2;3;2;"no";"yes";"no";"no";"no";"no";"no";"yes";3;3;2;1;1;2;6;"7";"9";10 +"MS";"F";19;"U";"GT3";"T";1;1;"at_home";"services";"course";"mother";1;3;1;"no";"no";"no";"yes";"yes";"no";"no";"yes";5;3;1;1;1;3;6;"7";"9";9 +"MS";"M";18;"R";"GT3";"T";1;1;"other";"other";"home";"mother";2;1;1;"no";"no";"no";"yes";"yes";"no";"yes";"no";4;4;3;3;4;4;0;"8";"9";10 
+"MS";"F";18;"R";"GT3";"T";1;1;"at_home";"at_home";"course";"mother";2;1;1;"no";"no";"no";"no";"no";"no";"yes";"yes";3;2;3;1;1;2;4;"9";"11";10 +"MS";"F";19;"U";"GT3";"T";1;1;"other";"other";"course";"other";2;2;1;"no";"yes";"no";"no";"yes";"yes";"yes";"yes";1;1;4;4;1;1;12;"7";"8";9 +"MS";"F";16;"R";"GT3";"A";2;2;"health";"other";"course";"mother";1;2;0;"no";"no";"no";"no";"no";"yes";"no";"yes";3;3;2;1;1;3;2;"8";"10";10 +"MS";"F";17;"U";"GT3";"T";0;1;"other";"at_home";"course";"father";2;1;0;"no";"no";"no";"yes";"no";"yes";"no";"no";2;4;4;3;5;5;5;"9";"9";10 +"MS";"F";16;"R";"LE3";"T";1;2;"at_home";"other";"course";"mother";1;2;0;"no";"no";"no";"yes";"yes";"no";"yes";"no";4;4;5;1;3;3;0;"8";"9";9 +"MS";"F";16;"U";"GT3";"T";3;3;"other";"other";"reputation";"mother";1;1;0;"no";"no";"no";"yes";"yes";"no";"yes";"yes";4;5;4;1;1;4;0;"14";"13";13 +"MS";"F";16;"R";"LE3";"T";1;1;"services";"services";"home";"mother";1;1;0;"no";"yes";"no";"yes";"yes";"yes";"yes";"yes";4;4;4;2;2;4;2;"14";"14";14 +"MS";"M";17;"U";"GT3";"T";3;3;"services";"at_home";"course";"mother";2;4;1;"no";"yes";"yes";"yes";"yes";"yes";"no";"no";5;4;5;3;4;5;0;"10";"11";10 +"MS";"F";16;"U";"GT3";"T";2;1;"other";"services";"course";"mother";1;2;0;"no";"yes";"no";"yes";"yes";"yes";"yes";"no";5;3;3;1;1;1;0;"14";"13";14 +"MS";"F";16;"U";"GT3";"T";2;2;"services";"other";"course";"mother";1;1;0;"no";"yes";"yes";"yes";"yes";"yes";"no";"yes";4;2;5;1;2;5;0;"17";"16";16 +"MS";"M";17;"U";"GT3";"T";1;2;"other";"other";"course";"father";1;1;1;"no";"yes";"no";"yes";"yes";"no";"yes";"yes";5;3;5;5;5;1;12;"6";"7";7 +"MS";"M";16;"U";"LE3";"T";4;3;"other";"other";"course";"father";1;1;0;"no";"no";"no";"yes";"yes";"yes";"yes";"no";4;2;5;1;5;5;8;"14";"12";13 +"MS";"M";17;"R";"LE3";"T";2;2;"services";"services";"other";"mother";3;4;1;"no";"yes";"no";"no";"yes";"yes";"no";"no";1;3;5;3;5;3;2;"10";"8";9 
+"MS";"F";16;"U";"GT3";"T";1;1;"other";"other";"course";"other";1;4;0;"yes";"yes";"no";"yes";"yes";"yes";"yes";"no";2;2;1;1;1;5;0;"14";"14";14 +"MS";"F";19;"U";"LE3";"T";2;2;"other";"other";"home";"mother";1;3;0;"no";"no";"no";"no";"yes";"yes";"yes";"yes";5;4;5;1;1;1;0;"12";"13";13 +"MS";"F";17;"R";"GT3";"T";1;1;"at_home";"other";"reputation";"mother";2;1;0;"no";"yes";"no";"yes";"no";"yes";"yes";"yes";4;4;5;1;2;5;0;"11";"11";11 +"MS";"F";20;"U";"GT3";"T";3;3;"at_home";"services";"other";"mother";2;2;1;"no";"no";"no";"yes";"yes";"yes";"yes";"yes";3;3;4;2;4;3;8;"11";"9";10 +"MS";"F";17;"U";"LE3";"T";1;1;"other";"services";"course";"father";1;3;0;"no";"yes";"no";"no";"yes";"yes";"no";"yes";4;3;3;1;1;3;0;"11";"11";10 +"MS";"M";17;"R";"GT3";"T";2;2;"other";"other";"course";"mother";3;1;1;"no";"yes";"no";"no";"no";"yes";"yes";"no";4;4;5;1;2;5;0;"10";"9";9 +"MS";"F";16;"R";"LE3";"T";1;1;"at_home";"other";"course";"father";3;2;0;"no";"yes";"no";"no";"yes";"yes";"no";"no";5;3;2;1;1;1;0;"16";"17";18 +"MS";"F";17;"R";"GT3";"T";2;2;"other";"other";"reputation";"mother";2;2;0;"no";"yes";"no";"yes";"yes";"yes";"no";"no";5;3;2;1;1;1;0;"15";"17";17 +"MS";"F";17;"U";"GT3";"A";1;0;"other";"other";"other";"mother";2;2;0;"no";"no";"no";"no";"yes";"yes";"yes";"yes";4;4;5;1;1;4;1;"11";"9";10 +"MS";"F";18;"R";"GT3";"T";1;1;"at_home";"other";"other";"mother";1;2;1;"no";"yes";"no";"no";"yes";"yes";"yes";"yes";4;3;2;1;1;5;9;"7";"7";7 +"MS";"F";16;"U";"GT3";"T";3;1;"other";"other";"course";"mother";1;1;0;"no";"no";"no";"yes";"yes";"yes";"yes";"no";3;1;3;1;3;1;0;"8";"6";8 +"MS";"F";16;"U";"GT3";"T";3;2;"services";"at_home";"course";"mother";1;1;0;"no";"no";"no";"no";"yes";"yes";"yes";"no";3;1;3;1;4;3;2;"7";"6";7 +"MS";"F";18;"U";"LE3";"T";1;1;"other";"at_home";"reputation";"mother";2;2;0;"yes";"no";"no";"no";"yes";"yes";"no";"no";2;3;5;1;4;3;8;"9";"8";10 
+"MS";"F";16;"R";"GT3";"T";4;4;"health";"teacher";"reputation";"father";1;2;0;"no";"no";"no";"yes";"no";"yes";"yes";"yes";4;3;3;2;3;2;0;"14";"16";16 +"MS";"F";16;"R";"LE3";"T";1;2;"other";"other";"reputation";"mother";2;1;0;"no";"no";"no";"yes";"yes";"yes";"yes";"no";5;4;5;1;4;2;0;"14";"14";15 +"MS";"F";18;"U";"GT3";"A";2;4;"other";"services";"reputation";"father";1;2;1;"no";"yes";"no";"no";"yes";"yes";"yes";"no";2;3;2;1;3;1;8;"8";"5";8 +"MS";"M";16;"R";"GT3";"T";2;1;"other";"services";"reputation";"mother";2;2;0;"no";"no";"no";"yes";"yes";"yes";"yes";"no";5;2;1;1;1;2;0;"8";"7";0 +"MS";"F";16;"U";"LE3";"T";1;1;"at_home";"other";"other";"mother";3;2;0;"no";"yes";"no";"no";"yes";"yes";"yes";"no";4;3;2;1;3;5;6;"6";"8";8 +"MS";"F";16;"R";"GT3";"T";2;3;"at_home";"services";"other";"mother";2;2;0;"no";"yes";"no";"yes";"yes";"yes";"yes";"no";3;3;3;1;1;2;0;"8";"10";10 +"MS";"F";16;"U";"GT3";"T";4;4;"health";"health";"course";"mother";1;2;0;"no";"yes";"no";"no";"yes";"yes";"yes";"yes";4;3;4;1;2;3;4;"8";"8";8 +"MS";"M";18;"U";"LE3";"T";4;4;"at_home";"health";"home";"mother";1;4;0;"no";"yes";"no";"yes";"yes";"no";"yes";"yes";5;5;5;5;5;5;2;"5";"6";6 +"MS";"F";16;"R";"LE3";"T";3;4;"at_home";"other";"other";"mother";3;2;0;"no";"yes";"no";"no";"no";"yes";"no";"no";4;2;1;1;1;2;2;"7";"9";8 +"MS";"M";17;"U";"LE3";"T";4;4;"other";"services";"home";"mother";1;3;0;"no";"yes";"no";"no";"yes";"yes";"yes";"no";4;4;3;1;2;5;0;"15";"14";16 +"MS";"F";17;"R";"GT3";"T";4;1;"other";"other";"other";"mother";1;1;0;"no";"no";"no";"no";"yes";"yes";"yes";"yes";4;2;3;1;2;5;1;"13";"14";14 +"MS";"M";16;"U";"LE3";"T";2;2;"services";"services";"other";"mother";4;3;0;"no";"no";"no";"no";"yes";"yes";"no";"no";5;1;3;2;2;3;0;"10";"9";10 +"MS";"F";17;"R";"GT3";"T";2;2;"at_home";"other";"other";"mother";1;1;0;"no";"yes";"yes";"no";"yes";"yes";"yes";"no";5;1;3;1;2;5;5;"9";"9";9 
+"MS";"F";16;"U";"LE3";"T";4;4;"services";"services";"other";"father";2;1;0;"no";"yes";"no";"no";"yes";"yes";"no";"no";5;1;3;1;2;5;1;"11";"11";11 +"MS";"M";17;"U";"GT3";"T";3;3;"services";"services";"home";"mother";1;1;0;"no";"yes";"no";"yes";"yes";"yes";"yes";"no";4;1;4;5;5;3;8;"7";"10";9 +"MS";"M";17;"U";"GT3";"T";1;1;"at_home";"services";"other";"mother";3;2;0;"no";"no";"no";"no";"yes";"yes";"yes";"yes";5;1;3;3;3;1;0;"10";"10";10 +"MS";"M";16;"U";"GT3";"T";2;1;"health";"services";"other";"mother";2;2;0;"no";"no";"no";"no";"no";"yes";"yes";"yes";4;2;2;1;4;5;2;"9";"7";8 +"MS";"F";16;"U";"LE3";"T";2;1;"other";"services";"other";"mother";1;2;0;"no";"no";"no";"no";"yes";"yes";"yes";"yes";3;2;2;1;1;3;0;"14";"15";16 +"MS";"M";16;"U";"LE3";"T";4;4;"teacher";"health";"other";"father";1;1;0;"no";"yes";"no";"no";"yes";"yes";"yes";"no";4;1;2;2;5;5;0;"11";"12";12 +"MS";"M";15;"R";"GT3";"T";1;2;"other";"services";"course";"mother";3;2;0;"no";"yes";"no";"yes";"yes";"yes";"no";"no";5;5;5;1;3;5;11;"9";"11";10 +"MS";"M";15;"U";"LE3";"A";2;2;"other";"other";"reputation";"mother";3;4;0;"no";"yes";"no";"yes";"yes";"yes";"no";"no";5;4;5;2;3;5;8;"13";"14";14 +"MS";"M";15;"U";"LE3";"A";2;1;"services";"services";"course";"mother";1;1;0;"no";"no";"no";"yes";"yes";"yes";"yes";"no";4;3;3;1;2;5;11;"12";"13";12 +"MS";"F";16;"R";"LE3";"T";2;2;"other";"other";"course";"mother";1;3;0;"no";"yes";"no";"no";"no";"yes";"no";"yes";4;3;3;2;2;5;2;"11";"11";11 +"MS";"F";16;"U";"LE3";"T";4;1;"other";"other";"home";"mother";2;2;0;"no";"yes";"no";"yes";"yes";"yes";"yes";"no";1;2;4;2;2;1;8;"9";"10";10 +"MS";"F";17;"U";"GT3";"T";3;2;"at_home";"other";"home";"mother";2;1;0;"no";"no";"no";"no";"yes";"yes";"no";"yes";4;3;3;2;2;1;5;"9";"11";11 +"MS";"F";17;"R";"GT3";"T";2;2;"other";"other";"other";"mother";2;2;0;"yes";"no";"yes";"no";"yes";"yes";"no";"no";5;1;3;1;1;5;0;"11";"9";11 
+"MS";"F";16;"U";"GT3";"T";4;4;"teacher";"services";"course";"mother";2;3;0;"no";"yes";"no";"no";"yes";"yes";"yes";"yes";5;3;5;1;4;5;1;"10";"11";12 +"MS";"M";17;"R";"GT3";"T";4;4;"health";"other";"course";"father";3;1;3;"no";"no";"no";"yes";"yes";"yes";"yes";"yes";3;3;3;1;3;5;2;"9";"9";8 +"MS";"M";17;"R";"LE3";"T";1;3;"other";"other";"course";"father";2;1;0;"no";"no";"no";"yes";"yes";"yes";"no";"yes";5;1;2;3;3;5;2;"12";"11";12 +"MS";"M";17;"U";"GT3";"T";3;4;"services";"other";"other";"mother";1;2;1;"no";"yes";"no";"yes";"no";"yes";"yes";"yes";5;4;4;3;4;5;8;"8";"9";8 +"MS";"F";17;"U";"GT3";"T";4;4;"health";"health";"course";"father";1;2;0;"no";"yes";"no";"no";"yes";"yes";"yes";"no";5;2;5;1;1;5;0;"13";"15";16 +"MS";"M";16;"R";"LE3";"T";4;1;"other";"at_home";"other";"father";1;1;0;"no";"no";"no";"no";"yes";"yes";"yes";"no";4;1;2;2;1;2;0;"10";"11";11 +"MS";"F";17;"U";"GT3";"A";1;1;"at_home";"at_home";"other";"mother";1;2;0;"no";"no";"no";"yes";"yes";"yes";"yes";"yes";4;5;5;1;2;3;2;"11";"10";11 +"MS";"F";17;"R";"GT3";"T";4;2;"other";"other";"course";"mother";2;2;0;"yes";"yes";"no";"no";"no";"yes";"yes";"no";4;3;3;2;3;5;0;"17";"18";18 +"MS";"M";16;"U";"LE3";"A";2;2;"other";"services";"course";"father";2;2;0;"no";"yes";"no";"no";"no";"yes";"yes";"yes";4;1;2;2;2;5;0;"12";"13";13 +"MS";"M";17;"U";"GT3";"T";3;2;"other";"other";"other";"father";2;2;0;"no";"yes";"yes";"no";"yes";"yes";"yes";"no";4;1;2;2;2;1;0;"13";"14";13 +"MS";"M";19;"U";"GT3";"T";1;1;"other";"other";"other";"mother";1;2;2;"no";"yes";"no";"yes";"yes";"no";"yes";"no";4;4;3;3;4;4;2;"9";"9";10 +"MS";"M";17;"U";"LE3";"A";1;0;"other";"other";"home";"mother";1;1;0;"no";"no";"no";"no";"yes";"yes";"no";"yes";4;1;2;1;1;5;4;"11";"11";12 +"MS";"F";17;"R";"GT3";"T";1;1;"at_home";"at_home";"course";"father";2;1;0;"no";"yes";"no";"yes";"yes";"no";"yes";"yes";3;5;5;2;2;4;3;"10";"11";10 +"MS";"F";16;"R";"GT3";"T";1;2;"other";"other";"home";"father";1;3;0;"yes";"yes";"no";"no";"no";"yes";"yes";"yes";4;3;4;1;1;3;5;"13";"14";13 
+"MS";"M";16;"R";"LE3";"T";1;2;"other";"at_home";"course";"mother";1;1;0;"no";"no";"no";"no";"yes";"yes";"no";"no";4;4;4;2;4;5;4;"9";"10";11 +"MS";"F";17;"R";"GT3";"T";3;1;"other";"other";"course";"mother";2;2;3;"no";"yes";"no";"yes";"no";"yes";"yes";"yes";5;4;4;1;1;5;2;"7";"9";10 +"MS";"M";17;"R";"GT3";"T";2;2;"other";"other";"course";"mother";2;1;0;"no";"no";"no";"yes";"yes";"no";"no";"yes";5;5;5;3;5;5;0;"8";"13";10 +"MS";"M";18;"R";"GT3";"T";1;0;"at_home";"at_home";"course";"other";3;1;1;"yes";"yes";"no";"no";"yes";"yes";"no";"no";4;3;2;1;1;4;0;"12";"12";13 +"MS";"M";17;"R";"GT3";"T";1;1;"other";"services";"course";"mother";2;1;0;"no";"yes";"no";"yes";"no";"yes";"yes";"yes";4;5;5;1;3;2;0;"10";"9";10 +"MS";"M";18;"U";"LE3";"T";1;1;"at_home";"at_home";"course";"mother";2;2;0;"no";"yes";"no";"yes";"yes";"yes";"no";"no";4;3;3;1;4;5;6;"10";"9";10 +"MS";"F";16;"R";"LE3";"T";2;2;"other";"services";"course";"father";1;2;0;"no";"no";"no";"yes";"yes";"yes";"no";"yes";5;4;3;1;1;1;0;"11";"13";12 +"MS";"M";17;"U";"GT3";"T";2;2;"other";"other";"course";"mother";1;1;1;"no";"no";"no";"yes";"yes";"yes";"no";"yes";1;2;1;2;3;5;0;"7";"0";0 +"MS";"M";16;"R";"GT3";"T";3;2;"services";"other";"course";"father";2;1;0;"no";"no";"no";"no";"yes";"yes";"yes";"no";4;5;5;2;3;5;2;"11";"9";10 +"MS";"M";16;"R";"LE3";"T";1;1;"at_home";"other";"course";"mother";2;1;0;"no";"no";"no";"yes";"yes";"yes";"yes";"no";4;5;5;2;4;5;0;"10";"10";9 +"MS";"M";18;"R";"GT3";"T";1;1;"services";"other";"course";"other";2;1;1;"no";"yes";"no";"no";"yes";"no";"yes";"yes";5;3;3;2;3;5;2;"9";"7";9 +"MS";"M";18;"R";"GT3";"T";3;2;"services";"other";"course";"mother";1;1;1;"no";"no";"no";"no";"yes";"no";"yes";"no";2;3;1;2;2;5;0;"4";"0";0 +"MS";"M";19;"U";"GT3";"T";3;2;"at_home";"services";"course";"mother";2;1;3;"no";"no";"no";"yes";"yes";"yes";"no";"no";3;2;1;1;1;3;4;"6";"11";9 +"MS";"M";18;"U";"GT3";"T";3;3;"at_home";"at_home";"course";"mother";1;2;2;"no";"yes";"no";"yes";"yes";"no";"yes";"no";4;4;5;1;3;3;9;"4";"8";8 
+"MS";"M";16;"R";"GT3";"T";2;2;"services";"services";"course";"mother";2;1;0;"no";"yes";"no";"yes";"yes";"yes";"yes";"yes";5;4;3;2;4;4;6;"7";"8";8 +"MS";"M";19;"U";"GT3";"T";2;1;"at_home";"other";"course";"other";2;1;3;"no";"no";"no";"yes";"no";"no";"yes";"yes";4;4;3;1;3;5;4;"8";"9";9 +"MS";"F";16;"U";"GT3";"A";3;2;"services";"at_home";"course";"mother";2;2;2;"no";"yes";"no";"yes";"yes";"yes";"no";"yes";2;5;5;1;1;1;8;"5";"5";7 +"MS";"F";17;"U";"GT3";"T";1;1;"other";"at_home";"course";"mother";1;1;0;"no";"yes";"no";"yes";"yes";"yes";"no";"no";4;3;2;1;2;5;9;"7";"9";10 +"MS";"M";20;"R";"GT3";"T";1;1;"other";"other";"course";"other";2;1;1;"no";"yes";"no";"no";"yes";"no";"yes";"yes";4;4;3;2;4;4;12;"8";"11";10 +"MS";"F";18;"R";"GT3";"A";4;3;"services";"services";"course";"mother";1;1;0;"no";"yes";"no";"no";"yes";"yes";"yes";"no";5;4;4;3;4;2;8;"10";"11";10 +"MS";"M";18;"R";"GT3";"T";3;2;"other";"other";"course";"mother";2;1;0;"no";"yes";"no";"no";"no";"yes";"yes";"no";2;5;5;5;5;5;8;"9";"10";11 +"MS";"M";19;"R";"GT3";"T";1;1;"other";"services";"home";"other";3;2;1;"no";"no";"no";"no";"yes";"yes";"yes";"no";5;4;4;3;3;2;8;"10";"9";11 +"MS";"M";17;"U";"GT3";"T";3;3;"health";"other";"course";"mother";2;2;1;"no";"yes";"no";"no";"yes";"yes";"yes";"no";4;5;4;2;3;3;4;"8";"9";10 +"MS";"M";18;"U";"LE3";"T";1;3;"at_home";"services";"course";"mother";1;1;0;"no";"no";"no";"no";"yes";"no";"yes";"yes";4;3;3;2;3;3;0;"9";"10";9 +"MS";"M";19;"R";"GT3";"T";1;1;"other";"other";"home";"other";3;1;1;"no";"yes";"no";"no";"yes";"yes";"yes";"no";4;4;4;3;3;5;4;"8";"9";10 +"MS";"F";18;"U";"GT3";"A";1;2;"at_home";"other";"course";"mother";2;2;2;"no";"yes";"no";"no";"yes";"yes";"no";"no";4;3;3;1;1;5;2;"6";"8";8 +"MS";"F";19;"U";"LE3";"A";1;1;"at_home";"other";"course";"mother";1;1;0;"no";"yes";"no";"no";"yes";"no";"no";"no";1;4;4;1;1;5;0;"6";"8";7 +"MS";"F";18;"R";"GT3";"T";2;2;"other";"other";"other";"mother";2;1;1;"no";"no";"no";"no";"yes";"no";"yes";"yes";5;5;5;1;1;3;0;"8";"6";0 
+"MS";"F";17;"R";"GT3";"T";0;0;"at_home";"other";"course";"mother";2;1;0;"no";"yes";"no";"no";"yes";"yes";"yes";"no";4;4;3;1;1;5;0;"10";"11";11 +"MS";"F";17;"R";"LE3";"A";3;1;"other";"at_home";"course";"other";2;3;0;"no";"yes";"yes";"no";"yes";"no";"no";"no";4;2;3;2;2;3;5;"8";"7";8 +"MS";"F";17;"U";"GT3";"T";4;2;"teacher";"services";"home";"mother";1;2;0;"yes";"yes";"no";"yes";"yes";"yes";"yes";"no";5;5;5;1;3;5;0;"8";"8";0 +"MS";"F";18;"R";"LE3";"T";2;2;"services";"services";"course";"mother";1;2;1;"no";"yes";"no";"yes";"yes";"yes";"yes";"no";2;3;3;1;2;4;3;"7";"6";8 +"MS";"F";17;"U";"GT3";"T";4;1;"health";"at_home";"course";"mother";1;1;0;"no";"yes";"no";"no";"yes";"yes";"no";"yes";3;2;2;1;1;5;0;"8";"10";9 +"MS";"F";17;"U";"LE3";"T";1;2;"at_home";"other";"course";"father";1;1;0;"no";"no";"no";"no";"yes";"yes";"yes";"no";5;5;1;1;1;3;0;"7";"10";10 +"MS";"F";18;"U";"GT3";"T";1;1;"other";"other";"course";"mother";3;2;2;"no";"no";"no";"yes";"yes";"yes";"no";"yes";3;4;4;2;2;5;3;"7";"8";7 +"MS";"F";18;"U";"GT3";"T";2;2;"services";"at_home";"reputation";"father";2;2;0;"no";"no";"no";"yes";"no";"yes";"yes";"no";4;3;5;1;1;1;2;"12";"13";14 +"MS";"F";17;"U";"GT3";"T";3;3;"services";"services";"course";"mother";2;1;0;"no";"yes";"no";"no";"yes";"yes";"yes";"no";4;4;3;1;1;4;0;"11";"12";13 +"MS";"F";18;"U";"LE3";"A";1;2;"at_home";"other";"reputation";"mother";2;2;0;"no";"no";"no";"no";"yes";"yes";"yes";"no";4;4;3;1;2;4;0;"12";"13";14 +"MS";"F";18;"U";"GT3";"T";4;4;"teacher";"teacher";"reputation";"mother";2;2;0;"no";"no";"no";"yes";"no";"yes";"yes";"no";4;3;5;1;2;1;0;"18";"18";18 +"MS";"M";18;"U";"LE3";"T";4;4;"services";"other";"reputation";"mother";1;1;0;"no";"yes";"no";"yes";"yes";"yes";"yes";"no";5;4;5;1;1;5;3;"17";"17";17 +"MS";"F";17;"U";"GT3";"T";4;2;"other";"other";"course";"mother";2;2;0;"no";"yes";"no";"no";"yes";"yes";"yes";"no";4;3;3;1;2;4;0;"17";"18";18 
+"MS";"F";18;"R";"GT3";"T";2;2;"at_home";"other";"course";"mother";3;2;1;"no";"no";"no";"yes";"yes";"yes";"no";"yes";4;3;3;1;1;4;0;"9";"0";0 +"MS";"M";18;"U";"LE3";"T";1;2;"at_home";"services";"home";"mother";2;1;0;"no";"yes";"no";"no";"no";"yes";"no";"no";4;1;4;5;5;1;8;"10";"11";11 +"MS";"M";18;"R";"GT3";"T";4;4;"at_home";"services";"other";"mother";3;1;0;"no";"yes";"yes";"yes";"yes";"yes";"yes";"yes";2;5;5;1;1;1;5;"12";"13";14 +"MS";"M";17;"R";"GT3";"T";1;1;"other";"services";"other";"father";3;1;0;"no";"no";"no";"no";"no";"no";"no";"no";4;2;3;3;4;4;4;"12";"13";14 +"MS";"F";18;"U";"GT3";"T";2;2;"other";"other";"course";"mother";2;2;0;"no";"yes";"no";"no";"no";"yes";"yes";"yes";1;3;1;1;1;2;4;"8";"8";10 +"MS";"F";18;"U";"LE3";"T";2;2;"services";"services";"course";"father";2;3;0;"no";"no";"no";"no";"yes";"yes";"yes";"yes";5;4;5;1;4;3;0;"11";"12";13 +"MS";"F";18;"R";"LE3";"A";4;2;"teacher";"other";"reputation";"mother";1;2;0;"no";"no";"no";"yes";"yes";"yes";"yes";"yes";5;3;1;1;1;5;0;"5";"0";0 +"MS";"F";18;"U";"GT3";"T";1;1;"at_home";"services";"course";"mother";3;2;1;"no";"no";"no";"no";"yes";"no";"no";"no";4;4;2;1;2;2;2;"9";"10";10 +"MS";"F";19;"U";"GT3";"T";1;1;"at_home";"services";"other";"father";2;1;1;"no";"no";"no";"no";"yes";"no";"no";"no";5;5;5;2;3;2;0;"5";"0";0 +"MS";"F";17;"U";"GT3";"T";4;2;"teacher";"other";"course";"father";2;4;0;"no";"no";"no";"no";"yes";"yes";"yes";"yes";4;2;3;3;1;5;0;"18";"18";18 +"MS";"F";17;"R";"LE3";"A";2;1;"services";"other";"reputation";"mother";2;2;0;"no";"no";"no";"yes";"yes";"yes";"yes";"yes";5;3;3;1;2;2;5;"11";"11";12 +"MS";"F";18;"U";"LE3";"A";1;1;"at_home";"services";"course";"mother";1;2;0;"no";"no";"no";"no";"yes";"yes";"no";"yes";5;2;3;1;2;3;2;"8";"10";11 +"MS";"F";18;"U";"GT3";"T";1;2;"at_home";"at_home";"course";"father";2;2;0;"no";"yes";"no";"no";"yes";"no";"no";"no";4;1;1;1;1;4;0;"11";"11";12 
+"MS";"F";19;"R";"GT3";"A";1;1;"at_home";"at_home";"course";"other";2;2;3;"no";"yes";"no";"yes";"yes";"no";"no";"yes";3;5;4;1;4;1;0;"8";"0";0 +"MS";"F";18;"R";"GT3";"T";2;2;"services";"other";"home";"mother";2;3;0;"no";"no";"no";"no";"yes";"yes";"yes";"yes";4;2;1;1;1;4;5;"14";"14";15 +"MS";"M";17;"R";"GT3";"T";4;3;"services";"other";"home";"mother";2;2;1;"no";"yes";"yes";"yes";"no";"yes";"yes";"yes";4;5;5;1;3;2;4;"10";"11";11 +"MS";"F";18;"U";"GT3";"T";3;3;"services";"services";"course";"father";1;2;0;"no";"yes";"no";"no";"yes";"yes";"no";"yes";5;3;4;1;1;5;0;"10";"10";10 +"MS";"F";17;"R";"GT3";"T";4;4;"teacher";"services";"other";"father";2;2;0;"no";"yes";"yes";"yes";"yes";"yes";"yes";"no";4;3;3;1;2;5;2;"12";"12";12 +"MS";"F";17;"U";"LE3";"A";3;2;"services";"other";"reputation";"mother";2;2;0;"no";"no";"no";"no";"yes";"yes";"no";"yes";1;2;3;1;2;5;0;"15";"14";15 +"MS";"M";18;"U";"LE3";"T";1;1;"other";"services";"home";"father";2;1;0;"no";"no";"no";"no";"no";"yes";"yes";"yes";3;3;2;1;2;3;2;"14";"13";14 +"MS";"F";18;"U";"LE3";"T";1;1;"at_home";"services";"course";"father";2;3;0;"no";"no";"no";"no";"yes";"yes";"yes";"no";5;3;2;1;1;4;0;"19";"17";18 +"MS";"F";18;"R";"LE3";"A";1;2;"at_home";"other";"course";"mother";3;2;0;"no";"no";"no";"no";"yes";"yes";"no";"yes";4;3;4;1;4;5;0;"16";"15";15 +"MS";"F";18;"U";"GT3";"T";3;3;"services";"services";"other";"mother";2;2;0;"no";"yes";"no";"no";"yes";"yes";"yes";"yes";4;3;2;1;3;3;6;"13";"12";13 +"MS";"F";17;"U";"LE3";"T";4;4;"at_home";"at_home";"course";"mother";1;2;0;"no";"yes";"no";"yes";"yes";"yes";"yes";"yes";2;3;4;1;1;1;4;"15";"14";15 +"MS";"F";17;"R";"GT3";"T";1;2;"other";"services";"course";"father";2;2;0;"no";"no";"no";"no";"no";"yes";"no";"no";3;2;2;1;2;3;0;"13";"13";13 +"MS";"M";18;"R";"GT3";"T";1;3;"at_home";"other";"course";"mother";2;2;0;"no";"yes";"yes";"no";"yes";"yes";"no";"no";3;3;4;2;4;3;0;"8";"10";9 
+"MS";"M";18;"U";"LE3";"T";4;4;"teacher";"services";"other";"mother";2;3;0;"no";"no";"no";"no";"yes";"yes";"yes";"yes";4;2;2;2;2;5;0;"15";"16";16 +"MS";"F";17;"R";"GT3";"T";1;1;"other";"services";"reputation";"mother";3;1;1;"no";"yes";"no";"no";"yes";"yes";"yes";"yes";5;2;1;1;2;1;0;"8";"8";9 +"MS";"F";18;"U";"GT3";"T";2;3;"at_home";"services";"course";"father";2;1;0;"no";"yes";"no";"no";"yes";"yes";"yes";"yes";5;2;3;1;2;4;0;"10";"10";10 +"MS";"F";18;"R";"GT3";"T";4;4;"other";"teacher";"other";"father";3;2;0;"no";"yes";"no";"no";"no";"yes";"yes";"yes";3;2;2;4;2;5;0;"7";"5";0 +"MS";"M";18;"R";"LE3";"T";1;2;"at_home";"services";"other";"father";3;1;0;"no";"yes";"no";"yes";"yes";"no";"yes";"yes";4;3;3;2;3;3;3;"9";"10";10 +"MS";"F";17;"U";"GT3";"T";2;2;"other";"at_home";"home";"mother";1;3;0;"no";"no";"no";"yes";"yes";"yes";"no";"yes";3;4;3;1;1;3;8;"10";"11";12 +"MS";"F";17;"R";"GT3";"T";1;2;"other";"other";"course";"mother";1;1;0;"no";"no";"no";"yes";"yes";"yes";"yes";"no";3;5;5;1;3;1;4;"7";"8";9 +"MS";"F";18;"R";"LE3";"T";4;4;"other";"other";"reputation";"mother";2;3;0;"no";"no";"no";"no";"yes";"yes";"yes";"no";5;4;4;1;1;1;0;"15";"17";17 +"MS";"F";18;"R";"GT3";"T";1;1;"other";"other";"home";"mother";4;3;0;"no";"no";"no";"no";"yes";"yes";"yes";"no";4;3;2;1;2;4;4;"10";"11";12 +"MS";"F";19;"R";"GT3";"T";1;1;"at_home";"other";"course";"other";2;2;1;"no";"yes";"no";"no";"yes";"yes";"yes";"yes";4;3;3;1;1;3;4;"7";"8";9 +"MS";"F";18;"R";"LE3";"T";4;4;"teacher";"services";"course";"mother";1;2;0;"no";"no";"no";"yes";"yes";"yes";"yes";"no";5;4;3;3;4;2;1;"13";"14";14 +"MS";"F";18;"U";"GT3";"T";3;3;"other";"other";"home";"mother";1;2;0;"no";"no";"no";"no";"yes";"yes";"yes";"yes";4;1;3;1;2;1;1;"16";"16";16 +"MS";"F";17;"R";"GT3";"T";3;1;"at_home";"other";"reputation";"mother";1;2;0;"no";"yes";"no";"yes";"no";"yes";"yes";"no";4;5;4;2;3;1;10;"8";"9";9 
+"MS";"M";18;"U";"GT3";"T";4;4;"teacher";"teacher";"home";"father";1;2;0;"no";"no";"no";"yes";"no";"yes";"yes";"no";3;2;4;1;4;2;4;"17";"18";19 +"MS";"M";18;"R";"GT3";"T";2;1;"other";"other";"other";"mother";2;1;0;"no";"no";"no";"yes";"no";"yes";"yes";"yes";4;4;3;1;3;5;0;"7";"7";0 +"MS";"M";17;"U";"GT3";"T";2;3;"other";"services";"home";"father";2;2;0;"no";"no";"no";"yes";"yes";"yes";"yes";"no";4;4;3;1;1;3;4;"14";"15";16 +"MS";"M";19;"R";"GT3";"T";1;1;"other";"services";"other";"mother";2;1;1;"no";"no";"no";"no";"yes";"yes";"no";"no";4;3;2;1;3;5;0;"5";"8";0 +"MS";"M";18;"R";"GT3";"T";4;2;"other";"other";"home";"father";2;1;1;"no";"no";"yes";"no";"yes";"yes";"no";"no";5;4;3;4;3;3;0;"7";"7";0 +"MS";"F";18;"R";"GT3";"T";2;2;"at_home";"other";"other";"mother";2;3;0;"no";"no";"no";"no";"yes";"yes";"no";"no";5;3;3;1;3;4;0;"14";"17";15 +"MS";"F";17;"U";"GT3";"T";4;3;"teacher";"other";"other";"mother";2;2;0;"no";"no";"no";"no";"yes";"yes";"yes";"no";5;5;4;1;1;1;0;"6";"9";11 +"MS";"F";18;"R";"GT3";"T";4;4;"teacher";"at_home";"reputation";"mother";3;1;0;"no";"yes";"no";"yes";"yes";"yes";"yes";"yes";4;4;3;2;2;5;4;"7";"9";10 +"MS";"F";19;"R";"GT3";"T";2;3;"services";"other";"course";"mother";1;3;1;"no";"no";"no";"yes";"no";"yes";"yes";"no";5;4;2;1;2;5;4;"10";"11";10 +"MS";"F";18;"U";"LE3";"T";3;1;"teacher";"services";"course";"mother";1;2;0;"no";"yes";"no";"no";"yes";"yes";"yes";"no";4;3;4;1;1;1;4;"15";"15";16 +"MS";"F";18;"U";"GT3";"T";1;1;"other";"other";"course";"mother";2;2;0;"no";"no";"no";"yes";"yes";"yes";"no";"no";1;1;1;1;1;5;6;"11";"12";9 +"MS";"M";17;"U";"LE3";"T";3;1;"services";"services";"course";"mother";2;1;0;"no";"no";"no";"no";"no";"yes";"yes";"no";2;4;5;3;4;2;6;"10";"10";10 +"MS";"M";18;"R";"LE3";"T";3;2;"services";"other";"course";"mother";3;1;0;"no";"no";"no";"no";"no";"yes";"yes";"no";4;4;1;3;4;5;4;"10";"11";11