diff --git a/vijieaswar/intror/Challenges.R b/vijieaswar/intror/Challenges.R index 1a82ceb..70fce6f 100644 --- a/vijieaswar/intror/Challenges.R +++ b/vijieaswar/intror/Challenges.R @@ -75,3 +75,8 @@ lapply(df, range, na.rm=T) sapply(df, range, na.rm=T) #the sapply displays the range min and max value in 2 rows +#mapply() needs at least a FUN argument, so calling it empty is an error +?mapply #(not covered in class) + +Question8 <- tapply(df$hp, df$gear, max, na.rm=TRUE) +Question8 diff --git a/vijieaswar/intror/Practice.R b/vijieaswar/intror/Practice.R new file mode 100644 index 0000000..d79561d --- /dev/null +++ b/vijieaswar/intror/Practice.R @@ -0,0 +1,66 @@ + +cat <- "cat" +alpha <- 1 +int <- 3L +#for integer, you have to have the L at the end, else it will be treated as a numeric + +#vectors are a sequence of objects that all have the same class. R will convert some of them if you have a word along with other numerics + +#list is a special type of vector that can have objects of different classes. + +#matrices are vectors with a dimension attribute + +x <- matrix(1:6, nrow=2, ncol=3) + +dim(x) +attributes(x) + +x <- 1:10 +dim(x) <- c(2,5) +# we added a dimension to create a matrix +# the above does the same as the command below +y <- matrix(1:10, nrow=2, ncol=5) +y +#you can also create a matrix with cbind + +x <- 1:3 +y <- 10:12 +#binding by column +x1 <- cbind(x,y) +x1 +#binding by row +x2 <- rbind(x,y) +x2 + +#matrices all have to be the same class- cant mix char, numeric. 
dataframe can have mixed classes +#Factors is another class- can label + +#unclass(x) is more descriptive- + +num <- c(1,1,2,2,3) +fact <- factor(num, levels=c(1,2,3), labels=c("yes","no","maybe")) +#database has integers and you are giving it an attribute by giving it labels and saying what it means + + +#dataframe: each column has to be the same type of data + +(x <- data.frame(foo=1:4,bar=c(TRUE,TRUE,FALSE,FALSE))) + +#attributes are like metadata +#adding information to your data without adding more values in the data + + #dataframes have row.names + + #you could give specific attributes such as names etc + +x <-1:3 +names(x) +#there wont be any names + +names(x) <- c("alpha", "beta","gamma") +#the above does not give a data frame +names(x) +str(x) +#it is not a factor but it is assigning names- it is making a label +#the names function is unique to each element (person), but factor labels will replace any value equal to that level with that character label + diff --git a/vijieaswar/plotting/Classwork.R b/vijieaswar/plotting/Classwork.R index e312a93..d6ed2b3 100644 --- a/vijieaswar/plotting/Classwork.R +++ b/vijieaswar/plotting/Classwork.R @@ -102,7 +102,6 @@ library(reshape2) #cor returns a matric, not a df #creating a heatmap - mtcars %>% select(c(1,3,4,5,6,7)) %>% cor() %>% diff --git a/vijieaswar/rmarkdown/Challenge.Rmd b/vijieaswar/rmarkdown/Challenge.Rmd new file mode 100644 index 0000000..ea206ca --- /dev/null +++ b/vijieaswar/rmarkdown/Challenge.Rmd @@ -0,0 +1,75 @@ +--- +title: "Challenge" +author: "Viji" +date: "November 23, 2015" +output: + html_document: + fig_caption: true + number_sections: true + toc: true +--- + + + +```{r, echo = FALSE} +library(dplyr) +library(tidyr) +library(pander) +library(captioner) +library(ggplot2) +library(rmarkdown) + +knitr::opts_chunk$set(echo = FALSE, warning = FALSE) +# setwd() removed: knitr already evaluates chunks in the folder that holds this .Rmd +``` + +```{r} +ds <- as.data.frame(state.x77) %>% + add_rownames() %>% + tbl_df() + +str(state.x77) + +names(ds) + + +``` + +# Brief Description 
+The dataset consists of demographic information from `r dim(ds)[1]` states for the following `r dim(ds[-1])[2]` variables: `r names(ds[-1])`. The mean population across all states is `r round( mean(ds$Population),2)` and the standard deviation is `r round(sd(ds$Population),2)`. + +```{r} + +figNums <- captioner(prefix = 'Figure') +state_cap <- figNums('stateLitIncome', 'Income varies across illiteracy') + +``` + +# Plots +```{r, echo=FALSE, fig.cap=state_cap, dpi=150} + ds %>% + ggplot(aes(x=Illiteracy, y=Income))+ + geom_point()+ + labs(title="Income across illiteracy", x="Illiteracy", y="Income") # labs() takes title=; plot.title is a theme() element name + + +``` + +# Tables +```{r} + +tabNums <- captioner(prefix = 'Table') + +ds %>% + gather(Variable, Value, -rowname) %>% + group_by(Variable) %>% + summarise(Means = mean(Value) %>% round(2), + SD = sd(Value) %>% round(2)) %>% + pander(caption = tabNums('Demodata', 'Demographic data of 50 states')) +``` + +```{r, eval=FALSE} +render('Challenge.Rmd', c('word_document', 'html_document')) +``` + + diff --git a/vijieaswar/rmarkdown/Challenge.docx b/vijieaswar/rmarkdown/Challenge.docx new file mode 100644 index 0000000..6432f8f Binary files /dev/null and b/vijieaswar/rmarkdown/Challenge.docx differ diff --git a/vijieaswar/rmarkdown/Challenge.html b/vijieaswar/rmarkdown/Challenge.html new file mode 100644 index 0000000..e16205a --- /dev/null +++ b/vijieaswar/rmarkdown/Challenge.html @@ -0,0 +1,143 @@ + + + + + + + + + + + + + + +Challenge + + + + + + + + + + + + + + + + + + + + + 
+ + + + +
+ +
+ +
##  num [1:50, 1:8] 3615 365 2212 2110 21198 ...
+##  - attr(*, "dimnames")=List of 2
+##   ..$ : chr [1:50] "Alabama" "Alaska" "Arizona" "Arkansas" ...
+##   ..$ : chr [1:8] "Population" "Income" "Illiteracy" "Life Exp" ...
+
## [1] "rowname"    "Population" "Income"     "Illiteracy" "Life Exp"  
+## [6] "Murder"     "HS Grad"    "Frost"      "Area"
+
+

1 Brief Description

+

The dataset consists of demographic information from 50 states for the following 8 variables: Population, Income, Illiteracy, Life Exp, Murder, HS Grad, Frost, Area. The mean population across all states is 4246.42 and the standard deviation is 4464.49

+
+
+

2 Plots

+
+Figure 1: Income varies across illiteracy

Figure 1: Income varies across illiteracy

+
+
+
+

3 Tables

+
## 
+## ------------------------
+##  Variable   Means   SD  
+## ---------- ------- -----
+## Population  4246   4464 
+## 
+##   Income    4436   614.5
+## 
+## Illiteracy  1.17   0.61 
+## 
+##  Life Exp   70.88  1.34 
+## 
+##   Murder    7.38   3.69 
+## 
+##  HS Grad    53.11  8.08 
+## 
+##   Frost     104.5  51.98
+## 
+##    Area     70736  85327
+## ------------------------
+## 
+## Table: Table  1: Demographic data of 50 states
+
+ + +
+ + + + + + + + diff --git a/vijieaswar/rmarkdown/Practice.Rmd b/vijieaswar/rmarkdown/Practice.Rmd new file mode 100644 index 0000000..c71cebe --- /dev/null +++ b/vijieaswar/rmarkdown/Practice.Rmd @@ -0,0 +1,134 @@ +--- +title: "Practice" +author: "Viji" +date: "November 23, 2015" +output: + html_document: + fig_caption: true + toc: true +--- +```{r} +#in the YAML, add the following +#bibliography: give the location of the bib file +#for a particular style +#csl: give the location of the citation style language (csl) file. can be found in github + +``` + +This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see <http://rmarkdown.rstudio.com>. + +When you click the **Knit** button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this: + +# Intro + +## Citing a study +```{r} + + + +#how to enter here from papers. need to export the bib key +#@some bibkey +#[] will put round brackets around the citation. +``` + +```{r} + + +#* makes the text in italics +#** on either side will bold words +#``` is a code block + +#` having things in line +# > indicates that it is going to be a quote +# @ with bib key will extract refs + +#YAML header is for R to know what to do. 
YAML has to be closed in --- + +#pander, kable creates tables +#ggplot qplot creates figure + +#control+shift+K will knit html +``` + +```{r, echo=FALSE} +library(pander) +library(captioner) + +#install.packages('captioner', type = 'source') if you get an error message while installing + +knitr::opts_chunk$set(echo = FALSE, warning = FALSE) +#Can use the line above to apply these options to every chunk + +#table option 1 +knitr::kable(summary(cars)) + + +#table option 2 +tabNums <- captioner(prefix = 'Table') +figNums <- captioner(prefix = 'Figure') +cars_cap <- figNums('carsFig', 'This is a caption for the figure') + + +pander(lm(cars), caption = tabNums('tab1', 'This is the caption for table 1')) #first column as Y, and second column as X. if you have many columns, 2nd 3rd columns are X2, X3 etc. + + +``` + +Reference to **`r tabNums('tab1', display = 'cite')`**. + +You can also embed plots, for example: + +```{r, echo=FALSE, fig.cap=cars_cap} +#have to write fig.cap = cars_cap in the chunk header because knitr is finicky + +plot(cars) +``` + +```{r} +#Note that the `echo = FALSE` parameter was added to the code chunk to prevent printing of the R code that generated the plot. + +#`r will take the following stuff within ` as a command. 
if you don't write r after ` it will take it as text +#compare `r mean(cars$dist)` with `mean(cars$dist)` +``` + +# Tables +```{r, message = FALSE} +library(dplyr) +library(tidyr) + +cars %>% + gather(Measure, Value) %>% + group_by(Measure) %>% + summarise(means =mean(Value) %>% round(2)) %>% + pander() + +``` + +# Figures +## Scatterplot + +```{r, message = FALSE, fig.height=10, fig.width=5, dpi=150, dev='png'} +library(ggplot2) +qplot(dist,speed, data = cars, geom= 'point') + +``` + +This is a list + +- There are `r nrow(cars)` rows in `cars` +- There are `r dim(cars)[2]` variables in `cars` + +you would add ` ` around the cars to differentiate it from regular text + +This is a quote: + +> to be or not to be + +```{r} +#command+option+i is a new R chunk +#command+alt+c runs the R chunk + + +``` + +# References \ No newline at end of file diff --git a/vijieaswar/rmarkdown/Practice.docx b/vijieaswar/rmarkdown/Practice.docx new file mode 100644 index 0000000..37f0261 Binary files /dev/null and b/vijieaswar/rmarkdown/Practice.docx differ diff --git a/vijieaswar/rmarkdown/Practice.html b/vijieaswar/rmarkdown/Practice.html new file mode 100644 index 0000000..598a164 --- /dev/null +++ b/vijieaswar/rmarkdown/Practice.html @@ -0,0 +1,274 @@ + + + + + + + + + + + + + + +Practice + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + +
#in the YAML, add the following
+#bibliography: give the location of the bib file
+#for a particular style
+#csl: give the location of the bib file citation style language. can be found in github
+

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

+

When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

+
+

Intro

+
+

Citing a study

+
#how to enter here from papers. need to export the bib key
+#@some bibkey
+#[] will put round brackets around the citation. 
+
#* makes the text in italics
+#** on either side will bold words
+#``` is a code block
+
+#` having things in line
+# > indicates that it is going to be a quote 
+# @ with bib key will extract refs
+
+#YAML header is for R to know what to do. YAML has to be closed in ---
+
+#pander, kable creates tables
+#ggplot qplot creates figure
+
+#control+shift+K will knit html
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
speeddist
Min. : 4.0Min. : 2.00
1st Qu.:12.01st Qu.: 26.00
Median :15.0Median : 36.00
Mean :15.4Mean : 42.98
3rd Qu.:19.03rd Qu.: 56.00
Max. :25.0Max. :120.00
+ + +++++++ + + + + + + + + + + + + + + + + + + + + + + + + + +
Table 1: This is the caption for table 1
 EstimateStd. Errort valuePr(>|t|)
dist0.16560.017499.4641.49e-12
(Intercept)8.2840.87449.4741.441e-12
+

Reference to Table 1.

+

You can also embed plots, for example:

+
+Figure 1: This is a caption for the figure

Figure 1: This is a caption for the figure

+
+
+
+
+

Tables

+
## 
+## Attaching package: 'dplyr'
+## 
+## The following object is masked from 'package:stats':
+## 
+##     filter
+## 
+## The following objects are masked from 'package:base':
+## 
+##     intersect, setdiff, setequal, union
+ ++++ + + + + + + + + + + + + + + + + +
Measuremeans
speed15.4
dist42.98
+
+
+

Figures

+
+

Scatterplot

+
+ +
+

This is a list

+
    +
  • There is 2 rows in cars
  • +
  • There are 2 variables in cars
  • +
+

you would add ` ` around the cars to differentiate it from regular text

+

This is a quote:

+
+

to be or not to be

+
+
+
+
+

References

+
+ + +
+ + + + + + + + diff --git a/vijieaswar/wrangling/ChallengesVE.R b/vijieaswar/wrangling/ChallengesVE.R new file mode 100644 index 0000000..a1aa7a0 --- /dev/null +++ b/vijieaswar/wrangling/ChallengesVE.R @@ -0,0 +1,3 @@ + +#Date: 9th Nov 2015 + diff --git a/vijieaswar/wrangling/Practice.R b/vijieaswar/wrangling/Practice.R new file mode 100644 index 0000000..3295822 --- /dev/null +++ b/vijieaswar/wrangling/Practice.R @@ -0,0 +1,153 @@ + +#install.packages('dplyr') #run once in the console, not on every run of the script +#install.packages('tidyr') #run once in the console, not on every run of the script + +library(dplyr) +library(tidyr) + +head(swiss) +tail(swiss) + +summary(swiss) +str(swiss) +names(swiss) + +#command+shift+m gives %>%. the pipe builds on a previous function +head(swiss) +swiss %>% head +swiss %>% head() # it will use the first available place (dont want this for regression) +swiss %>% head(.) # the . specifies where the data has to go to + +swiss %>% lm(Education ~ Infant.Mortality, data=.) %>% + summary %>% + coef + + # this avoids the need for putting things in brackets + # this will place the swiss data in the right place + + +#using dplyr + + +ds <- swiss %>% + add_rownames() %>% + tbl_df() + +#rowname was only an attribute +#adding table dataframe property to it- the printing is prettier +ds +swiss + + +ds %>% + select(Education, Catholic) + + #if you end up wanting to select by the same name or partial name + +ds %>% + select(contains('Edu'), + matches('Cath'), + starts_with('F') + ) + +#matches('^F'), +#matches('n$'), +#matches('.'), +#matches('*'), +#matches('C.*l') + +#in reg exp, . is a wildcard for exactly one character; * repeats whatever precedes it zero or more times, so .* matches text of any length +#contains and starts with can be a subset of matches. Matches is very powerful. matches uses regular expression. +# learn regular expression + #regular expression: ^ means starts with, $ means ends with so matches('^F') can substitute for starts_with('F'). reg exp is useful when you want to do find and replace. 
+ +#filter +dim(ds) + ds %>% + filter(Catholic < 50 | Fertility > 40) %>% + str() + + dplyr::add_rownames(swiss) + #This is a way of calling a function from a package (pkg::fun) without attaching it with library(). + + ds %>% + filter(Education == 10 & + Infant.Mortality >5) + + ds %>% + filter(rowname != 'Rolle') %>% + str() + + #mutate + ds %>% + mutate(testing = 'yes', + Educated =ifelse( + Education >20, 'Yes', 'No' + )) %>% + select(testing, Educated) + + #it is better to do this instead of assigning into objects each time so that you dont overpopulate and it is + #easy when you want to change something. and then you assign the final one if you needed + + + #tidyr + ds %>% + select(-rowname) %>% + gather(Variable, Value) %>% #variable is the factor level/variable and the value is the actual value. you could keep this line the same- call it the same + group_by(Variable) %>% + summarise(mean = mean(Value), + sd = sd(Value), + median = median(Value)) + + #if you want only a few variables, you filter first + #the group_by adds an attribute, so anything that occurs after group_by will be done by the grouping + + ds %>% + select(-rowname) %>% + mutate(Fertile = ifelse(Fertility > 70, 'Yes', 'No')) %>% + gather(Variable, Value, -Fertile) %>% #variable is the factor level/variable and the value is the actual value. you could keep this line the same- call it the same + group_by(Fertile, Variable) %>% + summarise(meanSD = paste0(mean(Value) %>% round(2), + ' (', + sd(Value) %>% round(2), + ')')) %>% + spread(Fertile, meanSD) + + #paste returns a character + #spread changes the output table to a wide format. otherwise Fertile will be in long format + + #arrange + #sorting by + ds %>% + arrange(Education, Agriculture) %>% + select(Education, Agriculture) + + + ds %>% + arrange(Education, desc(Agriculture)) %>% + select(Education, Agriculture) + + + ds %>% + select(County = rowname) %>% + mutate(County =gsub('e$', "", County) %>% + gsub('^C','HAHAHA', .)) + #gsub means global substitute. the . says thats where the output needs to go. 
so R has to know where to put it first + # thats why it is mentioned in the first row. An alternative is below. You are already telling R which dataset + #you are working with. + + ds %>% + select(County = rowname) %>% + mutate(County = County %>% + gsub('e$', "", .) %>% + gsub('^C','HAHAHA', .)) + + + + + + + + + + \ No newline at end of file