diff --git a/.Rprofile b/.Rprofile old mode 100644 new mode 100755 diff --git a/001_intro.Rmd b/001_intro.Rmd old mode 100644 new mode 100755 diff --git a/002_readings.Rmd b/002_readings.Rmd old mode 100644 new mode 100755 diff --git a/010_setup.Rmd b/010_setup.Rmd old mode 100644 new mode 100755 diff --git a/011_r.Rmd b/011_r.Rmd old mode 100644 new mode 100755 diff --git a/012_rstudio.Rmd b/012_rstudio.Rmd old mode 100644 new mode 100755 diff --git a/013_practice.Rmd b/013_practice.Rmd old mode 100644 new mode 100755 diff --git a/020_objects.Rmd b/020_objects.Rmd old mode 100644 new mode 100755 index 41c1f64..702a68e --- a/020_objects.Rmd +++ b/020_objects.Rmd @@ -115,7 +115,7 @@ paste0("Hello World! Today is ", date(), ".") The syntax of each command requires a bit of memorization, and lots of practice. Nobody types functional code in R without going through cycles of trial-and-error with it. Programming requires some discipline, some logic and a lot of patience. -A final thing: it is good practice to write code in a certain style. For example, R is insensitive to extra blank space, but it is good practice to use blank space only when it helps to align elements. The [Google R Style Guide](https://google-styleguide.googlecode.com/svn/trunk/google-r-style.html) is a good starting point to learn a few common conventions. +A final thing: it is good practice to write code in a certain style. For example, R is insensitive to extra blank space, but it is good practice to use blank space only when it helps to align elements. The [Google R Style Guide](https://google.github.io/styleguide/Rguide.xml) is a good starting point to learn a few common conventions. Let's now turn to manipulating simple things in R. diff --git a/021_vectors.Rmd b/021_vectors.Rmd old mode 100644 new mode 100755 diff --git a/022_variables.Rmd b/022_variables.Rmd old mode 100644 new mode 100755 diff --git a/023_practice.Rmd b/023_practice.Rmd old mode 100644 new mode 100755 diff --git a/030_functions.Rmd b/030_functions.Rmd old mode 100644 new mode 100755 diff --git a/031_control.Rmd b/031_control.Rmd old mode 100644 new mode 100755 diff --git a/032_iteration.Rmd b/032_iteration.Rmd old mode 100644 new mode 100755 diff --git a/033_practice.Rmd b/033_practice.Rmd old mode 100644 new mode 100755 diff --git a/040_data.Rmd b/040_data.Rmd old mode 100644 new mode 100755 diff --git a/041_dataio.Rmd b/041_dataio.Rmd old mode 100644 new mode 100755 diff --git a/042_reshaping.Rmd b/042_reshaping.Rmd old mode 100644 new mode 100755 diff --git a/043_practice.Rmd b/043_practice.Rmd old mode 100644 new mode 100755 diff --git a/080_models.Rmd b/080_models.Rmd old mode 100644 new mode 100755 diff --git a/081_correlation.Rmd b/081_correlation.Rmd old mode 100644 new mode 100755 diff --git a/082_ols.Rmd b/082_ols.Rmd old mode 100644 new mode 100755 diff --git a/083_practice.Rmd b/083_practice.Rmd old mode 100644 new mode 100755 diff --git a/090_ts.Rmd b/090_ts.Rmd old mode 100644 new mode 100755 diff --git a/091_lags.Rmd b/091_lags.Rmd old mode 100644 new mode 100755 diff --git a/092_smoothing.Rmd b/092_smoothing.Rmd old mode 100644 new mode 100755 diff --git a/093_practice.Rmd b/093_practice.Rmd old mode 100644 new mode 100755 diff --git a/100_maps.Rmd b/100_maps.Rmd old mode 100644 new mode 100755 diff --git a/101_geocoding.Rmd b/101_geocoding.Rmd old mode 100644 new mode 100755 diff --git a/102_choropleths.Rmd b/102_choropleths.Rmd old mode 100644 new mode 100755 diff --git a/110_networks.Rmd b/110_networks.Rmd old mode 100644 new mode 100755 diff --git a/111_influence.Rmd b/111_influence.Rmd old mode 100644 new mode 100755 diff --git a/112_networkds.Rmd b/112_networkds.Rmd old mode 100644 new mode 100755 diff --git a/120_data.Rmd b/120_data.Rmd old mode 100644 new mode 100755 diff --git a/121_sources.Rmd b/121_sources.Rmd old mode 100644 new mode 100755 diff --git a/122_figures.Rmd b/122_figures.Rmd old mode 100644 new mode 100755 diff --git a/R-2013-Lyon/README.md b/R-2013-Lyon/README.md old mode 100644 new mode 100755 diff --git a/R-2013-Lyon/abstract.pdf b/R-2013-Lyon/abstract.pdf old mode 100644 new mode 100755 diff --git a/R-2013-Lyon/extract.png b/R-2013-Lyon/extract.png old mode 100644 new mode 100755 diff --git a/R-2013-Lyon/slides.pdf b/R-2013-Lyon/slides.pdf old mode 100644 new mode 100755 diff --git a/README.md b/README.md old mode 100644 new mode 100755 diff --git a/code/011_r.R b/code/011_r.R old mode 100644 new mode 100755 diff --git a/code/012_rstudio.R b/code/012_rstudio.R old mode 100644 new mode 100755 diff --git a/code/013_practice.R b/code/013_practice.R old mode 100644 new mode 100755 diff --git a/code/020_objects.R b/code/020_objects.R old mode 100644 new mode 100755 diff --git a/code/021_vectors.R b/code/021_vectors.R old mode 100644 new mode 100755 diff --git a/code/022_variables.R b/code/022_variables.R old mode 100644 new mode 100755 diff --git a/code/023_practice.R b/code/023_practice.R old mode 100644 new mode 100755 diff --git a/code/030_functions.R b/code/030_functions.R old mode 100644 new mode 100755 diff --git a/code/031_control.R b/code/031_control.R old mode 100644 new mode 100755 diff --git a/code/032_iteration.R b/code/032_iteration.R old mode 100644 new mode 100755 diff --git a/code/033_practice.R b/code/033_practice.R old mode 100644 new mode 100755 diff --git a/code/040_data.R b/code/040_data.R old mode 100644 new mode 100755 diff --git a/code/041_dataio.R b/code/041_dataio.R old mode 100644 new mode 100755 diff --git a/code/042_reshaping.R b/code/042_reshaping.R old mode 100644 new mode 100755 diff --git a/code/043_practice.R b/code/043_practice.R old mode 100644 new mode 100755 diff --git a/code/080_models.R b/code/080_models.R old mode 100644 new mode 100755 diff --git a/code/081_correlation.R b/code/081_correlation.R old mode 100644 new mode 100755 diff --git a/code/082_ols.R b/code/082_ols.R old mode 100644 new mode 100755 diff --git a/code/083_practice.R b/code/083_practice.R old mode 100644 new mode 100755 diff --git a/code/090_ts.R b/code/090_ts.R old mode 100644 new mode 100755 diff --git a/code/091_lags.R b/code/091_lags.R old mode 100644 new mode 100755 diff --git a/code/092_smoothing.R b/code/092_smoothing.R old mode 100644 new mode 100755 diff --git a/code/093_practice.R b/code/093_practice.R old mode 100644 new mode 100755 diff --git a/code/100_maps.R b/code/100_maps.R old mode 100644 new mode 100755 diff --git a/code/101_geocoding.R b/code/101_geocoding.R deleted file mode 100644 index 6cccce7..0000000 --- a/code/101_geocoding.R +++ /dev/null @@ -1,54 +0,0 @@ - - -packages <- c("downloader", "ggmap", "plyr") -packages <- lapply(packages, FUN = function(x) { - if(!require(x, character.only = TRUE)) { - install.packages(x) - library(x, character.only = TRUE) - } -}) - - - -# Get the data. -# url = "http://aiddata.org/weceem_uploads/_ROOT/File/geocoding/AllWorldBank_IBRDIDA.zip" -zip = "data/wb.projects.zip" -# if(!file.exists(zip)) download(url, zip, mode = "wb") -# Read from the ZIP file. -wb = read.csv(unz(zip, "AllWorldBank_IBRDIDA.csv")) -# Subset to Africa. -wb = subset(wb, Region == "AFRICA") -# Inspect variables. -v = c("Project.ID", "Latitude", "Longitude", "Country", "Total.Amt") -head(wb)[v] - - - -# Get OpenStreetMap data. -map = get_map(location = 'Africa', zoom = 4) -# Plot World Bank projects. -ggmap(map) + - geom_point(data = wb, - aes(x = Longitude, y = Latitude, color = Country, size = Total.Amt), - alpha = .3) + - scale_size_area(max_size = 8) + - labs(y = NULL, x = NULL) + - theme(axis.text = element_blank(), - axis.ticks = element_blank(), - legend.position = "none") - - - -# Get OpenStreetMap data. -ton = get_map(location = 'Africa', zoom = 4, source = "stamen", maptype = "watercolor") -# Plot World Bank projects. -ggmap(ton) + - geom_point(data = wb, - aes(x = Longitude, y = Latitude, color = Country, size = Total.Amt)) + - scale_size_area(max_size = 8) + - labs(y = NULL, x = NULL) + - theme(axis.text = element_blank(), - axis.ticks = element_blank(), - legend.position = "none") - - diff --git a/code/102_choropleths.R b/code/102_choropleths.R old mode 100644 new mode 100755 diff --git a/code/10_colonization.R b/code/10_colonization.R old mode 100644 new mode 100755 diff --git a/code/110_networks.R b/code/110_networks.R old mode 100644 new mode 100755 diff --git a/code/111_influence.R b/code/111_influence.R old mode 100644 new mode 100755 diff --git a/code/112_networkds.R b/code/112_networkds.R old mode 100644 new mode 100755 diff --git a/code/11_austerity.R b/code/11_austerity.R old mode 100644 new mode 100755 diff --git a/code/1_hello.R b/code/1_hello.R old mode 100644 new mode 100755 diff --git a/code/2_bmi.R b/code/2_bmi.R deleted file mode 100644 index 07b76a0..0000000 --- a/code/2_bmi.R +++ /dev/null @@ -1,177 +0,0 @@ - -## IDA Exercise 2 -## -------------- - - -# Welcome back! This script is like the previous one. Run it to train yourself -# using R code and functions. Read the course pages first for context and help. - -# If you have not yet done so, execute the practice code from last week first. -# It has more comments on doing basic things with R and RStudio. - - -# Functions -# --------- - -# Here's Quételet's original Body Mass Index function. He did not code it in R. -# It requires weight in kilograms and height in meters to compute BMI properly. -bmi.quetelet <- function(weight, height, digits = 2) { - round(weight / (height^2), digits) -} - -# (Remember to select everything from { to } to run the code successfully.) - -# Now check if it works. -bmi.quetelet(65, 1.8) -bmi.quetelet(35, 1.8) -bmi.quetelet(95, 1.8) - -# Let's assume that my height is fixed at 1.80 m and that my weight varies. -# My BMI will equal to (weight) / 1.8^2, so I define the function y = x / 3.24. - -# That's a simple linear function. -bmi.fixed.height <- function(x) { - x / 3.24 -} - -# Let's plot it. -require(ggplot2) - -# Plotting over a weight range of 35 to 155. -qplot(c(35, 155), stat = "function", fun = bmi.fixed.height, geom = "line") - -# Let's save this for later use. -g <- qplot(c(35, 155), stat = "function", fun = bmi.fixed.height, geom = "line") - -# Now add some axis titles. -g <- g + labs(x = "Weight (fixed height = 1.8)", y = "BMI") - -# Now add a line to show when I become obese. -g <- g + geom_hline(yintercept = 30, linetype = "dashed") - -# Now add a rectangle to fill the surface where I am obese. -g <- g + geom_rect(ymin = 30, ymax = Inf, xmin = -Inf, xmax = Inf, fill = "red", alpha = .2) - -# Plot. -g - -# Now the opposite: fixed weight at 65 kg, height varies. -bmi.fixed.weight <- function(x) { - 65 / x^2 -} - -# Plotting over a range of 1.40 to 2.40. -h <- qplot(c(1.4, 2.4), stat = "function", fun = bmi.fixed.weight, geom = "line") - -# All options in one command. -h + labs(x = "Height (fixed weight = 65)", y = "BMI") + - geom_hline(yintercept = 30, linetype = "dashed", colour = "red") + - geom_rect(ymin = 30, ymax = Inf, xmin = -Inf, xmax = Inf, fill = "red", alpha = .2) - -# Now: - -# ... so you like math? -# ... you like logic too? -# ... let's be playful. - -# Create a BMI classification scale. -bmi.scale <- c(0, 18.5, 25, 30, 40) -# We need the names to go with it. -bmi.class <- c("Underweight", "Normal", "Overweight", "Obese", "Mordibly obese") - -# Check both elements have the same length. -length(bmi.scale) == length(bmi.class) - -# Remember what a modulo is? "How many times can you fit y into x"? -# R has a quick syntax for it. -4 %/% 3 -# ... because 4/3 = 1 plus something smaller than 1. -# The remainder is just one little modification away. -4 %% 3 -# You can fit 3 five times into 17... -17 %/% 3 -# ... and 17 - 3 * 5 = 2. -17 %% 3 - -# Now try to get the modulo of an imaginary BMI value that is underweight. -15 %/% bmi.scale -# That's correct: 15 / 0 = positive infinity. -15 / 0 - -# Now try to get the remainder of an imaginary BMI value that is overweight. -27 %% bmi.scale -# That's correct: the remainder of 27 / 0 is not a number (NaN). -27 %% 0 - -# Let's save the result of this last operation. -x <- 27 %% bmi.scale -# Check result. -x -# Identify the values where the remainder is NOT equal to the imaginary BMI. -x != 27 -# Save it to an object. -y <- (x != 27) -# Remove the FALSE values: keep only the vector elements that are not FALSE. -y <- y[y != FALSE] -# Check result. -y -# The length of the resulting object is the category to which the BMI belongs. -bmi.class[length(y)] - -# ... Notice how R fails gracefully by keeping the first NA element. -# ... But anyway, I digress. This whole stuff is not very efficient. -# ... All that work just to fit a number within a range! - -# Note the simple/lazy solution: -fake.bmi <- 32 -if(fake.bmi < 18.5) fake.class = "Underweight" -if(fake.bmi >= 18.5 & fake.bmi < 25) fake.class = "Normal" -if(fake.bmi >= 25 & fake.bmi < 30) fake.class = "Overweight" -if(fake.bmi >= 30 & fake.bmi < 40) fake.class = "Obese" -if(fake.bmi >= 40) fake.class = "Mordibly obese" - -# ... but this is too long to code, and we are too lazy. - -# Back to logic. -27 < bmi.scale -# Save it. -z <- (27 > bmi.scale) -# Check. -z -# Remove FALSE values. -z <- z[z != TRUE] -# Check. -z -# Length. -z <- length(z) -# Check. -z -# BMI classification. -bmi.class[z] -# This is almost as short as we would like. - -# By the power of R, we can code all of that into one line. -bmi.class[length(bmi.scale[27 > bmi.scale])] - -# Code that as a function. -bmi.quetelet <- function(weight, height, digits = 2) { - bmi <- weight / (height^2) - bmi.scale <- c(0, 18.5, 25, 30, 40) - bmi.class <- c("Underweight", "Normal", "Overweight", "Obese", "Mordibly obese") - class <- bmi.class[length(bmi.scale[bmi > bmi.scale])] - # Save results as a vector. Note how we round the BMI only at "print stage". - r <- c(round(bmi, digits), class) - # Return BMI and classification. - return(r) -} - -# Examples. -bmi.quetelet(35, 1.80) -bmi.quetelet(65, 1.80) -bmi.quetelet(95, 1.80) -bmi.quetelet(99, 1.80) -bmi.quetelet(99, 1.50) - - -## Peace and donuts. -## 2013-02-27 diff --git a/code/3_hhi.R b/code/3_hhi.R old mode 100644 new mode 100755 diff --git a/code/4_congress.R b/code/4_congress.R old mode 100644 new mode 100755 diff --git a/code/5_confidence.R b/code/5_confidence.R old mode 100644 new mode 100755 diff --git a/code/6_fertility.R b/code/6_fertility.R old mode 100644 new mode 100755 diff --git a/code/7_fertility.R b/code/7_fertility.R deleted file mode 100644 index 30d1bd3..0000000 --- a/code/7_fertility.R +++ /dev/null @@ -1,558 +0,0 @@ - -## IDA Session 7 -## ------------- - - -# Package loading -# --------------- - -# Load/install packages. -packages <- c("countrycode", "ggplot2", "Hmisc", "plyr", "scales") -packages <- lapply(packages, FUN = function(x) { - if (!require(x, character.only = TRUE)) { - install.packages(x) - library(x, character.only = TRUE) - } -}) - - -# DATA PREPARATION -# ---------------- - -# Let's first review some of the things that we now know how to do with data. - -# Load dataset. -qog_cs <- read.csv("data/qog_basic_cs.csv") - -# Create an extract of the QOG cross-sectional dataset. The list of variables is -# quasi-identical to the one used previously, with an extra variable for regions. -qog <- with(qog_cs, data.frame( - row.names = cname, # Country name (set as row name in the data extract) - ccode = ccodealp, # Country 3-letter code - region = ht_region, # Geographical region of the country - births = wdi_fr, # Fertility rate - gdpc = wdi_gdpc, # GDP per capita - gexp = wdi_ge, # Government expenditures as percentage of GDP - hexpc = wdi_hec, # Health care expenditure per capita - gini = uw_gini, # GINI coefficient of economic inequality - edu = bl_asyt25, # Average years of schooling - gris = wdi_gris, # Female to male ratio in schools - winpar = m_wominpar # Percentage of women in parliament -)) - - -# Geographic indicators -# --------------------- - -# Let's now see what kind of geographic descriptors we have in the dataset. -# The first one is a list of ISO-3 country codes, an international standard. -levels(qog$ccode) - -# This list has been supplemented, in the QOG dataset, by a geographic variable -# combining spatial proximity and expert decisions on regional democratization. -str(qog$region) - -# An issue here is that we imported the data without its labels, so the regions -# are uninformative. But with ISO-3 country codes, we can recreate UN regions. -table(countrycode(qog$ccode, "iso3c", "region")) - -# We save the UN continent instead to create groups with many countries each. -qog$region <- with(qog, countrycode(ccode, "iso3c", "continent")) - -# Check result. -qplot(qog$region, geom = "bar") - - -# Missing data -# ------------ - -# Some entries in the QOG dataset are not recognized as UN countries. -rownames(qog)[is.na(qog$region)] - -# Drop these cases, that have been counted in other country indicators. -qog <- subset(qog, !is.na(region)) - -# The analysis that we will run today also runs faster without missing data. -# Let's identify the rows with missing data. -qog$fulldata <- complete.cases(qog) - -# Check result. The complete.cases() function is a utility function that will -# return TRUE if there is absolutely no missing data on a row of the dataset. -table(qog$fulldata) - -# Identify where the missing data lies. There is a lot, so we will remember that -# when we look at plots and reading the results of statistical tests. -qplot(data = qog, x = region, fill = fulldata, geom = "bar") - -# Subset to full data rows. -qog <- na.omit(qog) - - -# Reordering factors -# ------------------ - -# In R, sorting the data is different from ordering levels in a variable. -# Let's see how to sort the data from highest to lowest fertility rates. -by_births <- order(qog$births, decreasing = TRUE) - -# Show the first three columns in that sorting order. -head(qog[by_births, 1:3]) - -# For ascending sort order, you would simply not specify the decreasing option. -head(qog[order(qog$births), 1:3]) - -# Countries for which fertility rates are missing are always sorted last. -tail(qog[order(qog$births), 1:3]) - -# But this is different from the order of a string variable like country codes. -# The country code variable holds text, coded as factors and ordered by levels. -str(qog$ccode) - -# An issue here is that R naturally orders factors by alphabetical order, which -# is not optimal for ordering categorical variables like countries in plots. - -# Here's an example where countries are alphabetically ordered. -qplot(data = qog[1:25, ], y = ccode, x = births, color = region, geom = "point") - -# Now, reorder country code factor levels by their respective fertility rates. -qog$ccode <- with(qog, reorder(ccode, births, mean)) - -# The example now shows countries ordered by fertility rates. -qplot(data = qog[1:25, ], y = ccode, x = births, color = region, geom = "point") - -# Let's now visualize the distribution of fertility rates in each region. -qplot(data = qog, x = births, fill = region, color = region, geom = "density") + - facet_grid(region ~ .) - -# The regions are shown in alphabetical order. Let's fix that. -qog$region <- with(qog, reorder(region, -births, mean)) - -# Rerun the plot to see regions ordered by average fertility rates. -qplot(data = qog, x = births, fill = region, color = region, geom = "density") + - facet_grid(region ~ .) - -# These manipulations show how to plot distributions as densities and how to use -# the mean value of a distribution to summarize it. More after our last recodes. - - -# Log transformation -# ------------------ - -# Let's turn to a different variable: GDP per capita. -qplot(data = qog, x = region, y = gdpc, color = region, geom = "boxplot") - -# For such a variable, a log-transformation helps. -qplot(data = qog, x = region, y = log(gdpc), color = region, geom = "boxplot") - -# What exactly are we doing here? We are basically compressing the distribution -# of the variable to get outliers in closer range to other observations. - -# Let's look at the distribution of GDP per capita more closely to get that. -summary(qog$gdpc) - -# The maximum value is pretty far away from the median value. -with(qog, max(gdpc) / median(gdpc)) - -# Order the data by GDP per capita. -qog <- qog[order(qog$gdpc, decreasing = TRUE), ] - -# Get the five leader countries. -head(qog)[1:4] - -# Now reorder the levels of the country code factor variable by GDP per capita. -qog$ccode <- reorder(qog$ccode, qog$gdpc, mean) - -# In fact, the top 5-10% observations are just on a completely different level. -qplot(data = qog, x = gdpc, stat = "ecdf", geom = "step") - -# This curve is the empirical cumulative distribution function (ECDF) of the -# variable: it shows how its values change throughout its quantiles. - -# Here's the ECDF for each region: see which one creates most variation. -qplot(data = qog, x = gdpc, stat = "ecdf", color = region, geom = "step") - -# Now see what happens to the ECDFs when you log the variable. -qplot(data = qog, x = log(gdpc), stat = "ecdf", color = region, geom = "step") - -# The overall distribution of log-GDP per capita is more linear. -qplot(data = qog, x = log(gdpc), stat = "ecdf", geom = "step") - -# Log-transform GDP per capita. -qog$gdpc <- log(qog$gdpc) - - -# Interval recodes -# ---------------- - -# Let's turn to yet another variable: government expenditure as % of GDP. -qplot(data = qog, x = region, y = gexp, color = region, geom = "boxplot") - -# The plots show great variability of government expenditure within regions. -qplot(data = qog, x = gexp, color = region, fill = region, geom = "density") + - facet_grid(region ~ .) - -# Let's try a 'Hans Rosling' genius plot: a stacked density plot by region. -qplot(data = qog, x = gexp, ..density.., geom = "density", - color = region, fill = region, alpha = I(.75), position = "stack") - -# However, this plot does not preserve the count of each region. This one does. -qplot(data = qog, x = gexp, ..count.., geom = "density", - color = region, fill = region, alpha = I(.75), position = "fill") - -# One option now is to recode this continuous variable to categories of low, -# medium and high government expenditure, for instance. How do we get there? - -# Let's recode some continuous variables into categorical ones by dividing them -# into two equally sized groups defined by their median (50th percentile) value. - -# Summarize the variable. -summary(qog$gexp) - -# The quartiles and median are the 25th, 50th and 75th percentiles. They can be -# used to "cut" the variable to intervals. First, compute the quartile values. -q <- quantile(qog$gexp) - -# Check the quartile values. -q - -# Plot regional distributions of government expenditures with sample quartiles. -qplot(data = qog, x = gexp, color = region, fill = region, geom = "density") + - geom_vline(x = q, linetype = "dashed") + facet_grid(region ~ .) - -# Now create a new variable by using these cutoff points as interval categories. -qog$gexp.4 <- cut(qog$gexp, breaks = q) - -# Check the interval categories. -table(qog$gexp.4) - -# Check them with boxplots. -qplot(data = qog, y = gexp, x = gexp.4, geom = "boxplot") - -# Stack them by region. -g <- qplot(data = qog, x = region, fill = gexp.4, geom = "bar") - -# Check result, which is unsatisfactory with default colors and missing values. -g - -# Plot with blue sequential gradient. -g + scale_fill_brewer() - -# Plot with red-blue diverging gradient. -g + scale_fill_brewer(palette = "RdBu") - -# Back to the data. We are going to produce interval recodes for all variables, -# using a very simple cutoff point for the values: above or below the median. - -# Define a function to cut the data at median and label segments "lo" and "hi". -hilo <- function(x) { - cut(x, breaks = quantile(x, probs = 0:2/2), labels = c("lo", "hi"), - ordered = TRUE, include.lowest = TRUE) -} - -# Apply to all continuous variables. -x <- lapply(qog[, 3:10], FUN = hilo) - -# Rename variables with original names, followed by ".2" to discriminate them. -names(x) <- paste0(names(x), ".2") - -# Convert to data frame and check result. -str(x) - -# Add these variables to the original QOG data. -qog <- cbind(qog, as.data.frame(x)) - -# Check final result. -str(qog) - -# Let's tell R to look in the QOG data for any variable we mention subsequently. -# The command saves us time and frees us from the need to call the qog object in -# in front of every variable call (e.g. qog$births). To cancel the attachment, -# we will type detach(qog) when were are done. - -attach(qog) - - -# DESCRIPTIVE STATISTICS -# ---------------------- - -# What's on the menu here? Take a variable like log-GDP per capita. This is the -# distribution, a.k.a the probability density function (PDF) of the variable. -qplot(gdpc, geom = "density") - -# It is easier to read it in natural units (constant USD), so exponentiate it. -qplot(exp(gdpc), geom = "density") - -# And now have a look again at which countries compose the distribution. -qplot(exp(gdpc), ..count.., geom = "density", - color = region, fill = region, alpha = I(.75), position = "stack") - -# For such a variable, the mean and the median can describe the distribution. -# Now turn again to some previously studied examples of categorical variables. - -# Frequencies of government expenditure in each region. -qplot(x = region, fill = gexp.4, geom = "bar") - -# Plot stacked (relative) frequencies of government expenditure in each region. -qplot(x = region, fill = gexp.4, geom = "bar", position = "fill") + - scale_y_continuous(labels = percent) + scale_fill_brewer() - -# Here, frequencies and relative frequencies (percentages) make sense. -# The next sections show to produce these summary statistics. - - -# Continuous (and ordinal) measures -# --------------------------------- - -# Standard summary statistics (the 'five-number summary' figures). -summary(qog[, 3:9]) - -# Obtain summary statistics for the fertility rate: - -# (1) Range -max(births) -min(births) - -# (2) Arithmetic mean -mean(births) -sum(births) / length(births) # formula - -# (3) Standard deviation -sd(births) -sqrt(sum((births - mean(births))^2) / length(births)) # formula: sqrt(var(x)) - -# (4) Median -median(births) -quantile(births, probs = .5) # 50th percentile - -# (5) Percentiles -quantile(births) # quartiles -quantile(births, probs = c(0.1, 0.9)) # specific cutoff points - - -# Plots for continuous variables -# ------------------------------ - -# Histograms. -hist(births, main="Histogram of Fertility in 81 countries") -hist(births, br = 20) - -# For normality assessment, you first want to visualize the normal distribution. -curve(dnorm, from = -3, to = 3, col = "red", lwd = 2) - -# Then you want to visualize the quantiles of your variable against normal ones. -# Say hello to the normal quantile-quantile ('QQ') plot. -qqnorm(births) - -# And finally you want to add a line that would correspond to perfect normality. -# Note that this command requires that you have just plotted a normal QQ-plot. -qqline(births, col = "red") - -# More default plots. -stripchart(births, method = "jitter", main = "Stripchart with jitter") -boxplot(births, main = "Boxplot") - -# For a continuous and a categorical variable, boxplots have an easy syntax. -boxplot(births ~ region) -boxplot(births ~ gini.2) - -# The next plots go further than the default R syntax. You need to learn ggplot2 -# to get the syntax. An awesome handbook is Winston Chang's R Graphics Cookbook. - -# Histograms, using ggplot2 syntax. -qplot(births) -qplot(births, binwidth = 1) -ggplot(qog, aes(x = births)) + - geom_histogram(binwidth = 1, fill = "white", color = "black") - -# Density plots, using ggplot2 syntax. -qplot(births, geom = "density") -qplot(births, stat = "density", geom = "line") - -# Combined histogram and density plots. -ggplot(qog, aes(x = births, y = ..density..)) + - geom_histogram(binwidth = 1, fill = "cornsilk", color = "grey50") + - geom_line(stat = "density", size = 2) - -# Normal QQ-plot. -qplot(sample = births) -qplot(sample = births, color = region) - -# Boxplots. -qplot(y = births, x = region, geom = "boxplot") -qplot(y = births, x = region, geom = "boxplot") + geom_point(alpha = .5) -qplot(y = births, x = region, geom = "boxplot") + aes(fill = gris.2) -qplot(y = births, x = region, geom = "boxplot") + aes(fill = gris.2) + coord_flip() - -# Violin plots. -qplot(y = births, x = region, geom = "violin") + geom_point() -qplot(y = births, x = region, geom = "violin") + geom_point(alpha = .5) - - -# Categorical (and ordinal) measures -# ---------------------------------- - -# Obtain frequencies (count and relative) for geographic regions: - -table(region) # frequencies by category -prop.table(table(region)) # in proportions -round(prop.table(table(region)), 1) # rounded to one decimal point -round(100 * (prop.table(table(region)))) # shown in percentages - -# R was clearly not designed by people who use percentages a lot. -# Let's write a function to do this in as less code as possible. - -percents <- function(r, c, which = 0, digits = 1) { - if(which == "rows") which = 1 - if(which == "cols") which = 2 - if(which == "cell" | which == 0) which = NULL - round(100 * prop.table(table(r, c), which), digits) -} - -# Cell percentages. -percents(region, births.2) - -# Check that the function works. -sum(percents(region, births.2, "cell")) # this is the default - -# Row percentages. -percents(region, births.2, "rows") # you can use 1 instead of "rows" - -# Column percentages. -percents(region, births.2, "cols") # you can use 2 instead of "cols" - - -# Plots for categorical variables -# ------------------------------- - -# Default bar/dot plots, for categorical variables. -barplot(table(region)) -barplot(table(region), horiz = TRUE) -dotchart(table(region), cex = 1) - -# For a pair of categorical variables, use mosaic plots. -mosaicplot(table(region, births.2), col = TRUE) -mosaicplot(table(gini.2, births.2), col = TRUE) - -# And finally, the infamous pie chart that makes R look like Excel 97. -pie(table(region)) - -# Using ggplot2 syntax, we can show how to transform bars into polar coordinates -# (which is, again, a perceptual nightmare for the reader). - -# Bar plots. -qplot(region, geom = "bar") -qplot(region, geom = "bar", fill = gexp.4) + coord_flip() - -# "This is a pie of my favourite bars" (Barney Stinson). -qplot(region, geom = "bar", fill = gexp.4) + coord_polar() + - scale_fill_brewer(palette = "RdBu") - -# Dot plots. -qplot(region, geom = "point") - - -# ASSOCIATION TESTS -# ----------------- - - -# Significance testing revolves around the idea that a statistic, whatever it -# might stand for, can be estimated to be significantly different from zero. -# One of the simplest forms of test, the t-test, is shown below. -t.test(births) - -# The bounds of the test form a confidence interval: the range in which the -# test estimates that the mean of the fertility rate is located. The width of -# the interval depends on both the sample size and the level of confidence. -c(t.test(births)$conf.int[1], mean(births), t.test(births)$conf.int[2]) - - -# Confidence intervals -# -------------------- - -# Here's the manual way to obtain a confidence interval, based on the assumption -# that the data follows some approximation of the normal distribution. We first -# compute the mean, standard deviation and N of fertility rates in each region. -ci <- ddply(qog, .(region), summarise, - mu = mean(births), - sd = sd(births), - n = length(births)) - -# We now turn to the standard error, which is an approximation of the standard -# deviation of the data in the true population (each region), assuming that it -# can be approximated from the sample standard deviation divided by sqrt(N). -ci$se <- ci$sd / sqrt(ci$n) - -# We finally decide for a 95% level of confidence that corresponds roughly to -# two standard errors around the mean. In a normal distribution, roughly 95% -# of all observations fall within this range around the mean. -ci$lo <- ci$mu - 1.96 * ci$se -ci$hi <- ci$mu + 1.96 * ci$se - -# Plot 95% CIs. -qplot(data = ci, x = region, y = mu, fill = region, stat = "identity", geom = "bar") + - geom_errorbar(aes(ymax = hi, ymin = lo), width = .25) - -# Plot 99% CIs. -qplot(data = ci, x = region, y = mu, fill = region, stat = "identity", geom = "bar") + - geom_errorbar(aes(ymax = mu + 2.58 * se, ymin = mu - 2.58 * se), width = .25) - -# Throw all this code into a convenience function. -getCI <- function(data, x, group, z = 1.96) { - require(plyr) - df <- with(data, data.frame(x, group, z, n = 1)) - df <- na.omit(df) - print(head(df)) - ci <- ddply(df, .(group), summarise, - mu = mean(x), - sd = sd(x), - n = sum(n), - se = sd(x) / sqrt(sum(n)), - lo = mean(x) - mean(z) * sd(x) / sqrt(sum(n)), - hi = mean(x) + mean(z) * sd(x) / sqrt(sum(n))) - return(ci) -} - -# Test by getting 95% CIs for fertility in the high and low GINI country groups. -ci <- getCI(qog, births, gini.2) - -# Plot the result. -qplot(data = ci, x = group, y = mu, fill = group, stat = "identity", geom = "bar") + - geom_errorbar(aes(ymax = hi, ymin = lo), width = .25) + - labs(y = "Mean fertility rate", x = "Level of GINI coefficient") - -# Test by getting 95% CIs for fertility in the high and low GINI country groups. -ci <- getCI(qog, births, gdpc.2) - -# Plot the result. -qplot(data = ci, x = group, y = mu, fill = group, stat = "identity", geom = "bar") + - geom_errorbar(aes(ymax = hi, ymin = lo), width = .25) + - labs(y = "Mean fertility rate", x = "Level of GDP per capita") - -# In truth, this is typically the case where we would be graphing scatterplots. -qplot(births, gdpc) + geom_smooth() - -# ... but that will have to wait one more week :) - - -# With two categorical variables -# ------------------------------ - -chisq.test(table(region, births.2)) # Chi-squared test of independence -fisher.test(table(region, births.2)) # if cell counts < 5, Fisher's test is recommended - -chisq.test(table(gini.2, births.2)) -chisq.test(table(gdpc.2, births.2)) -chisq.test(table(gexp.2, births.2)) # etc. - -# With one continuous variable and a binary variable -# -------------------------------------------------- - -t.test(births ~ gini.2) -t.test(births ~ gdpc.2) -t.test(births ~ gexp.2) # etc. - -detach(qog) - - -# That's all for now! Enjoy your day. -# 2013-02-18 diff --git a/code/8_fertility.R b/code/8_fertility.R old mode 100644 new mode 100755 diff --git a/code/8_imf.R b/code/8_imf.R old mode 100644 new mode 100755 diff --git a/code/8_vwreg.R b/code/8_vwreg.R old mode 100644 new mode 100755 diff --git a/code/9_twitter.R b/code/9_twitter.R old mode 100644 new mode 100755 diff --git a/code/README.md b/code/README.md old mode 100644 new mode 100755 diff --git a/data/README.md b/data/README.md old mode 100644 new mode 100755 diff --git a/data/assange.txt b/data/assange.txt old mode 100644 new mode 100755 diff --git a/data/bartels.presvote.4812.csv b/data/bartels.presvote.4812.csv old mode 100644 new mode 100755 diff --git a/data/beijing.aqi.2013.csv b/data/beijing.aqi.2013.csv old mode 100644 new mode 100755 diff --git a/data/browsers.0813.csv b/data/browsers.0813.csv old mode 100644 new mode 100755 diff --git a/data/bshor.congress.2012.csv b/data/bshor.congress.2012.csv old mode 100644 new mode 100755 diff --git a/data/dailykos.votes.0812.csv b/data/dailykos.votes.0812.csv old mode 100644 new mode 100755 diff --git a/data/doctorow.txt b/data/doctorow.txt old mode 100644 new mode 100755 diff --git a/data/dwnominate.zip b/data/dwnominate.zip old mode 100644 new mode 100755 diff --git a/data/fide.zip b/data/fide.zip old mode 100644 new mode 100755 diff --git a/data/ga.network.csv b/data/ga.network.csv old mode 100644 new mode 100755 diff --git a/data/geos.tww.csv b/data/geos.tww.csv old mode 100644 new mode 100755 diff --git a/data/grades.2012.csv b/data/grades.2012.csv old mode 100644 new mode 100755 diff --git a/data/htus8008.zip b/data/htus8008.zip old mode 100644 new mode 100755 diff --git a/data/icm.polls.8413.csv b/data/icm.polls.8413.csv old mode 100644 new mode 100755 diff --git a/data/imf.weo.2012.csv b/data/imf.weo.2012.csv old mode 100644 new mode 100755 diff --git a/data/nhis.2005.csv b/data/nhis.2005.csv old mode 100644 new mode 100755 diff --git a/data/oecd.bli.2011.tsv b/data/oecd.bli.2011.tsv old mode 100644 new mode 100755 diff --git a/data/olympics.2012.csv b/data/olympics.2012.csv old mode 100644 new mode 100755 diff --git a/data/piketty.saez.2011.zip b/data/piketty.saez.2011.zip old mode 100644 new mode 100755 diff --git a/data/qog.cs.zip b/data/qog.cs.zip old mode 100644 new mode 100755 diff --git a/data/schiller.8712.csv b/data/schiller.8712.csv old mode 100644 new mode 100755 diff --git a/data/twitter.an.zip b/data/twitter.an.zip old mode 100644 new mode 100755 diff --git a/data/us.recessions.4807.csv b/data/us.recessions.4807.csv old mode 100644 new mode 100755 diff --git a/data/wasserman.votes.0812.csv b/data/wasserman.votes.0812.csv old mode 100644 new mode 100755 diff --git a/data/wb.projects.zip b/data/wb.projects.zip old mode 100644 new mode 100755 diff --git a/data/wdi.govdebt.0511.csv b/data/wdi.govdebt.0511.csv old mode 100644 new mode 100755 diff --git a/images/book-chang.gif b/images/book-chang.gif old mode 100644 new mode 100755 diff --git a/images/book-kabacoff.jpg b/images/book-kabacoff.jpg old mode 100644 new mode 100755 diff --git a/images/book-pace.jpg b/images/book-pace.jpg old mode 100644 new mode 100755 diff --git a/images/book-teetor.gif b/images/book-teetor.gif old mode 100644 new mode 100755 diff --git a/images/book-urdan.jpg b/images/book-urdan.jpg old mode 100644 new mode 100755 diff --git a/images/book-yau.png b/images/book-yau.png deleted file mode 100644 index 3d7784e..0000000 Binary files a/images/book-yau.png and /dev/null differ diff --git a/images/data-science-conway.png b/images/data-science-conway.png old mode 100644 new mode 100755 diff --git a/images/isarithmic-map-sparks.png b/images/isarithmic-map-sparks.png old mode 100644 new mode 100755 diff --git a/images/rstudio-autocompletion.png b/images/rstudio-autocompletion.png old mode 100644 new mode 100755 diff --git a/images/rstudio-interface-1.png b/images/rstudio-interface-1.png old mode 100644 new mode 100755 diff --git a/images/rstudio-interface-2.png b/images/rstudio-interface-2.png old mode 100644 new mode 100755 diff --git a/images/rstudio-new-script.png b/images/rstudio-new-script.png old mode 100644 new mode 100755 diff --git a/images/rstudio-save-plot.png b/images/rstudio-save-plot.png old mode 100644 new mode 100755 diff --git a/images/rstudio-setwd-pref.png b/images/rstudio-setwd-pref.png old mode 100644 new mode 100755 diff --git a/images/rstudio-setwd.png b/images/rstudio-setwd.png old mode 100644 new mode 100755 diff --git a/images/split-apply-combine-shalizi.png b/images/split-apply-combine-shalizi.png old mode 100644 new mode 100755 diff --git a/images/split-apply-combine-wickham.png b/images/split-apply-combine-wickham.png old mode 100644 new mode 100755 diff --git a/index.Rmd b/index.Rmd old mode 100644 new mode 100755 diff --git a/style.css b/style.css old mode 100644 new mode 100755 diff --git a/syllabus.pdf b/syllabus.pdf old mode 100644 new mode 100755