# Intro to R workshop
# Social Sciences Data and Software
# May 8th, 2014
# Gustavo Robles and Charlie Gomez
##########################################
### 0. PRELIMINARIES ####################
##########################################
# COMMENTS. To insert comments use "#"
# SHORTCUTS. To run a command directly from the script, place the cursor at the end of the command line and type:
# MAC users: command + enter
# Windows users: Control + R
##########################################
### 1. SETTING THE WORKING DIRECTORY #####
##########################################
# Let's start with an empty workspace
rm(list = ls()) ### Remove all objects in the working environment
### Similar to "clear all" in Stata
# The working directory is the folder on your computer where R reads and writes files by default
# To set the working directory use setwd("Path")
# PLEASE CHANGE YOUR WORKING DIRECTORY NOW
setwd("/Users/.../workshops/Intro to R May 2014")
# The working directory can be set in your computer, in your AFS space at Stanford, or in some other location that you can access through your computer.
# To display the current working directory use getwd()
getwd()
##################################################
### 2. Creating and storing objects in R ########
##################################################
# The command line prompt (>) is an invitation to type commands or expressions
# After you write a command, press Enter to execute it
# R can work as a calculator, for example,
2+3
2*3
2^3
2/3
# In R you can create different objects and give them a name.
# To create/declare an object, use "<-"
# <- means "the values on the right are assigned to the name on the left".
# 2.1. The simplest objects are scalars, for example
A <- 2
# This object is a global variable in the workspace.
# To call it type
A
# A famous scalar
pi
# 2.2. String variables
B <- "Workshop"
B
# 2.3. Vectors of numbers or strings
# The function "c()" is one of the most important functions in R
### It concatenates a series of elements
C <- c(100,200,300,400,550)
C
# Note that "c()" is a function and "C" is a vector of the workspace
# (R is case sensitive: typing a function name without () prints the function itself)
c
C
# To create a series of numbers, use ":"
1:10
# To create other sequences, use the function seq()
# The notation for this function is seq(from,to,by)
seq(1,10)
seq(1,10,2)
# 2.4. Matrices
# Use the function "matrix()"
# Most of the functions have a help menu
# If you need help with any function, type "?function"
# The help menu describes the function and its arguments and output
?matrix
D <- matrix(c(10,20,30,40), nrow=2,ncol=2)
D
# 2.5. Tables and databases
# The function data.frame() creates tables.
# You can specify the names and values of your variables
# (one argument per column; all columns must have the same number of rows)
E <- data.frame(age = c(20,24,26,23,29),
sex = c("Female","Male","Female","Male","Female"),
treatment = c(1,0,0,1,1),
income = c(1000,1500,2000,2500,3000))
E
# 2.6. Econometric/statistical models
# You can also estimate a statistical model and save it as an object.
# For example, the function lm() fits a linear model
# The simplest notation is lm(formula, data)
?lm
G <- lm(income ~ treatment + age + sex, data=E)
# Linear model G is now an object of the workspace.
G
# 2.7. Lists
# Finally, you can create "lists" with a collection of "objects"
# Here we make a list with all the objects we've created so far
# We use the function list()
global.list <- list(A,B,C,D,E,G)
global.list
# To know the type or "class" of an object, you can use the function class()
class(A)
class(B)
class(C)
class(D)
class(E)
class(G)
class(global.list)
# Note that in your R-script, some classes have a different color
# (the exact colors depend on your editor and its theme)
# Numbers : Green
# Strings : Red
# Functions : Blue
# Object names : Black
# To know which objects you've created so far use the function ls()
# The function ls() "lists" the objects that are stored in the workspace
ls()
# To remove an object from the workspace, use remove() or rm()
remove(A)
ls()
# To remove all objects in the workspace, type
# rm(list = ls())
# or choose "Clear Workspace" in the drop down menu
# rm() removes objects from the workspace or from a local environment
################################################
### 3. Functions in R ########################
################################################
########################
# 3.1. Basic functions #
########################
# There are different functions already loaded in R
# All functions are followed by (), which must contain the function's arguments
# Let's work with our vector C
C
# Mean
mean(C)
### To call the help file of a function type ?function
?mean
# Other functions for vectors
# sd(), max(), min(), sum() and length() return a single number;
# log(), exp() and sqrt() are applied to every element and return a vector
sd(C) # standard deviation
max(C) # maximum
min(C) # minimum
log(C) # natural logarithm
exp(C) # exponential
sum(C) # sum
length(C) # number of elements
sqrt(C) # square root
################################
# 3.2. Matrix operators ########
################################
# Let's work with our vector C and matrix D
# Note that vectors in R have no dimensions; t() treats them as column vectors
C
D
t(C) # Transpose of a vector or a matrix t(); here the result is a 1 x 5 matrix
C + C # Sum
C - C # Difference
# Note the following difference
D
D*D # Element wise multiplication
D^2 # Element wise exponentiation
D%*%D # Matrix multiplication
##################################################
# 3.3. Functions to get familiar with datasets ##
##################################################
# Let's call our data frame E
E
dim(E) # Dimensions of the data frame (rows, columns)
head(E,3) # Shows first 3 rows
tail(E,3) # Shows last 3 rows
str(E) # Displays the structure of an R object
summary(E) # Displays summary statistics
# You can use the following commands to browse your data
View(E) # Opens a spreadsheet-style data viewer for browsing
fix(E) # Opens the database for editing and saves the changes back into E
edit(E) # Opens the database for editing and returns the edited copy (E itself is not changed)
# We don't recommend editing your data using these commands
##################################
# 3.4. User-defined functions ###
##################################
# You can create your own function in R
# The basic syntax of a function is
# function.name <- function(arguments){
# operations using arguments
# return(outputs)
# }
# Note the use of local variables to define the function.
# Local variables are defined within the function and are not part of the workspace
# Example, sd() is a function in R that estimates the standard deviation
# of a vector by dividing the sum of squared deviations with respect to the mean by n-1.
# Here we will create an alternative function "sd.n" that divides squared deviations by n.
# sd.n(): standard deviation that divides the squared deviations by n
# (sd() divides by n - 1).
#   input.vector : numeric vector
#   na.rm        : if TRUE, missing values are dropped before computing
#                  (default FALSE, same as sd())
#   returns      : a single number; NA if input.vector contains missing
#                  values and na.rm = FALSE
sd.n <- function(input.vector, na.rm = FALSE){ # input.vector and na.rm are local vars
  if (na.rm) {
    input.vector <- input.vector[!is.na(input.vector)] # drop missing values
  }
  mean.vector <- mean(input.vector) # mean.vector is a local var
  squared.dev <- (input.vector - mean.vector)^2 # squared.dev is a local var
  variance <- sum(squared.dev)/length(input.vector) # variance is a local var
  std.dev.n <- sqrt(variance) # std.dev.n is a local var
  return(std.dev.n) # Report std.dev.n
}
# Compare both functions on our vector C
C
sd(C) # divides by n-1
sd.n(C) # divides by n
################################################
### 5. Functions created by other users ########
################################################
# There is a vast online library of functions in R created by
# other users. The programs come in "packages" that also include
# databases and functions to estimate advanced statistical models,
# import data, make graphs and tables, estimate test statistics, etc.
# These functions, models, and databases come in packages that
# you need to install and load in R.
# For example, the package "foreign" includes a function that
# allows the user to read STATA databases.
# To install the package, type install.packages("name")
install.packages("foreign")
# You will have to select a server or CRAN mirror from which the package will be downloaded to your computer.
# Choose the server USA (CA 2) for faster downloads.
# The installation on your computer has to be done only ONCE.
# Nevertheless, you have to "load" or call the package in EVERY R session in which you are going to use it.
# To load a package that has been previously installed, type
# library()
library("foreign")
# To see a summary of a package type library(help="foreign")
library(help="foreign")
# To see the full documentation, type help(package="package")
help(package="foreign")
# For this session we need to install and load the following packages
# PLEASE INSTALL THE FOLLOWING PACKAGES NOW
install.packages("plyr")
install.packages("lattice")
install.packages("Zelig")
install.packages("xlsx")
# Use search() to see which packages are currently loaded
# Compare the output before and after loading "plyr"
search()
library("plyr")
search()
#################################
### 6. Reading Data in R ########
#################################
# Different ways to read data
################################
# 6.1. Databases in packages ###
################################
# In addition to functions, some packages in R include databases
# You can see the available databases in a package by typing data(package="package")
# The function data() loads data sets, or lists available ones
# For example, the package "Zelig" has the following datasets
library("Zelig")
data(package="Zelig")
# This session we will be working with the "turnout" dataset in the "Zelig" package.
# We first need to load the dataset into the workspace by using the function data()
data("turnout")
# Remember, use ls() to see the objects available in your workspace.
ls()
# Let's browse the dataset
# View() only displays the data; fix() would open an editor and save
# whatever is in it back into "turnout", so we do not use it for browsing
View(turnout)
str(turnout)
# Or we can just look at the first 5 rows
head(turnout,5)
# turnout contains individual-level turnout data.
# It pools several American National Election Surveys conducted during the 1992 presidential election year.
# Only the first 2,000 observations are included in the sample data
# The codebook is available in the help menu of the dataset
?turnout
#################################
# 6.2. Delimited data files #####
#################################
# Functions read.csv(), read.table(), read.delim() read delimited data files, .csv, and .txt files
# Relative file paths are read from the working directory. Remember, our working directory is
getwd()
# Reading and naming a delimited data file
# data2 <- read.csv(
# file = "path/file.extension",# Path and name of the file
# header=TRUE, # Name of variables in the first row? (TRUE,FALSE)
# sep = "\t", # Column delimiter (comma, tab, semi colon, space, etc.)
# dec = ".", # Character for decimals
# na.strings = "NA") # Character or string for missing values
# head(data2) # data2 is already in the workspace; data() is only needed for datasets that come in packages
# Here is a summary of other functions you can use for reading data
# read.xlsx() reads Excel files, requires package "xlsx"
# read.dta() reads Stata databases, requires package "foreign"
# read.xport() reads a file in SAS Transport (XPORT) format, requires "foreign"
# read.ssd() runs a SAS script that saves a SAS permanent dataset (.ssd or .sas7bdat),requires "foreign"
#################################
### 6.3. Online databases #######
#################################
# The function read.table() can be used to read online databases
# (give the URL instead of a file path; this requires an internet connection)
Prestige <- read.table(
"http://socserv.socsci.mcmaster.ca/jfox/books/Companion/data/Prestige.txt",
header=TRUE)
ls()
Prestige
# Prestige is a database that contains data on Canadian occupations
# The database is also available in the package "car"
#######################################
### 7. Transforming Data in R ########
#######################################
# The main distinction of R versus Stata is that in R you can use different objects and datasets at the same time.
# The most important skill in R is knowing how to select an object or extract elements of an object.
# In R, there are important differences between (),[],[[]],and {}
# () are used for functions. They are preceded by the name of the function and contain the arguments of the function.
C
mean(C)
# [] are used to indicate the position of an element in an object.
# They are preceded by the name of the object and contain the position number of an element in a list or matrix.
C
C[2] # Second element of vector C
# [[]] are used to indicate the position of an object in a list.
# They are preceded by the name of the list and contain the relative position of the object in the list.
global.list
global.list[[3]] # Third object of our list (the vector C).
global.list[[3]][2] # Second element of the third object of our list.
# {} are used to program loops and functions.
# function.name <- function(arguments){
# operations using arguments
# return(outputs)
# }
# See the code for our function sd.n()
###############################
### 7.1. Subsampling data ####
###############################
# This is one of the most important skills to learn in R #
# A dataset is a two dimensional object in R c(rows, columns).
# To select elements in a matrix, table, or dataframe you should provide the NUMBERS of the rows and columns you want to select.
# Use data[rows,columns] to indicate these numbers, leave a blank to select all.
# Let's go back to our turnout dataset
head(turnout)
# Let's select some rows
turnout[3 , ] # Row 3, all columns
turnout[c(3:5) , ] # Rows 3 to 5, all columns
turnout[ c(1,3,5), ] # Rows 1, 3, and 5, all columns
# Columns or variables in datasets can be selected in three different ways
# Suppose we want to select the vector of ages
head(turnout)
# These three notations are equivalent
turnout[ , 2] # All rows, column 2
turnout[ , "age"] # All rows, column "age"
turnout$age # Variable age in dataset turnout
# The sign "$" indicates that vector "age" is an element of the object/dataset "turnout"
# To select two or more variables, give a vector of column names
turnout[ , c("age","educate")]
#################################################
### 7.2. Subsampling data using conditionals ###
#################################################
# To select samples of the data that satisfy one or more conditions, use the following conditionals
# < less than
# <= less than or equal to
# > greater than
# >= greater than or equal to
# == equal to
# != different from
# You can use more than one conditional to select samples of your data by using
# & and
# | or
head(turnout)
# Let's select the observations for white people using conditionals
# Two different notations
turnout$race=="white" # The output is a vector with values TRUE or FALSE
which(turnout$race=="white") # The output is a vector with the rows that satisfy this condition
# Note that the first vector reports one value for each observation of the data.
# The second one only reports the success cases.
# Now we subsample our data
# The two notations are equivalent
turnout[ turnout$race=="white" , ] # Select cases (rows) where condition == TRUE
turnout[which(turnout$race=="white"), ] # Select cases (rows) that satisfy the condition
head(turnout)
# You can specify more than one condition
turnout$race=="white" & turnout$age < 30
which(turnout$race=="white" & turnout$age < 30 )
# And take a subsample of the data
turnout[turnout$race=="white" & turnout$age < 30, ]
###########################
### 7.3. Dropping data ###
###########################
# Use the minus sign to drop cases or variables
# These commands only print the result; E itself is not changed unless you assign the result with "<-"
# Let's go back for a minute to data frame E
E
E[-3,] # Drops row 3
E[-c(3:5),] # Drops rows 3 to 5
E[-c(1,3,5),] # Drops rows 1, 3, and 5
# To drop variables you can use the following notations
# Note that colnames(data) gives the column names of the data frame
E
colnames(E)
E[ , -3] # Drops variable "treatment" (by position)
E[, c(colnames(E)!="treatment")] # Drops variable "treatment" (keeps columns where the condition is TRUE)
E[,-c(which(colnames(E)=="treatment"))] # Drops variable "treatment" (finds its position by name)
###########################################
### 7.4. Transforming data ##############
###########################################
# Reload the turnout dataset
data(turnout)
# To create a new variable, refer to the database followed by $ and the name of the new variable
head(turnout)
# For example, to create a variable of squared years of education
turnout$educate_sqr <- turnout$educate^2
head(turnout)
# Suppose we want to create a dummy variable "d_30_plus" that takes the value of 1 if age >= 30 and 0 otherwise.
# One way to create this variable is as follows
head(turnout)
turnout$d_30_plus[turnout$age >= 30] <- 1 # Generate d_30_plus = 1 if age >= 30
head(turnout)
turnout$d_30_plus[turnout$age < 30] <- 0 # Replace d_30_plus = 0 if age < 30
head(turnout)
# Using this notation, you can create new variables or replace existing ones.
# Another example.
# Suppose that instead of a numeric variable for "d_30_plus", we want a string variable "Above 30"/"Under 30"
# Equivalent notation
head(turnout)
turnout[ turnout$d_30_plus==1 , "d_30_plus" ] <- "Above 30" # Replace d_30_plus = "Above 30" if d_30_plus == 1
head(turnout)
turnout[ turnout$d_30_plus==0 , "d_30_plus" ] <- "Under 30" # Replace d_30_plus = "Under 30" if d_30_plus == 0
head(turnout)
# An alternative way to do it is by using the ifelse() function
# ifelse(condition, value if TRUE, value if FALSE)
turnout$d_30_plus <- ifelse(turnout$age>=30, "Above 30","Under 30")
head(turnout)
# For multiple statements, nest one ifelse() inside another
turnout$d_30_plus <- ifelse(turnout$age>=30, "Above 30",
ifelse(turnout$age < 30,"Under 30","error"))
head(turnout)
####################################
### 8. Analyzing Data in R ########
####################################
####################################
### 8.1. Summary statistics ########
####################################
head(turnout)
# The function summary() provides summary statistics of variables
# Summary of a single variable
summary(turnout$income)
# Please load the "plyr" package to the workspace.
# ddply() is a useful function in the "plyr" package to estimate summary statistics
library("plyr")
# Suppose we want to estimate the mean of income by race.
head(turnout)
# The following commands are equivalent but the output format is different.
tapply(turnout$income, turnout$race, mean) # tapply(data, group index, function)
by(turnout$income, turnout$race, mean) # by(data, group index, function)
aggregate(income~race, turnout, mean) # aggregate(formula, data, function)
ddply(turnout,"race", function(x) c(mean=mean(x$income))) # ddply(data, group index, functions)
# When a function does not let you specify a dataset, it is useful to use the command with() to shorten notation
with(turnout, tapply(income, race, mean))
with(turnout, by(income, race, mean))
head(turnout)
# Suppose now that we want to estimate the mean of income by race and by our variable for age
with(turnout, tapply(income, list(race, d_30_plus),mean)) # tapply(data, group index, function)
with(turnout, by(income, list(race, d_30_plus),mean)) # by(data, group index, function)
aggregate(income~race+d_30_plus, turnout, mean) # aggregate(formula, data, function)
ddply(turnout,c("race","d_30_plus"), function(x) c(mean=mean(x$income))) # ddply(data, group index, functions)
# Suppose that we want to estimate the mean of income and schooling by race and age
# Only aggregate() and ddply() will estimate both statistics simultaneously
aggregate(cbind(income,educate)~race+d_30_plus, turnout, mean)
ddply(turnout,c("race","d_30_plus"), function(x) c(mean_income=mean(x$income),mean_schooling=mean(x$educate)))
# Finally, suppose that we want to estimate the mean of income and the median of schooling by race and age
# Only ddply can estimate this simultaneously
ddply(turnout,c("race","d_30_plus"), function(x) c(mean_income=mean(x$income),median_schooling=median(x$educate)))
####################################
### 8.2. Statistical Models ########
####################################
# R packages include functions that estimate a wide variety of statistical models
# The simplest one is the ordinary least squares model.
# Let's estimate the following linear model that we will name "model_1"
# lm() is a function that estimates an OLS model.
model_1 <- lm(income ~ educate + race + age, data=turnout) # lm(formula, data)
# Now model_1 is an object of the workspace
model_1
# To see the different outputs stored in the model use ls()
ls(model_1)
# To extract the coefficients type
model_1$coefficients
coefficients(model_1)
# To extract the predicted values type
model_1$fitted.values
fitted(model_1)
# You can also extract results using summary()
summary(model_1)
# summary() also produces different outputs
ls(summary(model_1))
# To extract the coefficients table of the summary type
summary(model_1)$coefficients
# Any output can be stored in a table.
################################################
### 9. Exporting Tables in R ##################
################################################
# First let's create the table of interest and name it Table_1
Table_1 <- ddply(turnout,c("race","d_30_plus"), function(x) c(mean_income=mean(x$income),mean_schooling=mean(x$educate)))
# Table_1 is in the workspace
Table_1
# To save tables in R, you can use write.table()
# The file is written to the working directory unless a full path is given
?write.table
write.table(
Table_1, # Object to be saved as a table
file = "Table1.csv", # Path and name of the file
append = FALSE, # FALSE replaces the file, TRUE appends to it (default is FALSE)
sep = ",", # Field separator; the default of write.table() is a space " ", other options are "\t", ";"
na = "", # The string to use for missing values
dec = ".", # Character for decimal points
col.names = TRUE, # Include the column names (default)
row.names = FALSE) # Do not include the row names (default is TRUE)
# Here is a summary of other functions you can use for exporting tables
# write.csv() creates comma separated values files
# write.dta() creates Stata databases, requires package "foreign"
# write.xlsx() creates Excel files, requires package "xlsx"
# xtable() and print.xtable() creates Latex documents, requires package "xtable"
# texreg() creates regression tables in Latex, Word, and html, requires package "texreg"
# stargazer() creates regression tables in Latex, Word, and html, requires package "stargazer"
################################################
### 10. Graphs in R ###########################
################################################
# NOTE(review): the functions used in this section (plot, abline, points, text, jpeg) are not lattice functions;
# "lattice" does not appear to be needed for the code shown here
library("lattice")
# plot() is one of the most commonly used commands to make graphs in R
?plot
plot(
income~educate, ### Formula to plot, you can also provide coordinates
data=turnout, ### Database
type = "p", ### Type: "p" = point; "l" = lines
col = "blue", ### Color
pch = 20, ### Symbol
main = "Plot income vs education", ### title
sub = "Turnout database", ### Subtitle
xlab = "Schooling", ### Label for X-axis
ylab = "Income" ### Label for Y-axis
)
# Once you open a graph window using plot(),
# you can add multiple points or lines to that graph
# R will keep adding elements to the graph until a new plot is called or the graph window is closed
abline(lm(income~educate,data=turnout), lwd=3,col="red") # Adds a regression line to the plot
abline(v=10.5, lwd=3, lty=2) # Adds a dashed vertical line to the plot
points(x=c(5,15), y=c(5,10), pch=20, col="orange") # Adds points to the plot
text(x=c(5,15), y=c(5,10),c("person 1", "person 2"), pos=3) # Adds text above those points
# Type ?par for a description of the available graphical parameters
?par
# To close a graph window, use dev.off()
dev.off()
# Exporting graphs
### Saving graphs as jpeg
jpeg("graph1.jpeg") # Opens an empty .jpeg file in the working directory
plot(income~educate, data=turnout, type = "p", col = "blue", pch = 20, main = "Scatter plot income vs education")
abline(lm(income~educate,data=turnout), lwd=3,col="red")
dev.off() # Closes the .jpeg file
# You can also save your plot as .pdf using pdf() instead of jpeg()