R

R-chi-squared-2.png

R and Python (ok, and Stata) are the most used scripting laguages for statistical computating and data analyses. Although R and Python are more popular among staticians, data scientists, and political scientists, Stata is quite popular among economists. You find below some code snippets in R that I found useful at some point.

Graphics

Cartesian Coordinate-like plot for mathematical functions

I think the plots of mathematical functions are ugly in R. There is no standard package (to the best of my knowledge) to create those traditional graphics of mathematical functions using Cartesian Coordinate system with the origem (0,0) or (0,0,0) in the right place. I created myself two functions that extends the base plot functionalities and generate such plots. Below you see some examples and the source file for that generate the plots in the style I am refering to. Use it as you wish.

The source files are

A reproductible example is below.

source('/plotCC.R')
source('/curveCC.R')

## =====================================================
## Standard R plot
## =====================================================
f <- quote(x^2-3)
g <- quote(x^3)

x <- seq(-3,3,length=50)
plot(x,eval(f), type='l', col='blue')
points(x,eval(g), type='l', col='red')
legend <- c(as.expression(f), as.expression(g) )
legend('bottomright', lty=1, col=c('blue','red'), legend=legend, bty='n')
title('ugly Standard better for functions')

par(las=1,cex.axis=.7,bty='l', pch=20, cex.main=.9, mar=c(4,5,3,2))
plot(x,eval(f), type='l', col='blue')
points(x,eval(g), type='l', col='red', add=T)
legend <- c(as.expression(f), as.expression(g) )
legend('bottomright', lty=1, col=c('blue','red'), legend=legend, bty='n')
title('Standard plot a little better')


## =====================================================
## Plot CC
## =====================================================
f <- quote(x^2-3)
g <- quote(x^3)

x <- seq(-3,3,length=50)
plotCC(x,eval(f), col='blue')
plotCC(x,eval(g), col='red', add=T)
legend <- c(as.expression(f), as.expression(g) )
legend('bottomright', lty=1, col=c('blue','red'), legend=legend, bty='n')
title('Using plotCC or curve CC')
dev.off()

## another method
f <- function(x) return(x^2-3)
g <- function(x) return(x^3)
x <- seq(-3,3,length=50)
plotCC(x,f(x), col='blue')
plotCC(x,g(x), col='red', add=T)
legend <- c(as.expression(bquote(f*(x)==x^2-3)), as.expression(bquote(f(x)==x^3)) )
legend('bottomright', lty=1, col=c('blue','red'), legend=legend, bty='n')
title('y=f(x)')

## =====================================================
## curve CC
## =====================================================
## not necessary to create x
f <- function(x) return(x^2-3)
g <- function(x) return(x^3)
curveCC(f,xmin=-3,xmax=3, ymax=6,  col='blue')
curveCC(g,xmin=-3,xmax=3, ymax=6,  col='red', add=T)
legend('bottomright', lty=1, col=c('blue','red'), legend=c('f','g'), bty='n')

plotCC.png

Probability distribution (Plots)

Exponential Distribution

# Exponential
exppdf		<- function(x) lambda*exp(-lambda*x)

par(las=1, cex.axis=.7,bty='l', cex.main=.9, mar=c(4,5,3,2), col='lightblue')
# parameter
lambda		<-1
plot(exppdf,0,10, main=bquote(f(X==x)==.(lambda)*e^(-.(lambda)*x)), ylab=expression(f(x)==lambda*e^(-lambda*x)))

percentis	<- qexp(seq(.2,1,.2), rate=lambda)
segments(x0= percentis,x1= percentis,y0=0,y1=dexp(percentis, rate=lambda),lt=2)
text(x=percentis,y=dexp(percentis, rate=lambda), label=paste(seq(20,100,20),'%'),pos=4, cex=.7)

expDist.png

X^2 Distribution

# chi-squared distribution
distribution <- function(x) dchisq(x,df=df)
# Parameter
df<-1

par(las=1, cex.axis=.7,bty='l', cex.main=.9, mar=c(4,5,3,2))
plot(distribution,0,20, bty='l',  las=1,cex.axis=.7,ylab='Density', col='lightblue', main=expression(chi^2~'-distribution; '~f(x)==frac(1,2^(frac(2,k))*Tau*(frac(2,k)) )*x^(frac(2,k)~-1)*e^(-~frac(x,2))),ylim=c(0,.3))

# other values for the parameter
for (i in 2:7){
df=i;par(new=T)
plot(distribution,0,20, bty='l', col=i, lty=i,add=T) }
legend('topright', legend=paste('df=',1:7), col=1:7, lty=1:7, cex=.7)

chiDist.png

Beta Distribution

The beta distribution is given by

X\sim Beta(\alpha,\beta)

f_X(x) = \begin{cases}\displaystyle \frac{1}{B(\alpha,\beta)}x^{\alpha-1}(1-x)^{\beta-1} & , x\in (0,1)\\ 0 & , \text{otherwise} \end{cases}

Where

\displaystyle B(\alpha,\beta) = \frac{\Gamma(\alpha)\Gamma(\beta)}{\Gamma(\alpha+\beta)}

and

\displaystyle\Gamma(\alpha) = \int_0^\infty x^{\alpha-1}e^{-x}dx

require(gtools)

# Beta
betaDist		<- function(x) dbeta(x, alpha,beta)

# parameter
values     <- c(.3,1,1.5,3)
parameters <- permutations(values, n=length(values), r=2, repeats.allowed=T)

par(las=1,cex.axis=.7,bty='l', pch=20, cex.main=.9, mar=c(4,5,3,2))
alpha      <-parameters[1,1]
beta       <-parameters[1,2]

par(las=1,cex.axis=.7,bty='l', pch=20, cex.main=.9,mfrow=c(nrow(parameters)/4,nrow(parameters)/4),mar=c(1,1,1,1))
for( i in 1:nrow(parameters)){
    alpha      <-parameters[i,1]
    beta       <-parameters[i,2]
    plot(betaDist,0,1, col='black')
    legend('topright', legend=bquote(atop(alpha == .(alpha),beta == .(beta))),bty='n', cex=1.4)
}

betaDistr.png

Density with shaded area (example with Gaussian Curve)

mu = 0
sd = 1
x0 = -Inf
x1 = 1

par(las=1, cex.axis=.7,bty='l', cex.main=.9, mar=c(4,5,3,2))
plot(dnorm,mu-4*sd,mu+4*sd, bty='l',las=1, cex.axis=.8, ylab=expression(f(x)), col='darkgrey')
shaded <- seq(mu-4*sd,x1,.2)
polygon(x=c(x1, shaded),y=c(0,dnorm(shaded)), col='lightgrey', lty=0)
segments(x0=x1,y0=0,y1=dnorm(x1), lty=2)

normDistShaded_ex1.png

mean=100; sd=15
lb=80; ub=105

x	<- seq(-4,4,length=100)*sd + mean
hx	<- dnorm(x,mean,sd)

par(las=1, cex.axis=.7,bty='l', cex.main=.9, mar=c(4,5,3,2))
plot(x, hx, type="l", xlab="X Values", ylab="Density",main="Normal Distribution", axes=T, las=1, cex.axis=.7)
i	= lb & x <= ub
lines(x, hx)
polygon(c(lb,x[i],ub), c(0,hx[i],0), col="grey") 

area	<- pnorm(ub, mean, sd) - pnorm(lb, mean, sd)
result	<- paste("P(",lb,"< IQ <",ub,") =",signif(area, digits=3))

normDistShaded_ex2.png

Bivariate Normal and its contour plot

The bivariate normal distribution is given by:

\displaystyle f_{x_1,x_2}(x_1,x_2)=\frac{1}{2\pi\sigma_1\sigma_2\sqrt{1-\rho^2}}e^{-z/(2(1-\rho^2)}

where

\displaystyle z=\frac{(x_1-\mu_1)^2}{\sigma_1^2}-\frac{2\rho(x_1-\mu_1)(x_2-\mu_2)}{\sigma_1\sigma_2}+\frac{(x_2-\mu_2)^2}{\sigma_2^2}

and

\displaystyle \rho=\text{corr}(X_1,X_2)=\frac{\text{Cov}(X_1,X_2)}{\sigma_1\sigma_2}

require(plot3D)
require(mvtnorm)

mu1	<- 1	# expected value of x
sig1	<- 0.5	# variance of x
mu2	<- 2	# expected value of y
sig2	<- 2	# variance of y
rho	<- .3	# corr(x, y)

x <- seq(-2,5,length=100)
y <- seq(-2,5,length=100)

jointpdf <- function(x,y) dmvnorm(cbind(x,y), mean=c(mu1,mu2),sigma=matrix(c(sig1, rho, rho,sig2), byrow=T,nrow=2)) 
fxy <- outer(x, y, jointpdf)

par(mfrow=c(2,2), mar=c(2,2,2,2))
persp3D(x, y,fxy, theta=10, phi=30, expand=0.8 ,shade = 0.03, colkey = FALSE)
contour2D(fxy, x=x,y=y,NAcol = "black", colkey = FALSE)
image2D(fxy, x=x,y=y,NAcol = "black", colkey = FALSE)
image2D(fxy, x=x,y=y,NAcol = "black",contour = TRUE, lphi = 0.7)

Bivariate Normal and its contour plot.png

Histogram with adjusted curve

x <- rbinom(n=5000,size=50,prob=.5)
hist(x, prob=T, col='lightgrey', las=1, cex.axis=.8)
lines(density(x), col='blue', lty=2) # adjust=1
lines(density(x,adjust=2), col='green', lty=2)
lines(density(x,adjust=3), col='red', lty=1)
legend('topright',legend=c('Adjust 1','Adjust 2','Adjust 3'), lty=c(2,2,1),col=c('blue', 'green','red'))

Histogram with adjusted curve1.png

x <- rbinom(n=5000,size=50,prob=.5)
hist(x, prob=T, col='lightgrey', las=1, cex.axis=.8)
curve(dnorm(x, mean=.5*50,sd=sqrt(.5*.5*50)), add=T, col='red',lty=2)

Histogram with adjusted curve2.png

Using ggplot2 (and pipe with magrittr)

library(magrittr)
library(ggplot2)
library(tibble)

x % tibble::as_data_frame(.) 

x %>%
    ggplot2::ggplot(.) +
    ggplot2::geom_histogram(ggplot2::aes(x=x, y=..density..),fill="lightblue", colour='white')+
    ggplot2::geom_density(ggplot2::aes(x=x), fill="#00000044", adjust=1, alpha=.2, colour='white') +
    ggplot2::theme_bw() +
    ggplot2::scale_x_continuous(expand = c(0, 0)) +
    ggplot2::scale_y_continuous(expand = c(0, 0)) 

histogram.png

Force Multiple plots

Some function do not allow plot more than one plot using mfrow. One solution is the following:

p1 <- plot(1,1, main='figure 1')
p2 <- plot(1,1, main='figure 2')
p3 <- plot(1,1, main='figure 3')
p4 <- plot(1,1, main='figure 4')


print(p1, position = c(0, 0, 0.5, .5), more = TRUE)
print(p2, position = c(0.5, 0, 1, .50), more = TRUE)
print(p3, position = c(0.5, .5, 1, 1), more = TRUE)
print(p4, position = c(0, .5, .5, 1))

Math annotation

Annotations using math symbol and values of variables

Problem: plotting a annotation with greek characters and a value of a variables

Solution: bquote

par(las=1,cex.axis=.7,bty='l', pch=20, cex.main=.9, mar=c(4,5,3,2)) 

lambda=3
xPois <- rpois(100000, lambda=3)
hist(xPois, main=
         bquote('Poisson Distribution: '~~
                f(lambda)~'='~frac(e^-lambda*lambda^x,x*'!')~'; ' # function
                ~lambda==.(lambda)),                              # including var
     xlab='x',xlim=c(0,17),col='gray')

AnnotPoisson.png

Expressions for some probability distributions to plot

# Normal
expr.norm <- expression(italic(paste(displaystyle(f(x)~"="~frac(1,sqrt(2*pi*sigma^scriptscriptstyle("2")))~e^{frac(-1,2*sigma^{scriptscriptstyle("2")})*(x-mu)^scriptscriptstyle("2")})~~~~
    displaystyle(list(paste(-infinity<x) <infinity, paste(-infinity<mu) <infinity, paste(0<sigma^scriptscriptstyle("2")) <infinity)))))

# Uniform
expr.unif <- expression(italic(paste(displaystyle(f(x)~"="~frac(1,b-a)~~~~displaystyle(paste(-infinity<paste(a<=paste(x<=b))) <infinity)))))

# t-distribution
expr.t <- expression(italic(paste(displaystyle(f(x)~"="~frac(Gamma~bgroup("(",frac(nu+1,2),")"),sqrt(nu*pi)~Gamma~bgroup("(",frac(nu,2),")"))~bgroup("(",1+frac(x^2,nu),")")^{-frac(nu+1,2)})~~~~displaystyle(list(paste(-infinity<x)  0)))))

# f-distribution
expr.F <- expression(italic(paste(displaystyle(f(x)~"="~frac(Gamma~bgroup("(",frac(nu[1]+nu[2],2),")"),Gamma~bgroup("(",frac(nu[1],2),")")~Gamma~bgroup("(",frac(nu[2],2),")"))~bgroup("(",frac(nu[1],nu[2]),")")^{frac(nu[1],2)}~frac(x^{frac(nu[1],2)-1},bgroup("(",1+frac(d[1],d[2])*x,")")^{frac(d[1]+d[2],2)}))~~~~displaystyle(paste(0<=x)  0))))

# gamma
expr.gam <- expression(italic(paste(displaystyle(f(x)~"="~frac(beta^alpha,Gamma(alpha))*x^{alpha-1}*e^{-beta*x})~~~~displaystyle(list(paste(0<x) <infinity, paste(0<alpha) <infinity, paste(0<beta) <infinity)))))

# exponential
expr.exp <- expression(italic(paste(displaystyle(f(x)~"="~lambda*e^{-lambda*x})~~~~displaystyle(list(paste(0<=x) 0)))))

# chi-squared
expr.chisq <- expression(italic(paste(frac(1,2^{frac(nu,2)}*Gamma~bgroup("(",frac(nu,2),")"))*x^{frac(nu,2)-1}*e^{-frac(x,2)}
~~~~displaystyle(list(paste(0<=x) <infinity, nu %in% bold(N))))))

# log-normal
expr.lnorm <- expression(italic(paste(displaystyle(f(x)~"="~frac(1,x*sigma*sqrt(2*pi))~e^{-frac((log(x)-mu)^2,2*sigma^2)})~~~~displaystyle(list(paste(0<x) <infinity, paste(-infinity<log(mu)) <infinity, paste(0<sigma^scriptscriptstyle("2")) <infinity)))))

# beta
expr.beta <- expression(italic(paste(displaystyle(f(x)~"="~frac(Gamma(alpha+beta),Gamma(alpha)*Gamma(beta))*x^{alpha-1}*(1-x)^{beta-1})~~~~displaystyle(list(paste(0<=x) <=1, paste(0<alpha) <infinity, paste(0<beta) <infinity)))))

par(mar=c(0,0,2,0), mfrow=c(8,1),xaxt='n', yaxt='n')
plot(1,1,type='n',axes=FALSE)
    title('Normal')
    text(1,1,label=expr.norm, main='Normal')
#plot(1,1,pch=F)
#title('Uniform')
#    text(1,1,label=expr.unif)
plot(1,1,type='n',axes=FALSE)
title('t-distribution')
    text(1,1,label=expr.t)
plot(1,1,type='n',axes=FALSE)
title('F-distribution')
    text(1,1,label=expr.F)
plot(1,1,type='n',axes=FALSE)
title('Gamma')
    text(1,1,label=expr.gam)
plot(1,1,type='n',axes=FALSE)
title('exponential')
    text(1,1,label=expr.exp)
plot(1,1,type='n',axes=FALSE)
title(bquote(chi^2-'distribution'))
    text(1,1,label=expr.chisq)
plot(1,1,type='n',axes=FALSE)
title('log-normal')
    text(1,1,label=expr.lnorm)
plot(1,1,type='n',axes=FALSE)
title('Beta')
    text(1,1,label=expr.beta)

AnnotDistribExpression.png

Map (Cloropleth map with ggplot)

Here is an easy way to create a cloropleth map using shapefiles and ggplot2. The shapefile used in the example can be found here.

## =====================================================
## Author: Diogo A. Ferrari 
## 
## Note for linux users:
## It may be necessary to install the c library libgeos. So run on terminal:
##  sudo apt-get install libgeos-dev
## You need that library if the installation of the R package rgeos returns an error.
## =====================================================


## {{{ packages }}}


if (!require("magrittr")) install.packages('magrittr', repos='http://cran.us.r-project.org'); library(magrittr)
if (!require("dplyr")) install.packages('dplyr', repos='http://cran.us.r-project.org'); library(dplyr)
## to read shapefiles and manipulate the shapefile information
if (!require("rgeos")) install.packages('rgeos', repos='http://cran.us.r-project.org'); library(rgeos)
if (!require("maptools")) install.packages('maptools', repos='http://cran.us.r-project.org'); library(maptools)
## to plot
if (!require("ggplot2")) install.packages('ggplot2', repos='http://cran.us.r-project.org'); library(ggplot2)


## }}}
## {{{ loading shapefiles }}}

shpfile = file.path('shapefiles/estados_2010.shp')
ufs = maptools::readShapePoly(shpfile)

## }}}
## {{{ loading data }}}

## lets create a fake data set to use in the example. The polygons are Brazilian states (in Portuguese estados, or UF, which stands for Federal Units), identified by their acronym (sigla means acronym in Portuguese )
set.seed(666)
data = tibble::data_frame(sigla = c("AC", "AL", "AM", "AP", "BA", "CE", "DF", "ES", "GO", "MA", "MG", "MS", "MT", "PA", "PB", "PE", "PI", "PR", "RJ", "RN", "RO", "RR", "RS", "SC", "SE", "SP", "TO"),
                           info = runif(27,-.5,.7)) 


## }}}
## {{{ prepare data to plot }}}

## see what information is associated with polygons in the shapefile R object:
ufs@data %>% head(.)

## 1. to use ggplot, we need to tidy the shapefile (select an id, which here is columns "UF" in uf@data)
ufs.gg % rgeos::gCentroid(., byid=TRUE, id=.@data$sigla) %>%
    data.frame(sigla=row.names(.),. )  %>%
    dplyr::rename(cent.long = x, cent.lat = y) 

## 3. we finnaly merge the data associated with the ids (here, UF)
ufs.gg = ufs.gg %>% dplyr::full_join(., data , by=c("id"="sigla")) 


## }}}

Warning message:
use rgdal::readOGR or sf::st_read
  id     nome sigla regiao_id codigo_ibg
0  1     Acre    AC         3         12
1  2  Alagoas    AL         4         27
2  3 Amazonas    AM         3         13
3  4 Amap\xe1    AP         3         16
4  5    Bahia    BA         4         29
5  6 Cear\xe1    CE         4         23

Now the objects are ready to plot:

ufs.gg  %>% tibble::as_data_frame(.)    # the polygons with the data information
# A tibble: 85,585 x 8
    long   lat order hole  piece group id     info
          
 1 -73.6 -7.20     1 F     1     AC.1  AC    0.429
 2 -72.9 -7.53     2 F     1     AC.1  AC    0.429
 3 -72.7 -7.62     3 F     1     AC.1  AC    0.429
 4 -72.7 -7.62     4 F     1     AC.1  AC    0.429
 5 -72.7 -7.62     5 F     1     AC.1  AC    0.429
 6 -72.2 -7.72     6 F     1     AC.1  AC    0.429
 7 -72.2 -7.72     7 F     1     AC.1  AC    0.429
 8 -72.2 -7.72     8 F     1     AC.1  AC    0.429
 9 -72.0 -7.77     9 F     1     AC.1  AC    0.429
10 -71.5 -7.89    10 F     1     AC.1  AC    0.429
# ... with 85,575 more rows
centroids %>% tibble::as_data_frame(.)  # the centroids of each polygon
# A tibble: 27 x 3
   sigla cent.long cent.lat
 *          
 1 AC        -70.4   - 9.31
 2 AL        -36.6   - 9.52
 3 AM        -64.7   - 4.18
 4 AP        -52.0     1.44
 5 BA        -41.7   -12.5 
 6 CE        -39.6   - 5.09
 7 DF        -47.8   -15.8 
 8 ES        -40.7   -19.6 
 9 GO        -49.6   -16.0 
10 MA        -45.3   - 5.05
# ... with 17 more rows

So can create the plots using ggplot:

## plot shapes only
ufs.gg %>%
    ggplot2::ggplot(.) +
    ggplot2::geom_polygon(aes(x = long, y = lat, group = group), colour = "black", fill = NA)+
    ggthemes::theme_map()

shapes.png

… include the labels for each polygon

ufs.gg %>%
    ggplot2::ggplot(.) +
    ggplot2::geom_polygon(aes(x = long, y = lat, group = group), colour = "black", fill=NA)+
    ggrepel::geom_text_repel(data = centroids, aes(x=cent.long, y=cent.lat, label=sigla))+
    ggthemes::theme_map()

shapes-with-labels.png

… and finally create the cloropleth map with polygons colored using variable info in the data:

## names of the polygons (states) and colors by info in data
ufs.gg %>%
    ggplot2::ggplot(.) +
    ggplot2:: geom_polygon(aes(x = long, y = lat, group = group, fill=info), colour = "black")+
    ggrepel::geom_text_repel(data = centroids, aes(x=cent.long, y=cent.lat, label=sigla), fontface="bold", size=4)+
    ggthemes::theme_map()+
    ## legend
    scale_fill_gradient2(low="blue", mid="white", high="darkred", # colors in the extremes
                         midpoint=0,                              # value of the midpoint
                         breaks=seq(-1,1,length=5),               # values displayed in the legend
                         limits=c(-1,1),                          # legend limit values
                         name = "Information"                     # Title of the legend
                         )+
    guides(fill = guide_colorbar(title.position="top",           # Position of the title of legend
                                 title.hjust = 0.5,              # justification of the title relative to legend bar
                                 barwidth=15,                    # width of legend bar
                                 barheight=1.2                   # height of legend bar
                                 ))+
    theme(legend.position = c(0.1, 0.2),                         # position of legend (inside the plot)
          legend.direction = 'horizontal'                        # legend direction
          )

shapes-with-colors.png

… or we can have a different set up for the legend:

## names of the polygons (states) and colors by info in data
ufs.gg %>%
    ggplot2::ggplot(.) +
    ggplot2:: geom_polygon(aes(x = long, y = lat, group = group, fill=info), colour = "black")+
    ggrepel::geom_text_repel(data = centroids, aes(x=cent.long, y=cent.lat, label=sigla), fontface="bold", size=4)+
    ggthemes::theme_map()+
    ggplot2::theme(legend.position = "bottom") +
    scale_fill_gradient2(low="blue", mid="white", high="darkred", # colors in the extremes
                         midpoint=0,                              # value of the midpoint
                         breaks=seq(-1,1,length=5),               # values displayed in the legend
                         limits=c(-1,1),                          # legend limit values
                         name = "Information"                     # Title of the legend
                         )+
    guides(fill = guide_colorbar(title.position="top",           # Position of the title of legend
                                 title.hjust = 0,                # justification of the title relative to legend bar
                                 barwidth=40,                    # width of legend bar
                                 title.theme = element_text(     # title of the plot (text settings)
                                     size=15,
                                     face='italic',
                                     angle=0),
                                 label.theme = element_text(     # labels in the plot bar (text settings)
                                     size=10,
                                     angle=0),
                                 ))

shapes-with-color-legend-bottom.png

Issues with packages

Installing Car in Ubuntu

If the regular instalation does not work, try

sudo apt-get install r-cran-car

Leave a Reply

Fill in your details below or click an icon to log in:

WordPress.com Logo

You are commenting using your WordPress.com account. Log Out /  Change )

Google+ photo

You are commenting using your Google+ account. Log Out /  Change )

Twitter picture

You are commenting using your Twitter account. Log Out /  Change )

Facebook photo

You are commenting using your Facebook account. Log Out /  Change )

Connecting to %s