R demo (Dag van het Onderzoek, 2014)

 

# -----------------------------------------------------------------------------
# R SESSION FOR PRESENTATION ON "DATA ANALYSIS IN CORPUS LINGUISTICS" AT
# THE KU LEUVEN, DEPT. OF LINGUISTICS, "DAG VAN HET ONDERZOEK" (MARCH 18, 2014)
# (Dirk Speelman)
# -----------------------------------------------------------------------------

# specify location of datasets

DATAFOLDER <- "http://wwwling.arts.kuleuven.be/qlvl/datasets/"

# load packages that we will use

library(MASS)    # needed for isoMDS(...)
library(car)     # needed for Anova(...)
library(effects) # needed for plot(effect(...)) and plot(allEffects(...)) 
library(party)   # needed for ctree(...) and plot(ctree(...))
library(lme4)    # needed for glmer(...) and print(glmer(...))

# -----------------------------------------------------------------------------
# (LOGISTIC) REGRESSION (AND CONDITIONAL INFERENCE TREES)
# -----------------------------------------------------------------------------

# read the data
d <- read.table(paste0(DATAFOLDER, "noemenheten.txt"), header=TRUE, sep="\t")

# look at the data
head(d) # an alternative is to use edit(d) for a spreadsheet-like representation

# fit a first logistic regression model (no interactions) and show the results
fit1.glm <- glm(variant ~ conver.type + age + region + occup.type, data=d, 
                family=binomial)
summary(fit1.glm)

# relative importance of predictors
Anova(fit1.glm)

# the same visually
vals <- Anova(fit1.glm)[["LR Chisq"]]
names(vals) <- rownames(Anova(fit1.glm))
vals <- sort(vals)
dotchart(vals, pch=16, xlim=range(0, vals), main="importance of variables",
         xlab="model Chisq")

# plot effects in model
plot(allEffects(fit1.glm))

# fit a second logistic regression model (one interaction) and show the results
fit2.glm <- glm(variant ~ conver.type + age*region + occup.type, data=d,
                family=binomial)
summary(fit2.glm)

# relative importance of predictors
Anova(fit2.glm)

# the same visually
vals <- Anova(fit2.glm)[["LR Chisq"]]
names(vals) <- rownames(Anova(fit2.glm))
vals <- sort(vals)
dotchart(vals, pch=16, xlim=range(0, vals), main="importance of variables",
         xlab="model Chisq")

# plot effects in model
plot(allEffects(fit2.glm))

# plot only the interaction
plot(effect("age*region", fit2.glm))

# fit mixed model with speaker as random effect
fit.glmer <- glmer(variant ~ conver.type + age*region + occup.type +
                  (1|speaker), data=d, family=binomial)
print(fit.glmer, corr=FALSE)

# relative importance of predictors
Anova(fit.glmer)

# the same visually
vals <- Anova(fit.glmer)[["Chisq"]]
names(vals) <- rownames(Anova(fit.glmer))
vals <- sort(vals)
dotchart(vals, pch=16, xlim=range(0, vals), main="importance of variables",
         xlab="Chisq")

# plot effects in model
plot(allEffects(fit.glmer))

# plot only the interaction
plot(effect("age*region", fit.glmer))

# calculate and plot conditional inference tree 
fit.ctree = ctree(variant ~ conver.type + generation + region + occup.type, 
                  data=d)
plot(fit.ctree)

# -----------------------------------------------------------------------------
# MULTIDIMENSIONAL SCALING
# -----------------------------------------------------------------------------

# read info on subcorpora
d  <- read.table(paste0(DATAFOLDER, "milan-subcorpora.txt"), header=TRUE) 

# look at the data in d
head(d) # an alternative is to use edit(d) for a spreadsheet-like representation  

# read distance matrix based on all profiles
dst <- as.dist(read.table(paste0(DATAFOLDER, "milan-dists-glob.txt")))

# look at the data in dst
as.matrix(dst)[1:6,1:6] # an alternative is to use edit(as.matrix(dst))  

# perfom the isoMDS analysis and store the result in dst.isoMDS
dst.isoMDS <- isoMDS(dst,k=2)

# --------------------------------------------------------------
# show the MDS solution, identifying the component of each item
#

plot(dst.isoMDS$points, type="n", xlab="dim 1", ylab="dim 2")
text(dst.isoMDS$points, labels=d$comp, col=as.numeric(d$comp))

# ---------------------------------------------------------------
# highlighting the components one by one, allowing the researcher to
# click on items to individually identify them, and to press 
# to go to the component
# (close the graphics window before running the following code)

for (comp in levels(d$comp)) {
  plot(dst.isoMDS$points, type="n", xlab="dim 1", ylab="dim 2",
       main=comp)
  cols <- rep("gray", nrow(d))
  cols[d$comp == comp] <- "blue"
  text(dst.isoMDS$points, labels=d$comp, col=cols)  
  identify(dst.isoMDS$points, labels=d$name)
}

# ---------------------------------------------------------------
# highlighting the components one by one, allowing the researcher to
# click on items to individually identify them, and to press 
# to go to the component
# (close the graphics window before you run the following code)

for (comp in levels(d$comp)) {
  plot(dst.isoMDS$points, type="n", xlab="dim 1", ylab="dim 2",
       main=comp)
  labs <- as.character(d$comp)
  cols <- rep("gray", nrow(d))
  labs[d$comp == comp & d$edu == "H"] <- "H"
  cols[d$comp == comp & d$edu == "H"] <- "blue"
  labs[d$comp == comp & d$edu == "L"] <- "L"
  cols[d$comp == comp & d$edu == "L"] <- "red"
  text(dst.isoMDS$points, labels=labs, col=cols)  
  identify(dst.isoMDS$points, labels=d$name)
}

# read distance matrix based on "die" profiles
dst <- as.dist(read.table(paste0(DATAFOLDER, "milan-dists-die.txt")))

# look at the data in dst
as.matrix(dst)[1:6,1:6] # an alternative is to use edit(as.matrix(dst))  

# perfom the isoMDS analysis and store the result in dst.isoMDS
dst.isoMDS <- isoMDS(dst, k=2)

# ---------------------------------------------------------------
# highlighting the components one by one, allowing the researcher to
# click on items to individually identify them, and to press 
# to go to the component
# (close the graphics window before you run the following code)

for (comp in levels(d$comp)) {
  plot(dst.isoMDS$points, type="n", xlab="dim 1", ylab="dim 2",
       main=comp)
  cols <- rep("gray", nrow(d))
  cols[d$comp == comp] <- "blue"
  text(dst.isoMDS$points, labels=d$comp, col=cols)  
  identify(dst.isoMDS$points, labels=d$name)
}

# ---------------------------------------------------------------
# highlighting the regions one by one, allowing the researcher to
# click on items to individually identify them, and to press 
# to go to the component
# (close the graphics window before you run the following code)

for (reg in levels(d$reg)) {
  plot(dst.isoMDS$points, type="n", xlab="dim 1", ylab="dim 2",
       main=reg)
  cols <- rep("gray", nrow(d))
  cols[d$reg == reg] <- "blue"
  text(dst.isoMDS$points, labels=d$reg, col=cols)  
  identify(dst.isoMDS$points, labels=d$name)
}