R demo (Dag van het Onderzoek, 2014)
# ----------------------------------------------------------------------------- # R SESSION FOR PRESENTATION ON "DATA ANALYSIS IN CORPUS LINGUISTICS" AT # THE KU LEUVEN, DEPT. OF LINGUISTICS, "DAG VAN HET ONDERZOEK" (MARCH 18, 2014) # (Dirk Speelman) # ----------------------------------------------------------------------------- # specify location of datasets DATAFOLDER <- "http://wwwling.arts.kuleuven.be/qlvl/datasets/" # load packages that we will use library(MASS) # needed for isoMDS(...) library(car) # needed for Anova(...) library(effects) # needed for plot(effect(...)) and plot(allEffects(...)) library(party) # needed for ctree(...) and plot(ctree(...)) library(lme4) # needed for glmer(...) and print(glmer(...)) # ----------------------------------------------------------------------------- # (LOGISTIC) REGRESSION (AND CONDITIONAL INFERENCE TREES) # ----------------------------------------------------------------------------- # read the data d <- read.table(paste0(DATAFOLDER, "noemenheten.txt"), header=TRUE, sep="\t") # look at the data head(d) # an alternative is to use edit(d) for a spreadsheet-like representation # fit a first logistic regression model (no interactions) and show the results fit1.glm <- glm(variant ~ conver.type + age + region + occup.type, data=d, family=binomial) summary(fit1.glm) # relative importance of predictors Anova(fit1.glm) # the same visually vals <- Anova(fit1.glm)[["LR Chisq"]] names(vals) <- rownames(Anova(fit1.glm)) vals <- sort(vals) dotchart(vals, pch=16, xlim=range(0, vals), main="importance of variables", xlab="model Chisq") # plot effects in model plot(allEffects(fit1.glm)) # fit a second logistic regression model (one interaction) and show the results fit2.glm <- glm(variant ~ conver.type + age*region + occup.type, data=d, family=binomial) summary(fit2.glm) # relative importance of predictors Anova(fit2.glm) # the same visually vals <- Anova(fit2.glm)[["LR Chisq"]] names(vals) <- rownames(Anova(fit2.glm)) vals <- sort(vals) dotchart(vals, pch=16, xlim=range(0, vals), main="importance of variables", xlab="model Chisq") # plot effects in model plot(allEffects(fit2.glm)) # plot only the interaction plot(effect("age*region", fit2.glm)) # fit mixed model with speaker as random effect fit.glmer <- glmer(variant ~ conver.type + age*region + occup.type + (1|speaker), data=d, family=binomial) print(fit.glmer, corr=FALSE) # relative importance of predictors Anova(fit.glmer) # the same visually vals <- Anova(fit.glmer)[["Chisq"]] names(vals) <- rownames(Anova(fit.glmer)) vals <- sort(vals) dotchart(vals, pch=16, xlim=range(0, vals), main="importance of variables", xlab="Chisq") # plot effects in model plot(allEffects(fit.glmer)) # plot only the interaction plot(effect("age*region", fit.glmer)) # calculate and plot conditional inference tree fit.ctree = ctree(variant ~ conver.type + generation + region + occup.type, data=d) plot(fit.ctree) # ----------------------------------------------------------------------------- # MULTIDIMENSIONAL SCALING # ----------------------------------------------------------------------------- # read info on subcorpora d <- read.table(paste0(DATAFOLDER, "milan-subcorpora.txt"), header=TRUE) # look at the data in d head(d) # an alternative is to use edit(d) for a spreadsheet-like representation # read distance matrix based on all profiles dst <- as.dist(read.table(paste0(DATAFOLDER, "milan-dists-glob.txt"))) # look at the data in dst as.matrix(dst)[1:6,1:6] # an alternative is to use edit(as.matrix(dst)) # perfom the isoMDS analysis and store the result in dst.isoMDS dst.isoMDS <- isoMDS(dst,k=2) # -------------------------------------------------------------- # show the MDS solution, identifying the component of each item # plot(dst.isoMDS$points, type="n", xlab="dim 1", ylab="dim 2") text(dst.isoMDS$points, labels=d$comp, col=as.numeric(d$comp)) # --------------------------------------------------------------- # highlighting the components one by one, allowing the researcher to # click on items to individually identify them, and to press# to go to the component # (close the graphics window before running the following code) for (comp in levels(d$comp)) { plot(dst.isoMDS$points, type="n", xlab="dim 1", ylab="dim 2", main=comp) cols <- rep("gray", nrow(d)) cols[d$comp == comp] <- "blue" text(dst.isoMDS$points, labels=d$comp, col=cols) identify(dst.isoMDS$points, labels=d$name) } # --------------------------------------------------------------- # highlighting the components one by one, allowing the researcher to # click on items to individually identify them, and to press # to go to the component # (close the graphics window before you run the following code) for (comp in levels(d$comp)) { plot(dst.isoMDS$points, type="n", xlab="dim 1", ylab="dim 2", main=comp) labs <- as.character(d$comp) cols <- rep("gray", nrow(d)) labs[d$comp == comp & d$edu == "H"] <- "H" cols[d$comp == comp & d$edu == "H"] <- "blue" labs[d$comp == comp & d$edu == "L"] <- "L" cols[d$comp == comp & d$edu == "L"] <- "red" text(dst.isoMDS$points, labels=labs, col=cols) identify(dst.isoMDS$points, labels=d$name) } # read distance matrix based on "die" profiles dst <- as.dist(read.table(paste0(DATAFOLDER, "milan-dists-die.txt"))) # look at the data in dst as.matrix(dst)[1:6,1:6] # an alternative is to use edit(as.matrix(dst)) # perfom the isoMDS analysis and store the result in dst.isoMDS dst.isoMDS <- isoMDS(dst, k=2) # --------------------------------------------------------------- # highlighting the components one by one, allowing the researcher to # click on items to individually identify them, and to press # to go to the component # (close the graphics window before you run the following code) for (comp in levels(d$comp)) { plot(dst.isoMDS$points, type="n", xlab="dim 1", ylab="dim 2", main=comp) cols <- rep("gray", nrow(d)) cols[d$comp == comp] <- "blue" text(dst.isoMDS$points, labels=d$comp, col=cols) identify(dst.isoMDS$points, labels=d$name) } # --------------------------------------------------------------- # highlighting the regions one by one, allowing the researcher to # click on items to individually identify them, and to press # to go to the component # (close the graphics window before you run the following code) for (reg in levels(d$reg)) { plot(dst.isoMDS$points, type="n", xlab="dim 1", ylab="dim 2", main=reg) cols <- rep("gray", nrow(d)) cols[d$reg == reg] <- "blue" text(dst.isoMDS$points, labels=d$reg, col=cols) identify(dst.isoMDS$points, labels=d$name) }