# "Descriptive Statistics"
# "UQ SLC Digital Team"
#
# Setup ----
# Clear the current workspace (including hidden objects) so the script
# starts from a clean slate.
rm(list = ls(all.names = TRUE))
# Keep character columns as character vectors when building data frames.
options(stringsAsFactors = FALSE)
# Install only the packages that are not yet available, so that re-running
# the script does not reinstall every package each time. knitr is listed as
# well because the script loads it to render tables.
required_packages <- c("boot", "DescTools", "dplyr", "knitr",
                       "psych", "Rmisc", "stringr")
missing_packages <- setdiff(required_packages, rownames(installed.packages()))
if (length(missing_packages) > 0) {
  install.packages(missing_packages)
}
# Overview table: measures of central tendency and when each one is used.
library(knitr)
Means <- c("(Arithmetic) mean (average)", "Median (middle value)", "Mode (most frequent value)", "Geometric mean (average factor)", "Harmonic mean (average rate)")
Use <- c("Description of normally distributed numeric variables (most common measure of central tendency)", "Description of non-normal numeric variables or ordinal variables (skewed data or influential outliers)", "Description of nominal and categorical variables", "Description of dynamic processes such as growth rates", "Description of dynamic processes such as velocities")
# one row per measure: its name and its typical use
df <- data.frame(Means, Use)
kable(df, caption = "Measures of central tendency and their use.")
# Illustrate the arithmetic mean: four unequal bars (left panel) next to
# four bars that all have the same height (right panel).
par(mfrow = c(1, 2))
x1 <- c(2, 8, 4, 6)
x2 <- c(5, 5, 5, 5)
# barplot() returns the bar midpoints; they are reused to centre the value
# labels above each bar
mids1 <- barplot(x1, ylim = c(0, 10), axes = FALSE)
text(mids1, x1 + 1, x1)
mids2 <- barplot(x2, ylim = c(0, 10), axes = FALSE, col = rep("red", 4))
text(mids2, x2 + 1, x2)
# title above the right panel
mtext("(Arithmetic) Mean", 3, 1, at = 2.5)
# back to a single-panel layout
par(mfrow = c(1, 1))
library(knitr)
# number of words in each of the four sentences listed below
Words <- c(3, 40, 15, 87)
# the four sentences, one string per sentence
Sentences <- c(
  "Call me Ishmael",
  "Some years ago -- never mind how long precisely -- having little or no money in my purse, and nothing particular to interest me on shore, I thought I would sail about a little and see the watery part of the world.",
  "It is a way I have of driving off the spleen, and regulating the circulation.",
  "Whenever I find myself growing grim about the mouth; whenever it is a damp, drizzly November in my soul; whenever I find myself involuntarily pausing before coffin warehouses, and bringing up the rear of every funeral I meet; and especially whenever my hypos get such an upper hand of me, that it requires a strong moral principle to prevent me from deliberately stepping into the street, and methodically knocking people's hats off--then, I account it high time to get to sea as soon as I can."
)
# pair every sentence with its word count and render the table
df <- data.frame(Sentences = Sentences, Words = Words)
kable(
  x = df,
  caption = "Sentences of the first paragraph of Herman Melville's *Moby Dick* and the number of words in each sentence."
)
# create a numeric vector holding four word counts
frequencies <- c(3, 40, 15, 87)
# calculate the arithmetic mean
mean(frequencies)
# Illustrate the median: unsorted values (left panel) and the same values
# sorted with the middle (5th of 9) bar highlighted in red (right panel).
par(mfrow = c(1, 2))
x1 <- c(5, 2, 9, 7, 1, 3, 8, 4, 6)
x2 <- c(1, 2, 3, 4, 5, 6, 7, 8, 9)
# barplot() returns one midpoint per bar; using it keeps the number of label
# positions equal to the number of bars
mids1 <- barplot(x1, ylim = c(0, 11), axes = FALSE)
text(mids1, x1 + 1, x1)
mids2 <- barplot(x2, ylim = c(0, 11), axes = FALSE,
                 col = c(rep("grey", 4), "red", rep("grey", 4)))
text(mids2, x2 + 1, x2)
# place the title above the highlighted middle bar
mtext("Median value", 3, 1, at = mids2[5])
# back to a single-panel layout
par(mfrow = c(1, 1))
# Table: number of speakers per age group.
library(knitr)
# The first group is labelled "0-18" so that it is contiguous with "19-25"
# and agrees with the labels used for the bar plot of the same counts.
Age <- c("0-18", "19-25", "26-33", "34-41", "42-49", "50+")
Counts <- c(9, 160, 70, 15, 9, 57)
df <- data.frame(Age, Counts)
kable(df, caption = "Number of speakers across age groups in the private dialogue section of the Irish component of the  *International Corpus of English* (ICE)")
# Bar plot of the number of speakers per age group.
Age <- c("0-18", "19-25", "26-33", "34-41", "42-49", "50+")
Counts <- c(9, 160, 70, 15, 9, 57)
df <- data.frame(Age, Counts)
# barplot() returns the bar midpoints, reused below to align the labels
bar_mids <- barplot(df$Counts,
                    ylim = c(0, 200),
                    ylab = "Frequency",
                    xlab = "Age")
# age-group labels below the bars
mtext(df$Age, 1, 1, at = bar_mids)
# counts printed just above the bars
text(bar_mids, df$Counts + 10, df$Counts)
# frame around the plot region
box()
# create a vector of ranks: rank i is repeated as often as given below
ranks <- c(rep(1, 9), rep(2, 160), rep(3, 70), rep(4, 15), rep(5, 9), rep(6, 57))
# calculate the median rank
median(ranks)
# Two-panel bar plot: unsorted values (left panel) and the same values
# sorted with the last (tallest) bar highlighted in red and titled "Mode"
# (right panel).
par(mfrow = c(1, 2))
x1 <- c(5, 2, 9, 7, 1, 3, 8, 4, 6)
x2 <- c(1, 2, 3, 4, 5, 6, 7, 8, 9)
# barplot() returns one midpoint per bar; using it keeps the number of label
# positions equal to the number of bars
mids1 <- barplot(x1, ylim = c(0, 11), axes = FALSE)
text(mids1, x1 + 1, x1)
mids2 <- barplot(x2, ylim = c(0, 11), axes = FALSE,
                 col = c(rep("grey", 8), "red"))
text(mids2, x2 + 1, x2)
# place the title above the highlighted last bar
mtext("Mode", 3, 1, at = mids2[9])
# back to a single-panel layout
par(mfrow = c(1, 1))
library(knitr)
# speaker counts, listed in the same order as the places of residence below
Speakers <- c(98, 20, 110, 13, 19)
CurrentResidence <- c("Belfast", "Down", "Dublin (city)", "Limerick", "Tipperary")
# combine both vectors into a two-column table and render it
df <- data.frame(CurrentResidence = CurrentResidence, Speakers = Speakers)
kable(
  x = df,
  caption = "Number of speakers across counties of current residency in the private dialogue section of the Irish component of the  *International Corpus of English* (ICE)"
)
# Bar plot of the number of speakers per place of current residence.
# Expects `df` with the columns CurrentResidence and Speakers.
# barplot() returns the bar midpoints, reused below to align the labels.
bar_mids <- barplot(df$Speakers,
                    ylim = c(0, 200),
                    ylab = "Frequency",
                    xlab = "Current residence")
# residence labels below the bars
mtext(df$CurrentResidence, 1, 1, at = bar_mids)
# counts printed just above the bars
text(bar_mids, df$Speakers + 10, df$Speakers)
# frame around the plot region
box()
# create a character vector with the current residence of each speaker
CurrentResidence <- c(rep("Belfast", 98),         # repeat "Belfast" 98 times
                      rep("Down", 20),            # repeat "Down" 20 times
                      rep("Dublin (city)", 110),  # repeat "Dublin (city)" 110 times
                      rep("Limerick", 13),        # repeat "Limerick" 13 times
                      rep("Tipperary", 19))       # repeat "Tipperary" 19 times
# calculate mode: tabulate the values and extract the name of the most
# frequent one
names(which.max(table(CurrentResidence)))
library(knitr)
# yearly changes of the two packages, stored as display strings
Package1 <- c("+5%", "-5%", "+5%", "-5%")
Package2 <- c("+20%", "-20%", "+20%", "-20%")
Year <- c("Year 1", "Year 2", "Year 3", "Year 4")
# one row per year, one column per package
df <- data.frame(Year = Year, Package1 = Package1, Package2 = Package2)
kable(x = df, caption = "Performance of two stock packages.")
# Relative frequencies of discourse particles for five speakers in two
# corpora: a long-format table (particletable) and a 2 x 5 matrix with the
# same values for plotting (barplotdata).
library(knitr)
library(dplyr)
Corpus <- c(rep("C1", 5),
            rep("C2", 5))
Speaker <- rep(c("A", "B", "C", "D", "E"), 2)
Frequency <- c(5.2, 11.4, 27.1, 13.7, 9.6, 0.2, 0.0, 1.1, 93.7, 0.4)
particletable <- data.frame(Corpus, Speaker, Frequency)
# per-speaker values (corpus 1, corpus 2); these must mirror `Frequency`
A <- c(5.2, 0.2)
B <- c(11.4, 0.0)
# NOTE(review): corpus-2 value set to 1.1 to agree with `Frequency` above
# (it was 0.1 here) -- confirm which of the two values is the intended one
C <- c(27.1, 1.1)
D <- c(13.7, 93.7)
E <- c(9.6, 0.4)
barplotdata <- data.frame(A, B, C, D, E)
barplotdata <- as.matrix(barplotdata)
rownames(barplotdata) <- c("Corpus1", "Corpus2")
kable(particletable,
      caption = "Relative frequencies of discourse particles per speaker in two corpora")
# Grouped bar plot of `barplotdata` (a matrix with one row per corpus and
# one column per speaker). barplot() returns the bar midpoints in the same
# shape as the matrix; they are reused to place the value labels.
bar_mids <- barplot(barplotdata,
        main = "Use of discourse particles across two corpora", # add title
        xlab = "Speaker",                  # add x-axis label
        ylab = "Relative Frequency",       # add y-axis label
        ylim = c(0, 110),                  # define y-axis range
        col = c("darkblue", "red"),        # define colors
        legend = rownames(barplotdata),    # add a legend
        beside = TRUE,                     # draw bars side by side (not stacked)
        las = 1                            # tick labels horizontal
        )                                  # end barplot
box()                                      # create a box around panel
# print each value slightly above its bar
text(bar_mids, barplotdata + 5, barplotdata)
library(knitr)
# twelve monthly values followed by a final "Mean" row (10.25 for both cities)
CityA <- c(-5, -12, 5, 12, 15, 18, 22, 23, 20, 16, 8, 1, 10.25)
CityB <- c(7, 7, 8, 9, 10, 13, 15, 15, 13, 11, 8, 7, 10.25)
Month <- c("January", "February", "March", "April", "May", "June", "July", "August", "September", "October", "November", "December", "Mean")
# one row per month (plus the mean row), one column per city
temprature <- data.frame(Month = Month, CityA = CityA, CityB = CityB)
kable(x = temprature, caption = "Average temperature in two cities by month")
# Monthly values as two-element vectors: first element city A, second city B.
January <- c(-5.00,  7.00)
February <- c(-12.00,  7.00)
March <- c(5.00,  8.00)
April <- c(12.00,  9.00)
May <- c(15.00, 10.00)
June <- c(18.00, 13.00)
July <- c(22.00, 15.00)
August <- c(23.00, 15.00)
September <- c(20.00, 13.00)
October <- c(16.00, 11.00)
November <- c(8.00,  8.00)
December <- c(1.00,  7.00)
# combine into a 2 x 12 matrix: one row per city, one column per month
lineplotdata <- data.frame(January, February, March, April, May, June, July, August, September, October, November, December)
lineplotdata <- as.matrix(lineplotdata)
# label the rows after the two cities
rownames(lineplotdata) <- c("CityA", "CityB")
# Line plot of the two rows of `lineplotdata` (row 1 = city A, row 2 =
# city B) across its twelve columns.
plot(lineplotdata[1, ],
     type = "l",
     col = "red",
     axes = FALSE,            # axes are drawn manually below
     ylab = "Temperature",
     xlab = "Month",
     ylim = c(-15, 30),
     lty = 1)
# add the second row as a dashed blue line
lines(lineplotdata[2, ],
      col = "darkblue",
      lty = 2)
# x-axis: one tick per column, labelled with the column (month) names
axis(1, 1:12, colnames(lineplotdata), cex = .75, las = 2)
# y-axis: ticks from -15 to 30 in steps of 5
axis(2, seq(-15, 30, 5), seq(-15, 30, 5), las = 2)
box()
legend("topright",
       lty = c(1, 2),
       col = c("red", "darkblue"),
       legend = c("City A", "City B"),
       bty = "n")
# create a numeric vector with twelve values
cityA <- c(-5, -12, 5, 12, 15, 18, 22, 23, 20, 16, 8, 1)
# range: smallest and largest value
min(cityA)
max(cityA)
# quartiles: the interquartile range is the distance between the 1st and
# the 3rd quartile reported here
summary(cityA)
# variance: the squared standard deviation
sd(cityA)^2
# standard deviation; computed on the same 12-element vector as above
sd(cityA)
library(stringr)                          # activate stringr library
# collect the paths of all corpus files; directories are excluded because
# scan() can only read files
corpusfiles <- list.files(path = "data/ICEIrelandSample", pattern = NULL,
                          all.files = TRUE, full.names = TRUE,
                          recursive = TRUE, ignore.case = TRUE,
                          include.dirs = FALSE)
# load corpus: read every file as a vector of whitespace-separated tokens
corpus <- lapply(corpusfiles, function(x) {
  scan(x, what = "char", sep = "", quote = "", quiet = TRUE, skipNul = TRUE)
})
corpus <- unlist(corpus)                  # unlist corpus
corpus <- paste(corpus, sep = " ", collapse = " ")    # paste corpus together in one element
corpus <- gsub(" {2,}", " ", corpus)      # remove superfluous white spaces
corpus <- str_replace_all(corpus, ">.{1,20}</", "><") # replace short text between a tag and a closing tag
corpus <- str_replace_all(corpus, "<.{1,50}>", "")    # remove tags
corpus <- gsub(" {2,}", " ", corpus)      # remove superfluous white spaces
corpus <- str_trim(corpus, side = "both") # remove superfluous white spaces at string margins
corpus <- tolower(corpus)                 # convert to lower case
corpuswords <- strsplit(corpus, " ")      # split corpus into words
# tabulate the words once and keep the 20 most frequent ones
wordcounts <- table(corpuswords)
words <- wordcounts[order(wordcounts, decreasing = TRUE)][1:20]
# hard-coded part-of-speech tags, one per entry of `words`
# NOTE(review): presumably assigned by hand for this corpus sample -- verify
# whenever the corpus changes
pos <- c("PPR", "ART", "PPR", "CON", "OTH", "PPR", "OTH", "ART", "V", "DPR", "PRP", "PRP", "PPR", "V", "V", "PPR", "PPR", "CON", "DPR", "ART")
wordstb <- data.frame(names(words), pos, as.vector(words))
colnames(wordstb) <- c("WordType", "POS", "Frequency")
library(knitr)
kable(wordstb, caption = "20 most frequent words in the sample corpus of ICE Ireland.")
# standard error of the mean: standard deviation divided by the square root
# of the number of non-missing observations
sd(wordstb$Frequency, na.rm = TRUE) / sqrt(sum(!is.na(wordstb$Frequency)))
# load library
library(psych)
# descriptive summary of the frequencies; `type = 2` selects the formula
# used for skew and kurtosis
describe(wordstb$Frequency, type = 2)
# load library
library(stringr)
# collect the paths of all corpus files; directories are excluded because
# scan() can only read files
corpusfiles <- list.files(path = "data/ICEIrelandSample", pattern = NULL,
                          all.files = TRUE, full.names = TRUE,
                          recursive = TRUE, ignore.case = TRUE,
                          include.dirs = FALSE)
# load corpus: read every file as a vector of whitespace-separated tokens
corpus <- lapply(corpusfiles, function(x) {
  scan(x, what = "char", sep = "", quote = "", quiet = TRUE, skipNul = TRUE)
})
corpus <- unlist(corpus)                  # unlist corpus
corpus <- paste(corpus, sep = " ", collapse = " ")    # paste corpus together in one element
corpus <- gsub(" {2,}", " ", corpus)      # remove superfluous white spaces
corpus <- str_replace_all(corpus, ">.{1,20}</", "><") # replace short text between a tag and a closing tag
corpus <- str_replace_all(corpus, "<.{1,50}>", "")    # remove tags
corpus <- gsub(" {2,}", " ", corpus)      # remove superfluous white spaces
corpus <- str_trim(corpus, side = "both") # remove superfluous white spaces at string margins
corpus <- tolower(corpus)                 # convert to lower case
corpuswords <- strsplit(corpus, " ")      # split corpus into words
# tabulate the words once and keep the 20 most frequent ones
wordcounts <- table(corpuswords)
words <- wordcounts[order(wordcounts, decreasing = TRUE)][1:20]
# hard-coded part-of-speech tags, one per entry of `words`
# NOTE(review): presumably assigned by hand for this corpus sample -- verify
# whenever the corpus changes
pos <- c("PPR", "ART", "PPR", "CON", "OTH", "PPR", "OTH", "ART", "V", "DPR", "PRP", "PRP", "PPR", "V", "V", "PPR", "PPR", "CON", "DPR", "ART")
wordstb <- data.frame(names(words), pos, as.vector(words))
colnames(wordstb) <- c("WordType", "POS", "Frequency")
library(knitr)
kable(wordstb, caption = "20 most frequent words in the sample corpus of ICE Ireland.")
# Confidence intervals for the mean of wordstb$Frequency, obtained in
# several ways. Both packages are attached once up front.
library(Rmisc)
library(DescTools)

# one-sample t-test: reports the mean together with its 95% interval
t.test(wordstb$Frequency, conf.level = 0.95)

# Rmisc: mean with upper and lower interval limits
CI(wordstb$Frequency, ci = 0.95)

# DescTools: mean with interval limits
MeanCI(wordstb$Frequency, conf.level = 0.95)

# DescTools: bootstrapped interval for the mean, based on 1000 resamples
MeanCI(wordstb$Frequency, method = "boot", type = "norm", R = 1000)

# the same bootstrapped interval is requested a second time
MeanCI(wordstb$Frequency, method = "boot", type = "norm", R = 1000)

# Rmisc: summary of Frequency per POS group (the function name and the
# `conf.interval` argument indicate it reports a standard error and a
# 95% confidence interval)
summarySE(
  data = wordstb,            # data frame holding the values
  measurevar = "Frequency",  # variable representing frequencies
  groupvars = "POS",         # grouping variable
  conf.interval = 0.95       # width of the confidence interval
)
# attach the boot package
library(boot)

# Statistic passed to boot(): for the resample selected by `index`, returns
# the mean and the variance divided by the number of selected observations.
BootFunction <- function(x, index) {
  resample <- x[index]
  c(mean(resample), var(resample) / length(index))
}

# draw 1000 bootstrap replicates of the statistic from the word frequencies
Bootstrapped <- boot(data = wordstb$Frequency,
                     statistic = BootFunction,
                     R = 1000)

# average of the bootstrapped means (first column of the replicates)
mean(Bootstrapped$t[, 1])

# bootstrap confidence intervals at the 95% level
boot.ci(Bootstrapped, conf = 0.95)
# Exact binomial test for 2 successes in 20 trials against p = 0.5.
binom.test(2, 20, 0.5,              # binom.test(x, n, p = 0.5, ...)
           alternative = "two.sided", # define sidedness
           conf.level = 0.95)         # define confidence level
# The same kind of test computed from raw observations: one value per line
# below a header line.
Input <- ("
 Paw
 right
 left
 right
 right
 right
 right
 left
 right
 right
 right  
")
# read.table(text = ...) parses the string directly and closes its internal
# connection itself, so no open text connection is left behind
Gus <- read.table(text = Input, header = TRUE)
Successes <- sum(Gus$Paw == "left")      # count the "left" observations
Failures  <- sum(Gus$Paw == "right")     # count the "right" observations
Total <- Successes + Failures
Expected <- 0.5                          # proportion under the null hypothesis
binom.test(Successes, Total, Expected,
           alternative = "two.sided",
           conf.level = 0.95)
# attach DescTools once; both interval functions below come from it
library(DescTools)

# observed counts for the multinomial example
observed <- c(35, 74, 22, 69)

# confidence interval for a binomial proportion: 2 successes in 20 trials,
# 95% level, "modified wilson" method
BinomCI(2, 20, conf.level = 0.95, method = "modified wilson")

# confidence intervals for the multinomial proportions of the observed
# counts: 95% level, "goodman" method
MultinomCI(observed, conf.level = 0.95, method = "goodman")
# References