> # \chapterX{Foreword} > > library(shipunov) package 'shipunov', version 1.11 > # \chapter{The data} > > # \section{Origin of the data} > > # \section{Population and sample} > > # \section{How to obtain the data} > > # \section{What to find in the data} > > # \subsection{Why do we need the data analysis} > > # \subsection{What data analysis can do} > > # \subsection{What data analysis cannot do} > > # \section{Answers to exercises} > > # \chapter{How to process the data} > > # \section{General purpose software} > > # \section{Statistical software} > > # \subsection{Graphical systems} > > # \subsection{Statistical environments} > > # \section{The very short history of the \S and \R} > > # \section{Use, advantages and disadvantages of the \R} > > # \section{How to download and install \R} > > # \section{How to start with \R} > > # \subsection{Launching \R} > > # \subsection{First steps} > > round(1.5, digits=0) [1] 2 > round(1.5, d=0) [1] 2 > round(d=0, 1.5) [1] 2 > round(1.5, 0) [1] 2 > round(1.5,) [1] 2 > round(1.5) [1] 2 > round(0, 1.5) [1] 0 > args(round) function (x, digits = 0) NULL > args(q) function (save = "default", status = 0, runLast = TRUE) NULL > # \subsection{How to type} > > round (1.5, digits=0) [1] 2 > round(1.5, digits=0) [1] 2 > round ( 1.5 , digits = 0 ) [1] 2 > # \subsection{Overgrown calculator} > > 2+2 [1] 4 > 2+.2 [1] 2.2 > log(-1) Warning in log(-1) : NaNs produced [1] NaN > 100/0 [1] Inf > # \subsection{How to play with \R} > > bb <- matrix(1:9, ncol=3) > bb [,1] [,2] [,3] [1,] 1 4 7 [2,] 2 5 8 [3,] 3 6 9 > coordinates <- expand.grid(1:100, 1:100) > sampled.rows <- sample(1:nrow(coordinates), 100) > coordinates[sampled.rows, ] Var1 Var2 7672 72 77 8019 19 81 6675 75 67 69 69 1 6479 79 65 649 49 7 6514 14 66 4235 35 43 7554 54 76 8007 7 81 7108 8 72 8185 85 82 3377 77 34 6348 48 64 6024 24 61 2951 51 30 4097 97 41 6506 6 66 2518 18 26 4725 25 48 4800 100 48 5970 70 60 9438 38 95 415 15 5 6299 99 63 2329 29 24 1259 59 13 9657 57 97 2366 66 
24 5867 67 59 7934 34 80 6588 88 66 8569 69 86 1570 70 16 7815 15 79 6942 42 70 8976 76 90 7280 80 73 59 59 1 5311 11 54 2035 35 21 6195 95 62 1517 17 16 7634 34 77 5094 94 51 1013 13 11 8013 13 81 2699 99 27 9685 85 97 8090 90 81 9659 59 97 1175 75 12 1075 75 11 7430 30 75 5173 73 52 7599 99 76 904 4 10 394 94 4 9326 26 94 3180 80 32 2577 77 26 1805 5 19 7663 63 77 6754 54 68 2983 83 30 2701 1 28 3968 68 40 9276 76 93 2069 69 21 9839 39 99 6521 21 66 231 31 3 7661 61 77 4463 63 45 2764 64 28 2264 64 23 9226 26 93 6603 3 67 7780 80 78 1849 49 19 4185 85 42 3708 8 38 7467 67 75 657 57 7 2994 94 30 5685 85 57 2650 50 27 3032 32 31 5015 15 51 4251 51 43 6173 73 62 2834 34 29 5460 60 55 3881 81 39 9063 63 91 7997 97 80 5913 13 60 1449 49 15 5845 45 59 3951 51 40 > dice <- as.vector(outer(1:6, 1:6, paste)) > sample(dice, 4, replace=TRUE) [1] "4 3" "1 1" "1 5" "6 1" > sample(dice, 5, replace=TRUE) [1] "6 6" "2 4" "4 5" "5 1" "5 5" > cards <- paste(rep(c(6:10,"V","D","K","T"), 4), + c("Tr","Bu","Ch","Pi")) > sample(cards, 6) [1] "T Bu" "K Ch" "10 Tr" "K Pi" "V Ch" "D Pi" > sample(cards, 6) [1] "D Bu" "10 Tr" "T Ch" "10 Ch" "T Bu" "9 Pi" > # \section{\R and data} > > # \subsection{How to enter the data from within \R} > > c(1, 2, 3, 4, 5) [1] 1 2 3 4 5 > aa <- c(2, 3, 4, 5, 6, 7, 8) > aa [1] 2 3 4 5 6 7 8 > (aa <- c(1, 2, 3, 4, 5, 6, 7, 8, 9)) [1] 1 2 3 4 5 6 7 8 9 > aa [1] 1 2 3 4 5 6 7 8 9 > rep(1, 5) [1] 1 1 1 1 1 > rep(1:5, each=3) [1] 1 1 1 2 2 2 3 3 3 4 4 4 5 5 5 > seq(1, 5) [1] 1 2 3 4 5 > 1:5 [1] 1 2 3 4 5 > # \subsection{How to name your objects} > > # \subsection{How to load the text data} > > mydata <- read.table("data/mydata.txt", sep=";", head=TRUE) > str(mydata) 'data.frame': 3 obs. 
of 3 variables: $ a: int 1 4 7 $ b: int 2 5 8 $ c: int 3 6 9 > head(mydata) a b c 1 1 2 3 2 4 5 6 3 7 8 9 > # \subsection{How to load data from Internet} > > # \subsection{How to use \texttt{read.table()} properly} > > read.table("data/mydata1.txt", head=TRUE) a b c one 1 2 3 two 4 5 6 three 7 8 9 > read.table("data/mydata2.txt", sep="\t", quote="", head=TRUE) a b c one 1 2 3 two 4 5 6 three o'clock 7 8 9 > read.table("data/mydata3.txt", dec=",", se=";", h=T) a b c 1 1 2 3.5 2 4 5 6.0 3 7 8 9.0 > # \subsection{How to load binary data} > > xx <- "apple" > save(xx, file="xx.rd") # Save object "xx" > exists("xx") [1] TRUE > rm(xx) > exists("xx") [1] FALSE > dir() [1] "000todo" "data" [3] "Makefile" "old.zip" [5] "open" "pics" [7] "supp" "visual_statistics.aux" [9] "visual_statistics.log" "visual_statistics.out" [11] "visual_statistics.pdf" "visual_statistics.r" [13] "visual_statistics_rplots.pdf" "visual_statistics_rresults.txt" [15] "visual_statistics.tex" "visual_statistics.toc" [17] "xx.rd" > load("xx.rd") # Load object "xx" > xx [1] "apple" > # \subsection{How to load data from clipboard} > > # \subsection{How to edit data in \R} > > # \subsection{How to save the results} > > write.table(file="trees.txt", trees, row.names=FALSE, sep="\t", + quote=FALSE) > # \subsection{History and scripts} > > # \section{\R graphics} > > # \subsection{Graphical systems} > > pdf(file="pics/05430.pdf"); oldpar <- par(mar=c(4, 4, 2, 1)) > plot(1:20, main="Title") > legend("topleft", pch=1, legend="My wonderful points") > par(oldpar); dev.off() null device 1 > plot(1:20, type="n") > mtext("Title", line=1.5, font=2) > points(1:20) > legend("topleft", pch=1, legend="My wonderful points") > pdf(file="pics/05610.pdf"); oldpar <- par(mar=c(4, 4, 2, 1)) > plot(cars) > title(main="Cars from 1920s") > par(oldpar); dev.off() pdf 2 > plot(1:20, pch=3, col=6, main="Title") > pdf(file="pics/07420.pdf"); oldpar <- par(mar=c(0, 0, 0, 0)) > library(lattice) > xyplot(1:20 ~ 1:20, main="title") > 
par(oldpar); dev.off() pdf 2 > pdf(file="pics/05960.pdf"); oldpar <- par(mar=c(2, 2, 1, 1)) > library(ggplot2) > qplot(1:20, 1:20, main="title") > par(oldpar); dev.off() pdf 2 > # \subsection{Graphical devices} > > png(file="01_20.png", bg="transparent") > plot(1:20) > text(10, 20, "a") > dev.off() pdf 2 > pdf(file="01_20.pdf", width=8) > plot(1:20) > text(10, 20, "a") > dev.off() pdf 2 > # \subsection{Graphical options} > > pdf(file="pics/06060.pdf"); oldpar <- par(mar=c(2, 2, 0, 1)) > old.par <- par(mfrow=c(2, 1)) > hist(cars$speed, main="") > hist(cars$dist, main="") > par(old.par) > par(oldpar); dev.off() pdf 2 > hist(cars$speed, main="", xaxt="n", xlab="") > old.par <- par(new=TRUE) > hist(cars$dist, main="", axes=FALSE, xlab="", lty=2) > par(old.par) > # \subsection{Interactive graphics} > > # \section{Answers to exercises} > > eggs <- read.table("data/eggs.txt") > str(eggs) 'data.frame': 555 obs. of 3 variables: $ V1: int 51 48 44 48 46 43 48 46 49 45 ... $ V2: int 34 33 33 34 32 32 35 32 34 33 ... $ V3: chr "221" "221" "221" "221" ... 
> head(eggs) V1 V2 V3 1 51 34 221 2 48 33 221 3 44 33 221 4 48 34 221 5 46 32 221 6 43 32 221 > pdf(file="pics/09010.pdf"); oldpar <- par(mar=c(4, 4, 0, 1)) > plot(V2 ~ V1, data=eggs, xlab="Length, mm", ylab="Width, mm", + pch=21, bg="grey") > abline(line(eggs$V1, eggs$V2), lty=2, lwd=1.5) > lines(loess.smooth(eggs$V1, eggs$V2), col=2, lty=2, lwd=1.5) > legend("topleft", lty=2, col=1:2, lwd=1.5, legend=c( + "Tukey's median-median line", "LOESS curve")) > par(oldpar); dev.off() pdf 2 > pdf(file="pics/10320.pdf"); oldpar <- par(mar=c(2, 2, 2, 1)) > plot(1:20, col="green", xlab="", ylab="") > points(20:1, pch=0) > par(oldpar); dev.off() pdf 2 > # \chapter{Types of data} > > # \section{Degrees, hours and kilometers: measurement data} > > rnorm(10) [1] -0.856583317 -0.651605803 -0.007427443 -0.039881843 1.053649210 [6] 0.881785566 0.009527859 0.785361902 1.493052534 1.291616044 > runif(10) [1] 0.96716324 0.05713539 0.30067351 0.83834945 0.76931100 0.19325560 [7] 0.76148651 0.93578418 0.11441590 0.02459024 > pdf(file="pics/10490.pdf") > old.par <- par(mfrow=c(2, 1)) > hist(rnorm(100), main="Normal data") > hist(runif(100), main="Non-normal data") > par(old.par) > par(oldpar); dev.off() pdf 2 > pdf(file="pics/parametric_to_crop.pdf") > library(plotrix) > plot(c(-1, 1), c(-1, 1), type="n", xlab="", ylab="", axes=FALSE) > draw.circle(-.2, 0, .4) > draw.circle(.1, 0, .9) > text(-.2, 0, "parametric", cex=1.5) > text(.1, 0.6, "nonparametric", cex=1.5) > dev.off() pdf 2 > x <- c(174, 162, 188, 192, 165, 168, 172.5) > str(x) num [1:7] 174 162 188 192 165 ... 
> is.vector(x) [1] TRUE > is.numeric(x) [1] TRUE > sort(x) [1] 162.0 165.0 168.0 172.5 174.0 188.0 192.0 > rev(sort(x)) [1] 192.0 188.0 174.0 172.5 168.0 165.0 162.0 > pdf(file="pics/10247_to_crop.pdf"); oldpar <- par(mar=c(0, 0, 0, 0)) > library(vegan) Loading required package: permute This is vegan 2.5-6 > phanerozoic <- read.table("data/phanerozoic.txt") > with(phanerozoic, linestack(541-V2, labels=paste(V1, V2), cex=1)) > par(oldpar); dev.off() pdf 2 > # \section{Grades and t-shirts: ranked data} > > rr <- c(2, 1, 3, 3, 1, 1, 2) > str(rr) num [1:7] 2 1 3 3 1 1 2 > (hh <- cut(x, 3, labels=c(1:3), ordered_result=TRUE)) [1] 2 1 3 3 1 1 2 Levels: 1 < 2 < 3 > x [1] 174.0 162.0 188.0 192.0 165.0 168.0 172.5 > as.numeric(hh) [1] 2 1 3 3 1 1 2 > # \section{Colors, names and sexes: nominal data} > > # \subsection{Character vectors} > > sex <- c("male", "female", "male", "male", "female", "male", + "male") > is.character(sex) [1] TRUE > is.vector(sex) [1] TRUE > str(sex) chr [1:7] "male" "female" "male" "male" "female" "male" "male" > sex [1] "male" "female" "male" "male" "female" "male" "male" > sex[2:3] [1] "female" "male" > `[`(sex, 2:3) [1] "female" "male" > table(sex) sex female male 2 5 > # \subsection{Factors} > > sex.f <- factor(sex) > sex.f [1] male female male male female male male Levels: female male > pdf(file="pics/08830.pdf"); oldpar <- par(mar=c(2, 2, 1, 1)) > plot(sex.f) > par(oldpar); dev.off() pdf 2 > is.factor(sex.f) [1] TRUE > is.character(sex.f) [1] FALSE > str(sex.f) Factor w/ 2 levels "female","male": 2 1 2 2 1 2 2 > levels(sex.f) [1] "female" "male" > nlevels(sex.f) [1] 2 > as.numeric(sex.f) [1] 2 1 2 2 1 2 2 > w <- c(69, 68, 93, 87, 59, 82, 72) > pdf(file="pics/09290.pdf"); oldpar <- par(mar=c(4, 4, 0, 1)) > plot(x, w, pch=as.numeric(sex.f), col=as.numeric(sex.f), + xlab="Height, cm", ylab="Weight, kg") > legend("topleft", pch=1:2, col=1:2, legend=levels(sex.f)) > par(oldpar); dev.off() pdf 2 > palette() [1] "black" "#DF536B" "#61D04F" "#2297E6" 
"#28E2E5" "#CD0BBC" "#F5C710" [8] "gray62" > (ff <- factor(3:5)) [1] 3 4 5 Levels: 3 4 5 > as.numeric(ff) # incorrect! [1] 1 2 3 > as.numeric(as.character(ff)) # correct! [1] 3 4 5 > sex.f[5:6] [1] female male Levels: female male > sex.f[6:7] [1] male male Levels: female male > droplevels(sex.f[6:7]) [1] male male Levels: male > sex.f[6:7, drop=T] [1] male male Levels: male > factor(as.character(sex.f[6:7])) [1] male male Levels: male > m <- c("L", "S", "XL", "XXL", "S", "M", "L") > m.f <- factor(m) > m.f [1] L S XL XXL S M L Levels: L M S XL XXL > m.o <- ordered(m.f, levels=c("S", "M", "L", "XL", "XXL")) > m.o [1] L S XL XXL S M L Levels: S < M < L < XL < XXL > ls() [1] "aa" "bb" "cards" "coordinates" "dice" [6] "eggs" "ff" "hh" "m" "m.f" [11] "m.o" "mydata" "oldpar" "old.par" "phanerozoic" [16] "rr" "sampled.rows" "sex" "sex.f" "w" [21] "x" "xx" > Ls() # shipunov Name Mode Type Obs Vars Size 1 aa numeric vector 9 1 176 bytes 2 bb numeric matrix 3 3 264 bytes 3 cards character vector 36 1 2.3 Kb 4 coordinates list data.frame 10000 2 94 Kb 5 dice character vector 36 1 2.3 Kb 6 eggs list data.frame 555 3 10.6 Kb 7 ff numeric factor 3 1 648 bytes 8 hh numeric factor 7 1 728 bytes 9 m character vector 7 1 392 bytes 10 m.f numeric factor 7 1 792 bytes 11 m.o numeric factor 7 1 856 bytes 12 mydata list data.frame 3 3 1 Kb 13 oldpar list unknown - - 360 bytes 14 old.par list unknown - - 336 bytes 15 phanerozoic list data.frame 12 2 1.8 Kb 16 rr numeric vector 7 1 112 bytes 17 sampled.rows numeric vector 100 1 448 bytes 18 sex character vector 7 1 224 bytes 19 sex.f numeric factor 7 1 592 bytes 20 w numeric vector 7 1 112 bytes 21 x numeric vector 7 1 112 bytes 22 xx character vector 1 1 112 bytes > # \subsection{Logical vectors and binary data} > > (likes.pizza <- c(T, T, F, F, T, T, F)) [1] TRUE TRUE FALSE FALSE TRUE TRUE FALSE > is.vector(likes.pizza) [1] TRUE > is.factor(likes.pizza) [1] FALSE > is.character(likes.pizza) [1] FALSE > is.logical(likes.pizza) [1] TRUE > 
likes.pizza * 1 [1] 1 1 0 0 1 1 0 > as.logical(c(1, 1, 0)) [1] TRUE TRUE FALSE > as.numeric(likes.pizza) [1] 1 1 0 0 1 1 0 > Tobin(sex, convert.names=FALSE) female male [1,] 0 1 [2,] 1 0 [3,] 0 1 [4,] 0 1 [5,] 1 0 [6,] 0 1 [7,] 0 1 > (is.male <- sex == "male") [1] TRUE FALSE TRUE TRUE FALSE TRUE TRUE > (is.female <- sex == "female") [1] FALSE TRUE FALSE FALSE TRUE FALSE FALSE > x > 170 [1] TRUE FALSE TRUE TRUE FALSE FALSE TRUE > x[x > 170] [1] 174.0 188.0 192.0 172.5 > ((x < 180) | (w <= 70)) & (sex=="female" | m=="S") [1] FALSE TRUE FALSE FALSE TRUE FALSE FALSE > ifelse(sex=="female", "pink", "blue") [1] "blue" "pink" "blue" "blue" "pink" "blue" "blue" > if(sex[1]=="female") { + "pink" + } else { + "blue" + } [1] "blue" > # \section{Fractions, counts and ranks: secondary data} > > sex.t <- table(sex) > round(100*sex.t/sum(sex.t)) sex female male 29 71 > pdf(file="pics/09860.pdf"); oldpar <- par(mar=c(4, 2, 1, 1)) > load("data/com12.rd") > exists("com12") # check if our object is here [1] TRUE > com12.plot <- barplot(com12, names.arg="") > text(com12.plot, par("usr")[3]*2, srt=45, pos=2, + xpd=TRUE, labels=names(com12)) > par(oldpar); dev.off() pdf 2 > pdf(file="pics/10070.pdf"); oldpar <- par(mar=c(2, 2, 3, 1)) > dotchart(com12) > par(oldpar); dev.off() pdf 2 > pdf(file="pics/06105_to_crop.pdf") > com80 <- read.table("data/com80.txt") > library(wordcloud) Loading required package: RColorBrewer > set.seed(5) # freeze random number generator > wordcloud(words=com80[, 1], freq=com80[, 2], + colors=brewer.pal(8, "Dark2")) > dev.off() pdf 2 > set.seed(1); rnorm(1) [1] -0.6264538 > dotchart(log(com80[, 2]), labels=com80[, 1], cex=.6) > x.ranks <- x > names(x.ranks) <- rank(x) > x.ranks 5 1 6 7 2 3 4 174.0 162.0 188.0 192.0 165.0 168.0 172.5 > sort(x.ranks) # easier to spot 1 2 3 4 5 6 7 162.0 165.0 168.0 172.5 174.0 188.0 192.0 > x.ranks2 <- c(x, x[3]) # duplicate the 3rd item > names(x.ranks2) <- rank(x.ranks2) > sort(x.ranks2) 1 2 3 4 5 6.5 6.5 8 162.0 165.0 168.0 
172.5 174.0 188.0 188.0 192.0 > wilcox.test(x.ranks2) Warning in wilcox.test.default(x.ranks2) : cannot compute exact p-value with ties Wilcoxon signed rank test with continuity correction data: x.ranks2 V = 36, p-value = 0.01415 alternative hypothesis: true location is not equal to 0 > # \section{Missing data} > > (hh <- c(8, 10, NA, NA, 8, NA, 8)) [1] 8 10 NA NA 8 NA 8 > mean(hh) [1] NA > mean(hh, na.rm=TRUE) [1] 8.5 > mean(na.omit(hh)) [1] 8.5 > hh.old <- hh > hh.old [1] 8 10 NA NA 8 NA 8 > hh[is.na(hh)] <- mean(hh, na.rm=TRUE) > hh [1] 8.0 10.0 8.5 8.5 8.0 8.5 8.0 > # \section{Outliers, and how to find them} > > m1 <- read.table("data/mydata.txt", sep=";") # wrong! > str(m1) 'data.frame': 4 obs. of 3 variables: $ V1: chr "a" "1" "4" "7" $ V2: chr "b" "2" "5" "8" $ V3: chr "c" "3" "6" "9" > m2 <- read.table("data/mydata.txt", sep=";", h=TRUE) # correct! > str(m2) 'data.frame': 3 obs. of 3 variables: $ a: int 1 4 7 $ b: int 2 5 8 $ c: int 3 6 9 > # \section{Changing data: basics of transformations} > > scale(trees) Girth Height Volume [1,] -1.57685421 -0.9416472 -1.20885469 [2,] -1.48125614 -1.7263533 -1.20885469 [3,] -1.41752409 -2.0402357 -1.21493821 [4,] -0.87580169 -0.6277648 -0.83775985 [5,] -0.81206964 0.7847060 -0.69175532 [6,] -0.78020362 1.0985884 -0.63700362 [7,] -0.71647157 -1.5694121 -0.88642802 [8,] -0.71647157 -0.1569412 -0.72825645 [9,] -0.68460554 0.6277648 -0.46058149 [10,] -0.65273952 -0.1569412 -0.62483658 [11,] -0.62087350 0.4708236 -0.36324513 [12,] -0.58900747 0.0000000 -0.55791784 [13,] -0.58900747 0.0000000 -0.53358375 [14,] -0.49340940 -1.0985884 -0.53966727 [15,] -0.39781133 -0.1569412 -0.67350476 [16,] -0.11101712 -0.3138824 -0.48491557 [17,] -0.11101712 1.4124708 0.22077297 [18,] 0.01644698 1.5694121 -0.16857243 [19,] 0.14391108 -0.7847060 -0.27199230 [20,] 0.17577710 -1.8832945 -0.32066048 [21,] 0.23950915 0.3138824 0.26335763 [22,] 0.30324119 0.6277648 0.09301901 [23,] 0.39883927 -0.3138824 0.37286102 [24,] 0.87682962 -0.6277648 
0.49453146 [25,] 0.97242770 0.1569412 0.75612291 [26,] 1.29108793 0.7847060 1.53481372 [27,] 1.35481998 0.9416472 1.55306429 [28,] 1.48228408 0.6277648 1.71123586 [29,] 1.51415010 0.6277648 1.29755636 [30,] 1.51415010 0.6277648 1.26713875 [31,] 2.34266672 1.7263533 2.84885447 attr(,"scaled:center") Girth Height Volume 13.24839 76.00000 30.17097 attr(,"scaled:scale") Girth Height Volume 3.138139 6.371813 16.437846 > # \subsection{How to tell the kind of data} > > # \section{Inside \R} > > # \subsection{Matrices} > > m <- 1:4 > m [1] 1 2 3 4 > ma <- matrix(m, ncol=2, byrow=TRUE) > ma [,1] [,2] [1,] 1 2 [2,] 3 4 > str(ma) int [1:2, 1:2] 1 3 2 4 > str(m) int [1:4] 1 2 3 4 > mb <- m > mb [1] 1 2 3 4 > attr(mb, "dim") <- c(2, 2) > mb [,1] [,2] [1,] 1 3 [2,] 2 4 > ma[1, 2] [1] 2 > ma[, ] [,1] [,2] [1,] 1 2 [2,] 3 4 > (mi <- matrix(c(1, 1, 2, 2), ncol=2, byrow=TRUE)) [,1] [,2] [1,] 1 1 [2,] 2 2 > ma[mi] [1] 1 4 > (mn <- matrix(c(NA, 1, 2, 2), ncol=2, byrow=TRUE)) [,1] [,2] [1,] NA 1 [2,] 2 2 > is.na(mn) [,1] [,2] [1,] TRUE FALSE [2,] FALSE FALSE > mn[is.na(mn)] <- 0 > mn [,1] [,2] [1,] 0 1 [2,] 2 2 > mean(ma) [1] 2.5 > ma[1, 1] <- "a" > mean(ma) Warning in mean.default(ma) : argument is not numeric or logical: returning NA [1] NA > ma [,1] [,2] [1,] "a" "2" [2,] "3" "4" > m3 <- 1:8 > dim(m3) <- c(2, 2, 2) > m3 , , 1 [,1] [,2] [1,] 1 3 [2,] 2 4 , , 2 [,1] [,2] [1,] 5 7 [2,] 6 8 > # \subsection{Lists} > > l <- list("R", 1:3, TRUE, NA, list("r", 4)) > l [[1]] [1] "R" [[2]] [1] 1 2 3 [[3]] [1] TRUE [[4]] [1] NA [[5]] [[5]][[1]] [1] "r" [[5]][[2]] [1] 4 > fred <- list(name="Fred", wife.name="Mary", no.children=3, + child.ages=c(1, 5, 9)) > fred $name [1] "Fred" $wife.name [1] "Mary" $no.children [1] 3 $child.ages [1] 1 5 9 > names(w) <- c("Rick", "Amanda", "Peter", "Alex", "Kathryn", + "Ben", "George") > w Rick Amanda Peter Alex Kathryn Ben George 69 68 93 87 59 82 72 > row.names(ma) <- c("row1", "row2") > colnames(ma) <- c("col1", "col2") > ma col1 col2 row1 "a" "2" row2 "3" 
"4" > names(w) <- NULL > w [1] 69 68 93 87 59 82 72 > hh[3] [1] 8.5 > ma[2, 1] [1] "3" > l[1] [[1]] [1] "R" > str(l[1]) List of 1 $ : chr "R" > l[[1]] [1] "R" > str(l[[1]]) chr "R" > str(l[[5]]) List of 2 $ : chr "r" $ : num 4 > names(l) <- c("first", "second", "third", "fourth", "fifth") > l$first [1] "R" > str(l$first) chr "R" > l$fir [1] "R" > l$fi NULL > names(w) <- c("Rick", "Amanda", "Peter", "Alex", "Kathryn", + "Ben", "George") > w["Jenny"] NA > x2.wilcox <- wilcox.test(x.ranks2) Warning in wilcox.test.default(x.ranks2) : cannot compute exact p-value with ties > str(x2.wilcox) List of 7 $ statistic : Named num 36 ..- attr(*, "names")= chr "V" $ parameter : NULL $ p.value : num 0.0141 $ null.value : Named num 0 ..- attr(*, "names")= chr "location" $ alternative: chr "two.sided" $ method : chr "Wilcoxon signed rank test with continuity correction" $ data.name : chr "x.ranks2" - attr(*, "class")= chr "htest" > x2.wilcox$p.value [1] 0.0141474 > # \subsection{Data frames} > > d <- data.frame(weight=w, height=x, size=m.o, sex=sex.f) > row.names(d) <- c("Rick", "Amanda", "Peter", "Alex", "Kathryn", + "Ben", "George") > d weight height size sex Rick 69 174.0 L male Amanda 68 162.0 S female Peter 93 188.0 XL male Alex 87 192.0 XXL male Kathryn 59 165.0 S female Ben 82 168.0 M male George 72 172.5 L male > d <- read.table("data/d.txt", h=TRUE, stringsAsFactors=TRUE) > d$size <- ordered(d$size, levels=c("S", "M", "L", "XL", "XXL")) > str(d) 'data.frame': 7 obs. of 4 variables: $ weight: int 69 68 93 87 59 82 72 $ height: num 174 162 188 192 165 ... 
$ size : Ord.factor w/ 5 levels "S"<"M"<"L"<"XL"<..: 3 1 4 5 1 2 3 $ sex : Factor w/ 2 levels "female","male": 2 1 2 2 1 2 2 > d[, 1] [1] 69 68 93 87 59 82 72 > d[[1]] [1] 69 68 93 87 59 82 72 > d$weight [1] 69 68 93 87 59 82 72 > d[, "weight"] [1] 69 68 93 87 59 82 72 > d[["weight"]] [1] 69 68 93 87 59 82 72 > identical(d$weight, d[, 1]) [1] TRUE > d[, 2:4] # matrix method height size sex Rick 174.0 L male Amanda 162.0 S female Peter 188.0 XL male Alex 192.0 XXL male Kathryn 165.0 S female Ben 168.0 M male George 172.5 L male > d[, c("height", "size", "sex")] height size sex Rick 174.0 L male Amanda 162.0 S female Peter 188.0 XL male Alex 192.0 XXL male Kathryn 165.0 S female Ben 168.0 M male George 172.5 L male > d[2:4] # list method height size sex Rick 174.0 L male Amanda 162.0 S female Peter 188.0 XL male Alex 192.0 XXL male Kathryn 165.0 S female Ben 168.0 M male George 172.5 L male > subset(d, select=2:4) height size sex Rick 174.0 L male Amanda 162.0 S female Peter 188.0 XL male Alex 192.0 XXL male Kathryn 165.0 S female Ben 168.0 M male George 172.5 L male > d[, -1] # negative selection height size sex Rick 174.0 L male Amanda 162.0 S female Peter 188.0 XL male Alex 192.0 XXL male Kathryn 165.0 S female Ben 168.0 M male George 172.5 L male > Str(d) # shipunov 'data.frame': 7 obs. of 4 variables: 1 weight: int 69 68 93 87 59 82 72 2 height: num 174 162 188 192 165 ... 3 size : Ord.factor w/ 5 levels "S"<"M"<"L"<"XL"<..: 3 1 4 5 1 2 3 4 sex : Factor w/ 2 levels "female","male": 2 1 2 2 1 2 2 row.names [1:7] "Rick" "Amanda" "Peter" "Alex" "Kathryn" ... 
> d[d$sex=="female", ] weight height size sex Amanda 68 162 S female Kathryn 59 165 S female > d$sex=="female" [1] FALSE TRUE FALSE FALSE TRUE FALSE FALSE > d[c(2, 5), ] weight height size sex Amanda 68 162 S female Kathryn 59 165 S female > d[, names(d) != "weight"] height size sex Rick 174.0 L male Amanda 162.0 S female Peter 188.0 XL male Alex 192.0 XXL male Kathryn 165.0 S female Ben 168.0 M male George 172.5 L male > d[d$size== "M" | d$size== "S", ] weight height size sex Amanda 68 162 S female Kathryn 59 165 S female Ben 82 168 M male > d[d$size %in% c("M", "L") & d$sex=="male", ] weight height size sex Rick 69 174.0 L male Ben 82 168.0 M male George 72 172.5 L male > subset(d, sex=="female") weight height size sex Amanda 68 162 S female Kathryn 59 165 S female > d.new <- d > d.new[, 1] <- round(d.new[, 1] * 2.20462) > d.new weight height size sex Rick 152 174.0 L male Amanda 150 162.0 S female Peter 205 188.0 XL male Alex 192 192.0 XXL male Kathryn 130 165.0 S female Ben 181 168.0 M male George 159 172.5 L male > d.new$he <- round(d.new$he * 0.0328084) > d.new weight height size sex he Rick 152 174.0 L male 6 Amanda 150 162.0 S female 5 Peter 205 188.0 XL male 6 Alex 192 192.0 XXL male 6 Kathryn 130 165.0 S female 5 Ben 181 168.0 M male 6 George 159 172.5 L male 6 > data.frame(a=1:4, b=1:2) a b 1 1 1 2 2 2 3 3 1 4 4 2 > d[order(d$sex, d$height), ] weight height size sex Amanda 68 162.0 S female Kathryn 59 165.0 S female Ben 82 168.0 M male George 72 172.5 L male Rick 69 174.0 L male Peter 93 188.0 XL male Alex 87 192.0 XXL male > order(d$sex, d$height) [1] 2 5 6 7 1 3 4 > # \subsection{Overview of data types and modes} > > # \section{Answers to exercises} > > plot(sex.f, col=1:2) > plot(sex.f, col=1:nlevels(sex.f)) > rev(sort(com12)) c plot names read.table length dev.off paste 4128 2896 2031 918 772 722 699 par summary function text as.numeric 677 672 653 628 573 > sort(com12, decreasing=TRUE) c plot names read.table length dev.off paste 4128 2896 2031 918 
772 722 699 par summary function text as.numeric 677 672 653 628 573 > comp <- read.table( + "http://ashipunov.me/shipunov/open/compositae.txt", + h=TRUE, sep="\t") > str(comp) 'data.frame': 1396 obs. of 8 variables: $ SPECIES : chr "Achillea cartilaginea" "Achillea cartilaginea" "Achillea cartilaginea" "Achillea cartilaginea" ... $ PLANT : int 118 118 118 118 118 118 118 118 118 118 ... $ HEIGHT : int 460 460 460 460 460 460 460 460 460 460 ... $ N.CIRCLES: chr "1" "1" "1" "1" ... $ N.LEAVES : int 2 2 2 2 2 2 2 2 2 2 ... $ HEAD.D : num 7 8 8 8 10 7 8 9 9 10 ... $ DISC.D : int 2 2 3 3 4 3 3 3 3 3 ... $ RAYS : int 7 7 7 7 7 8 8 8 8 8 ... > c3 <- comp[comp$SPECIES %in% c("Anthemis tinctoria", + "Cosmos bipinnatus", "Tripleurospermum inodorum"), ] > c3$SPECIES <- factor(c3$SPECIES) > with(c3, plot(HEAD.D, RAYS, col=as.numeric(SPECIES), + pch=as.numeric(SPECIES), + xlab="Diameter of head, mm", ylab="Number of rays")) > legend("topright", pch=1:3, col=1:3, legend=levels(c3$SPECIES)) > with(c3, plot(jitter(HEAD.D), jitter(RAYS), + col=as.numeric(SPECIES), pch=as.numeric(SPECIES), + xlab="Diameter of head, mm", ylab="Number of rays")) > with(c3, sunflowerplot(HEAD.D, RAYS)) > with(c3, smoothScatter(HEAD.D, RAYS)) > library(hexbin) > with(c3, plot(hexbin(HEAD.D, RAYS))) > pdf(file="pics/07055.pdf"); oldpar <- par(mar=c(4, 4, 0, 1)) > with(c3, plot(HEAD.D, RAYS, type="n", + xlab="Diameter of head, mm", ylab="Number of rays")) > with(c3, PPoints(SPECIES, HEAD.D, RAYS, scale=.9, + pchs=1, centers=TRUE)) > legend("topright", pch=1, col=1:3, + legend=levels(c3$SPECIES), text.font=3) > par(oldpar); dev.off() pdf 2 > ma <- matrix(m, ncol=2, byrow=FALSE) > ma [,1] [,2] [1,] 1 3 [2,] 2 4 > jj <- paste0(rep("", 100), c("", "", "Fizz"), c(rep("", 4), "Buzz")) > ifelse(jj == "", 1:100, jj) [1] "1" "2" "Fizz" "4" "Buzz" "Fizz" [7] "7" "8" "Fizz" "Buzz" "11" "Fizz" [13] "13" "14" "FizzBuzz" "16" "17" "Fizz" [19] "19" "Buzz" "Fizz" "22" "23" "Fizz" [25] "Buzz" "26" "Fizz" "28" "29" 
"FizzBuzz" [31] "31" "32" "Fizz" "34" "Buzz" "Fizz" [37] "37" "38" "Fizz" "Buzz" "41" "Fizz" [43] "43" "44" "FizzBuzz" "46" "47" "Fizz" [49] "49" "Buzz" "Fizz" "52" "53" "Fizz" [55] "Buzz" "56" "Fizz" "58" "59" "FizzBuzz" [61] "61" "62" "Fizz" "64" "Buzz" "Fizz" [67] "67" "68" "Fizz" "Buzz" "71" "Fizz" [73] "73" "74" "FizzBuzz" "76" "77" "Fizz" [79] "79" "Buzz" "Fizz" "82" "83" "Fizz" [85] "Buzz" "86" "Fizz" "88" "89" "FizzBuzz" [91] "91" "92" "Fizz" "94" "Buzz" "Fizz" [97] "97" "98" "Fizz" "Buzz" > d.sorted <- d[order(d$sex, d$height), ] > d.sorted[, order(names(d.sorted))] height sex size weight Amanda 162.0 female S 68 Kathryn 165.0 female S 59 Ben 168.0 male M 82 George 172.5 male L 72 Rick 174.0 male L 69 Peter 188.0 male XL 93 Alex 192.0 male XXL 87 > # \chapter{One-dimensional data} > > # \section{How to estimate general tendencies} > > # \subsection{Median is the best} > > salary <- c(21, 19, 27, 11, 102, 25, 21) > mean(salary) [1] 32.28571 > median(salary) [1] 21 > sort(salary1 <- c(salary, 22)) [1] 11 19 21 21 22 25 27 102 > median(salary1) [1] 21.5 > mean(salary, trim=0.2) [1] 22.6 > sex <- c("male", "female", "male", "male", "female", "male", + "male") > t.sex <- table(sex) > mode <- names(t.sex[which.max(t.sex)]) > mode [1] "male" > attach(trees) > mean(Girth) [1] 13.24839 > mean(Height) [1] 76 > detach(trees) > with(trees, mean(Volume)) # Second way [1] 30.17097 > sapply(trees, mean) Girth Height Volume 13.24839 76.00000 30.17097 > trees.n <- trees > trees.n[2, 1] <- NA > sapply(trees.n, mean) Girth Height Volume NA 76.00000 30.17097 > sapply(trees.n, mean, na.rm=TRUE) Girth Height Volume 13.40333 76.00000 30.17097 > # \subsection{Quartiles and quantiles} > > quantile(salary, c(0, 0.25, 0.5, 0.75, 1)) 0% 25% 50% 75% 100% 11 20 21 26 102 > fivenum(salary) [1] 11 20 21 26 102 > summary(salary) Min. 1st Qu. Median Mean 3rd Qu. Max. 11.00 20.00 21.00 32.29 26.00 102.00 > summary(PlantGrowth) weight group Min. 
:3.590 ctrl:10 1st Qu.:4.550 trt1:10 Median :5.155 trt2:10 Mean :5.073 3rd Qu.:5.530 Max. :6.310 > summary(hh) Min. 1st Qu. Median Mean 3rd Qu. Max. 8.0 8.0 8.5 8.5 8.5 10.0 > err <- read.table("data/errors.txt", sep="\t", + h=TRUE, stringsAsFactors=TRUE) > str(err) 'data.frame': 7 obs. of 3 variables: $ AGE : Factor w/ 6 levels "12","22","23",..: 3 4 3 5 1 6 2 $ NAME : Factor w/ 6 levels "","John","Kate",..: 2 3 1 4 5 6 2 $ HEIGHT: num 172 163 161 16.1 132 155 183 > summary(err) AGE NAME HEIGHT 12:1 :1 Min. : 16.1 22:1 John :2 1st Qu.:143.5 23:2 Kate :1 Median :161.0 24:1 Lucy :1 Mean :140.3 56:1 Penny:1 3rd Qu.:167.5 a :1 Sasha:1 Max. :183.0 > # \subsection{Variation} > > var(salary); sqrt(var(salary)); sd(salary) [1] 970.9048 [1] 31.15934 [1] 31.15934 > IQR(salary); mad(salary) [1] 6 [1] 5.9304 > with(trees, paste(median(Height), IQR(Height)/2, sep="±")) [1] "76±4" > paste(median(trees$Height), mad(trees$Height), sep="±") [1] "76±5.9304" > paste(quantile(trees$Height, c(0.025, 0.975)), collapse="-") [1] "63.75-86.25" > paste(quantile(trees$Girth, c(0.025, 0.5, 0.975)), collapse="-") [1] "8.525-12.9-18.65" > paste(boxplot.stats(trees$Height)$stats[c(1, 5)], collapse="-") [1] "63-87" > HS <- fivenum(trees$Height) > paste("(", HS[1], ")", HS[2], "-", HS[4], "(", HS[5], ")", sep="") [1] "(63)72-80(87)" > pdf(file="pics/reports_to_crop.pdf") > library(plotrix) > plot(c(-1, 1), c(-1, 1), type="n", xlab="", ylab="", axes=FALSE) > draw.circle(-.2, 0, .4) > draw.circle(.1, 0, .9) > text(-.2, 0, "mean()±sd()", cex=1.3) > text(.1, 0.65, "median()±IQR()\nmedian()±mad()", cex=1.3) > dev.off() pdf 2 > 100*sapply(trees, sd)/colMeans(trees) Girth Height Volume 23.686948 8.383964 54.482331 > # \section{1-dimensional plots} > > pdf(file="pics/14390.pdf"); oldpar <- par(mar=c(2, 2, 0, 1)) > new.1000 <- sample((median(salary) - IQR(salary)) : + (median(salary) + IQR(salary)), 1000, replace=TRUE) > salary2 <- c(salary, new.1000) > boxplot(salary2, log="y") > par(oldpar); dev.off() 
pdf 2 > pdf(file="pics/boxplot.pdf"); oldpar <- par(mar=c(1, 1, 0, 1)) > Ex.boxplot() > par(oldpar); dev.off() pdf 2 > pdf(file="pics/14560.pdf"); oldpar <- par(mar=c(2, 2, 0, 1)) > boxplot(scale(trees)) > par(oldpar); dev.off() pdf 2 > pdf(file="pics/14730.pdf"); oldpar <- par(mar=c(2, 4, 1, 1)) > hist(salary2, breaks=20, main="", xlab=0) > par(oldpar); dev.off() pdf 2 > table(cut(salary2, 20)) (10.9,15.6] (15.6,20.1] (20.1,24.6] (24.6,29.2] (29.2,33.8] (33.8,38.3] 71 380 335 220 0 0 (38.3,42.8] (42.8,47.4] (47.4,51.9] (51.9,56.5] (56.5,61] (61,65.6] 0 0 0 0 0 0 (65.6,70.2] (70.2,74.7] (74.7,79.2] (79.2,83.8] (83.8,88.3] (88.3,92.9] 0 0 0 0 0 0 (92.9,97.5] (97.5,102] 0 1 > stem(salary, scale=2) The decimal point is 1 digit(s) to the right of the | 1 | 19 2 | 1157 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 2 > pdf(file="pics/15260.pdf"); oldpar <- par(mar=c(2, 2, 0, 1)) > plot(density(salary2, adjust=2), main="", xlab="", ylab="") > rug(salary2) > par(oldpar); dev.off() pdf 2 > pdf(file="pics/15580.pdf"); oldpar <- par(mar=c(2, 2, 0, 1)) > library(beeswarm) > trees.s <- data.frame(scale(trees), class=cut(trees$Girth, 3, + labels=c("thin", "medium", "thick"))) > beeswarm(trees.s[, 1:3], cex=2, col=1:3, + pwpch=rep(as.numeric(trees.s$class), 3)) > bxplot(trees.s[, 1:3], add=TRUE) > legend("top", pch=1:3, legend=levels(trees.s$class)) > par(oldpar); dev.off() pdf 2 > pdf(file="pics/32570.pdf"); oldpar <- par(mar=c(2, 2, 0, 1)) > library(beanplot) > beanplot(trees.s[, 1:3], col=list(1, 2, 3), + border=1:3, beanlines="median") > par(oldpar); dev.off() pdf 2 > # \section{Confidence intervals} > > t.test(trees$Height) One Sample t-test data: trees$Height t = 66.41, df = 30, p-value < 2.2e-16 alternative hypothesis: true mean is not equal to 0 95 percent confidence interval: 73.6628 78.3372 sample estimates: mean of x 76 > S.value(t.test(trees$Height)) # shipunov [1] 110.9296 > wilcox.test(salary, conf.int=TRUE) Warning in wilcox.test.default(salary, conf.int = TRUE) : cannot 
compute exact p-value with ties Warning in wilcox.test.default(salary, conf.int = TRUE) : cannot compute exact confidence interval with ties Wilcoxon signed rank test with continuity correction data: salary V = 28, p-value = 0.02225 alternative hypothesis: true location is not equal to 0 95 percent confidence interval: 15.00002 64.49995 sample estimates: (pseudo)median 23 > # \section{Normality} > > pdf(file="pics/17030.pdf"); oldpar <- par(mar=c(4, 4, 0, 1)) > qqnorm(salary2, main=""); qqline(salary2, col=2) > par(oldpar); dev.off() pdf 2 > shapiro.test(salary) Shapiro-Wilk normality test data: salary W = 0.61164, p-value = 0.0003726 > shapiro.test(salary2) Shapiro-Wilk normality test data: salary2 W = 0.73849, p-value < 2.2e-16 > set.seed(1638) # freeze random number generator > shapiro.test(rnorm(100)) Shapiro-Wilk normality test data: rnorm(100) W = 0.99337, p-value = 0.9094 > ks.test(scale(salary2), "pnorm") Warning in ks.test(scale(salary2), "pnorm") : ties should not be present for the Kolmogorov-Smirnov test One-sample Kolmogorov-Smirnov test data: scale(salary2) D = 0.094068, p-value = 3.641e-08 alternative hypothesis: two-sided > old.options <- options(scipen=100) > ks.test(scale(salary2), "pnorm") Warning in ks.test(scale(salary2), "pnorm") : ties should not be present for the Kolmogorov-Smirnov test One-sample Kolmogorov-Smirnov test data: scale(salary2) D = 0.094068, p-value = 0.00000003641 alternative hypothesis: two-sided > options(old.options) > # \section{How to create your own functions} > > Normality <- function(a) + { + ifelse(shapiro.test(a)$p.value < 0.05, "NOT NORMAL", "NORMAL") + } > Normality(salary) # shipunov [1] "NOT NORMAL" > sapply(trees, Normality) Girth Height Volume "NORMAL" "NORMAL" "NOT NORMAL" > sapply(log(trees+0.01), Normality) Girth Height Volume "NORMAL" "NORMAL" "NORMAL" > sapply(chickwts, + function(.x) names(table(.x))[which.max(table(.x))]) weight feed "248" "soybean" > old.options <- options(warn=-1) > sapply(trees, + 
function(.x) wilcox.test(.x, conf.int=TRUE)$conf.int) Girth Height Volume [1,] 11.84995 73.50001 22.00000 [2,] 14.44990 78.50003 36.05001 > options(old.options) > # \section{How good is the proportion?} > > table(sex) sex female male 2 5 > table(sex)/length(sex) sex female male 0.2857143 0.7142857 > prop.test(table(sex)) Warning in prop.test(table(sex)) : Chi-squared approximation may be incorrect 1-sample proportions test with continuity correction data: table(sex), null probability 0.5 X-squared = 0.57143, df = 1, p-value = 0.4497 alternative hypothesis: true p is not equal to 0.5 95 percent confidence interval: 0.05112431 0.69743997 sample estimates: p 0.2857143 > prop.test(x=356, n=476, p=0.7, alternative="two.sided") 1-sample proportions test with continuity correction data: 356 out of 476, null probability 0.7 X-squared = 4.9749, df = 1, p-value = 0.02572 alternative hypothesis: true p is not equal to 0.7 95 percent confidence interval: 0.7059174 0.7858054 sample estimates: p 0.7478992 > prop.test(x=0.52*262, n=262, p=0.5, alternative="greater") 1-sample proportions test with continuity correction data: 0.52 * 262 out of 262, null probability 0.5 X-squared = 0.34302, df = 1, p-value = 0.279 alternative hypothesis: true p is greater than 0.5 95 percent confidence interval: 0.4673901 1.0000000 sample estimates: p 0.52 > sapply(1:10, Phyllotaxis) # shipunov [1] "1/2" "1/3" "2/5" "3/8" "5/13" "8/21" "13/34" "21/55" [9] "34/89" "55/144" > # \section{Answers to exercises} > > str(shapiro.test(rnorm(100))) List of 4 $ statistic: Named num 0.992 ..- attr(*, "names")= chr "W" $ p.value : num 0.854 $ method : chr "Shapiro-Wilk normality test" $ data.name: chr "rnorm(100)" - attr(*, "class")= chr "htest" > set.seed(1683) > shapiro.test(rnorm(100))$p.value [1] 0.8424037 > betula <- read.table( + "http://ashipunov.me/shipunov/open/betula.txt", h=TRUE) > Str(betula) # shipunov 'data.frame': 229 obs. of 10 variables: 1 LOC : int 1 1 1 1 1 1 1 1 1 1 ... 
2 LEAF.L : int 50 44 45 35 41 53 50 47 52 42 ... 3 LEAF.W : int 37 29 37 26 32 37 40 36 39 40 ... 4 LEAF.MAXW: int 23 20 19 15 18 25 21 21 23 19 ... 5 PUB : int 0 1 0 0 0 0 0 0 1 0 ... 6 PAPILLAE : int 1 1 1 1 1 1 1 1 1 1 ... 7 CATKIN : int 31 25 21 20 24 22 40 25 14 27 ... 8 SCALE.L : num 4 3 4 5.5 5 5 6 5 5 5 ... 9 LOBES * int 0 0 1 0 0 0 0 0 1 0 ... 10 WINGS * int 0 0 0 0 0 0 1 0 0 1 ... > sapply(betula[, c(2:4, 7:8)], Normality) # shipunov LEAF.L LEAF.W LEAF.MAXW CATKIN SCALE.L "NOT NORMAL" "NOT NORMAL" "NOT NORMAL" "NORMAL" "NOT NORMAL" > pdf(file="pics/26980_to_crop.pdf") > betula.s <- stack(betula[, c(2:4, 7:8)]) > qqmath(~ values | ind, data=betula.s, + panel=function(x) {panel.qqmathline(x); panel.qqmath(x)}) > dev.off() pdf 2 > bwtheme <- standard.theme("pdf", color=FALSE) > histogram(~ values | ind, data=betula.s, par.settings=bwtheme) > CV <- function(x) {} > CV <- function(x) + { + 100*sd(x, na.rm=TRUE)/mean(x, na.rm=TRUE) + } > sapply(betula[, c(2:4, 7:8)], CV) LEAF.L LEAF.W LEAF.MAXW CATKIN SCALE.L 17.93473 20.38630 26.08806 24.17354 24.72061 > dact <- scan("data/dact.txt") Read 48 items > str(dact) num [1:48] 88 22 52 31 51 63 32 57 68 27 ... > Normality(dact) # shipunov [1] "NOT NORMAL" > summary(dact)[-4] # no mean Min. 1st Qu. Median 3rd Qu. Max. 
0.00 22.00 33.50 65.25 108.00 > IQR(dact) [1] 43.25 > mad(dact) [1] 27.4281 > wilcox.test(dact, conf.int=TRUE)$conf.int Warning in wilcox.test.default(dact, conf.int = TRUE) : cannot compute exact p-value with ties Warning in wilcox.test.default(dact, conf.int = TRUE) : cannot compute exact confidence interval with ties Warning in wilcox.test.default(dact, conf.int = TRUE) : cannot compute exact p-value with zeroes Warning in wilcox.test.default(dact, conf.int = TRUE) : cannot compute exact confidence interval with zeroes [1] 34.49995 53.99997 attr(,"conf.level") [1] 0.95 > pdf(file="pics/37240.pdf"); oldpar <- par(mar=c(2, 4, 0, 0)) > Histr(dact, xlab="", main="") # shipunov Warning in hist.default(x, plot = FALSE, ...) : arguments ‘main’, ‘xlab’ are not made use of > par(oldpar); dev.off() pdf 2 > stem(dact) The decimal point is 1 digit(s) to the right of the | 0 | 0378925789 2 | 0224678901122345 4 | 471127 6 | 035568257 8 | 2785 10 | 458 > pdf(file="pics/moments_to_crop.pdf") > oldpar <- par(mfrow=c(2, 2), mar=c(0, 0, 0, 0)) > library(plotrix) > ## > > plot(c(-1, 1), c(-1, 1), type="n", xlab="", ylab="", axes=FALSE) > for(n in seq(0.1, 0.9, 0.1)) draw.circle(0, 0, n) > set.seed(11); x <- rnorm(100, sd=.28); y <- rnorm(100, sd=.28) > points(x, y, pch=19) > ## > > plot(c(-1, 1), c(-1, 1), type="n", xlab="", ylab="", axes=FALSE) > for(n in seq(0.1, 0.9, 0.1)) draw.circle(0, 0, n) > set.seed(11); x <- rnorm(100, sd=.38); y <- rnorm(100, sd=.38) > points(x, y, pch=19) > ## > > plot(c(-1, 1), c(-1, 1), type="n", xlab="", ylab="", axes=FALSE) > for(n in seq(0.1, 0.9, 0.1)) draw.circle(0, 0, n) > set.seed(11); x <- rnorm(100, sd=.28) - .2; y <- rnorm(100, sd=.28) > points(x, y, pch=19) > ## > > plot(c(-1, 1), c(-1, 1), type="n", xlab="", ylab="", axes=FALSE) > for(n in seq(0.1, 0.9, 0.1)) draw.circle(0, 0, n) > set.seed(11); x <- c(rnorm(50, sd=.28), rnorm(50, sd=.1)); y <- c(rnorm(50, sd=.28), rnorm(50, sd=.1)) > points(x, y, pch=19) > ## > > par(oldpar) > dev.off() 
pdf 2 > library(e1071) > skewness(dact) [1] 0.5242118 > kurtosis(dact) [1] -0.8197875 > ny <- read.table( + "http://ashipunov.me/shipunov/open/nymphaeaceae.txt", + h=TRUE, sep="\t") > Str(ny) # shipunov 'data.frame': 267 obs. of 5 variables: 1 SPECIES: chr "Nuphar lutea" "Nuphar lutea" "Nuphar lutea" "Nuphar lutea" ... 2 SEPALS : int 4 5 5 5 5 5 5 5 5 5 ... 3 PETALS : int 14 10 10 11 11 11 12 12 12 12 ... 4 STAMENS* int 131 104 113 106 111 119 73 91 102 109 ... 5 STIGMAS* int 13 12 12 11 13 15 10 12 12 11 ... > oldpar <- par(mfrow=c(2, 2)) > for (i in 2:5) boxplot(ny[, i] ~ ny[, 1], main=names(ny)[i]) > par(oldpar) > library(lattice) > ny.s <- stack(as.data.frame(scale(ny[ ,2:5]))) > ny$SPECIES <- as.factor(ny$SPECIES) > ny.s$SPECIES <- ny$SPECIES > bwplot(SPECIES ~ values | ind, ny.s, xlab="") > pdf(file="pics/44230.pdf"); oldpar <- par(mar=c(2, 2, 0, 1)) > Boxplots(ny[, 2:5], ny$SPECIES, srt=0, adj=c(.5, 1)) # shipunov > par(oldpar); dev.off() pdf 2 > pdf(file="pics/44400.pdf"); oldpar <- par(mar=c(2, 2, 0, 1)) > Linechart(ny[, 2:5], ny[, 1], se.lwd=2) # shipunov > par(oldpar); dev.off() pdf 2 > aggregate(ny[, 3:4], by=list(SPECIES=ny[, 1]), Normality) # shipunov SPECIES PETALS STAMENS 1 Nuphar lutea NOT NORMAL NOT NORMAL 2 Nymphaea candida NOT NORMAL NOT NORMAL > aggregate(ny[, 3:4], + by=list(SPECIES=ny[, 1]), + function(.x) wilcox.test(.x, conf.int=TRUE)$conf.int) SPECIES PETALS.1 PETALS.2 STAMENS.1 STAMENS.2 1 Nuphar lutea 14.49997 14.99996 119.00003 125.50005 2 Nymphaea candida 25.49997 27.00001 73.99995 78.49997 > aggregate(ny[, 3:4], by=list(SPECIES=ny[, 1]), function(.x) + paste(median(.x, na.rm=TRUE), mad(.x, na.rm=TRUE), sep="±")) SPECIES PETALS STAMENS 1 Nuphar lutea 14±1.4826 119±19.2738 2 Nymphaea candida 26±2.9652 77±10.3782 > phx <- read.table( + "http://ashipunov.me/shipunov/open/phyllotaxis.txt", + h=TRUE, sep="\t") > str(phx) 'data.frame': 6032 obs. 
of 4 variables: $ FAMILY : chr "Anacardiaceae" "Anacardiaceae" "Anacardiaceae" "Anacardiaceae" ... $ SPECIES : chr "Cotinus coggygria" "Cotinus coggygria" "Cotinus coggygria" "Cotinus coggygria" ... $ N.CIRCLES: int 2 2 2 2 2 2 2 2 2 2 ... $ N.LEAVES : int 4 4 5 5 5 5 5 5 5 5 ... > nlevels(factor((phx$FAMILY))) [1] 11 > pdf(file="pics/26710.pdf"); oldpar <- par(mar=c(2, 0, 1, 1)) > phx10 <- sapply(1:10, Phyllotaxis) > phx.all <- paste(phx$N.CIRCLES, phx$N.LEAVES, sep="/") > phx.tbl <- table(phx$FAMILY, phx.all %in% phx10) > Dotchart(sort(phx.tbl[,"FALSE"]/(rowSums(phx.tbl)))) # shipunov > par(oldpar); dev.off() pdf 2 > mean.phx.prop <- sum(phx.tbl[, 1])/sum(phx.tbl) > prop.test(phx.tbl["Onagraceae", 1], sum(phx.tbl["Onagraceae", ]), + mean.phx.prop) 1-sample proportions test with continuity correction data: phx.tbl["Onagraceae", 1] out of sum(phx.tbl["Onagraceae", ]), null probability mean.phx.prop X-squared = 227.9, df = 1, p-value < 2.2e-16 alternative hypothesis: true p is not equal to 0.2712202 95 percent confidence interval: 0.6961111 0.8221820 sample estimates: p 0.7647059 > power.prop.test(p1=0.48, p2=0.52, power=0.8) Two-sample comparison of proportions power calculation n = 2451.596 p1 = 0.48 p2 = 0.52 sig.level = 0.05 power = 0.8 alternative = two.sided NOTE: n is number in *each* group > # \chapter{Two-dimensional data: differences} > > # \section{What is a statistical test?} > > bpress <- read.table("data/bpress.txt", h=TRUE) > head(bpress) PLACEBO.1 PLACEBO.10 DRUG.1 DRUG.10 1 180 170 140 120 2 140 150 120 100 3 160 155 180 140 4 170 180 160 140 5 150 150 160 160 > drug.d <- bpress$DRUG.10 - bpress$DRUG.1 > placebo.d <- bpress$PLACEBO.10 - bpress$PLACEBO.1 > mean(drug.d - placebo.d) [1] -21 > boxplot(bpress) > # \subsection{Statistical hypotheses} > > # \subsection{Statistical errors} > > # \section{Is there a difference? 
Comparing two samples} > > # \subsection{Two sample tests} > > sapply(data.frame(placebo.d, drug.d), Normality) # shipunov placebo.d drug.d "NORMAL" "NORMAL" > t.test(placebo.d, drug.d) Welch Two Sample t-test data: placebo.d and drug.d t = 2.8062, df = 6.7586, p-value = 0.02726 alternative hypothesis: true difference in means is not equal to 0 95 percent confidence interval: 3.175851 38.824149 sample estimates: mean of x mean of y 1 -20 > S.value(t.test(placebo.d, drug.d)) # shipunov [1] 5.197132 > df <- 6.7586 > v1 <- var(placebo.d) > v2 <- var(drug.d) > (t.stat <- (mean(placebo.d) - mean(drug.d))/sqrt(v1/5 + v2/5)) [1] 2.806243 > (p.value <- 2*pt(-abs(t.stat), df)) [1] 0.02725892 > S.value(p.value) # shipunov [1] 5.197128 > long <- stack(data.frame(placebo.d, drug.d)) > head(long) values ind 1 -10 placebo.d 2 10 placebo.d 3 -5 placebo.d 4 10 placebo.d 5 0 placebo.d 6 -20 drug.d > t.test(values ~ ind, data=long) Welch Two Sample t-test data: values by ind t = 2.8062, df = 6.7586, p-value = 0.02726 alternative hypothesis: true difference in means is not equal to 0 95 percent confidence interval: 3.175851 38.824149 sample estimates: mean in group placebo.d mean in group drug.d 1 -20 > boxplot(values ~ ind, data=long) > tapply(long$values, long$ind, sd) placebo.d drug.d 8.944272 14.142136 > aggregate(values ~ ind, range, data=long) ind values.1 values.2 1 placebo.d -10 10 2 drug.d -40 0 > tapply(beaver2$temp, beaver2$activ, Normality) # shipunov 0 1 "NORMAL" "NORMAL" > boxplot(temp ~ activ, data=beaver2) > t.test(temp ~ activ, data=beaver2) Welch Two Sample t-test data: temp by activ t = -18.548, df = 80.852, p-value < 2.2e-16 alternative hypothesis: true difference in means is not equal to 0 95 percent confidence interval: -0.8927106 -0.7197342 sample estimates: mean in group 0 mean in group 1 37.09684 37.90306 > uu <- unstack(beaver2, temp ~ activ) > str(uu) List of 2 $ 0: num [1:38] 36.6 36.7 36.9 37.1 37.2 ... $ 1: num [1:62] 38 38 38 38.2 38.1 ... 
> sapply(bpress, Normality) # shipunov PLACEBO.1 PLACEBO.10 DRUG.1 DRUG.10 "NORMAL" "NORMAL" "NORMAL" "NORMAL" > attach(bpress) > t.test(DRUG.1, DRUG.10, paired=TRUE) Paired t-test data: DRUG.1 and DRUG.10 t = 3.1623, df = 4, p-value = 0.03411 alternative hypothesis: true difference in means is not equal to 0 95 percent confidence interval: 2.440219 37.559781 sample estimates: mean of the differences 20 > t.test(DRUG.10 - DRUG.1, mu=0) # same results One Sample t-test data: DRUG.10 - DRUG.1 t = -3.1623, df = 4, p-value = 0.03411 alternative hypothesis: true mean is not equal to 0 95 percent confidence interval: -37.559781 -2.440219 sample estimates: mean of x -20 > t.test(DRUG.1, DRUG.10) # non-paired Welch Two Sample t-test data: DRUG.1 and DRUG.10 t = 1.3868, df = 8, p-value = 0.2029 alternative hypothesis: true difference in means is not equal to 0 95 percent confidence interval: -13.25766 53.25766 sample estimates: mean of x mean of y 152 132 > detach(bpress) > attach(bpress) > t.test(PLACEBO.10, DRUG.10, alt="greater") # one-tailed test Welch Two Sample t-test data: PLACEBO.10 and DRUG.10 t = 2.4509, df = 6.4729, p-value = 0.0234 alternative hypothesis: true difference in means is greater than 0 95 percent confidence interval: 6.305348 Inf sample estimates: mean of x mean of y 161 132 > t.test(PLACEBO.10, DRUG.10) # "common" test Welch Two Sample t-test data: PLACEBO.10 and DRUG.10 t = 2.4509, df = 6.4729, p-value = 0.04681 alternative hypothesis: true difference in means is not equal to 0 95 percent confidence interval: 0.5528512 57.4471488 sample estimates: mean of x mean of y 161 132 > detach(bpress) > tapply(ToothGrowth$len, ToothGrowth$supp, Normality) # shipunov OJ VC "NOT NORMAL" "NORMAL" > boxplot(len ~ supp, data=ToothGrowth, notch=TRUE) > wilcox.test(len ~ supp, data=ToothGrowth) Warning in wilcox.test.default(x = c(15.2, 21.5, 17.6, 9.7, 14.5, 10, 8.2, : cannot compute exact p-value with ties Wilcoxon rank sum test with continuity correction data: 
len by supp W = 575.5, p-value = 0.06449 alternative hypothesis: true location shift is not equal to 0 > w0 <- ChickWeight$weight[ChickWeight$Time == 0] > w2 <- ChickWeight$weight[ChickWeight$Time == 2] > sapply(data.frame(w0, w2), Normality) w0 w2 "NOT NORMAL" "NOT NORMAL" > boxplot(w0, w2) > wilcox.test(w0, w2, paired=TRUE) Wilcoxon signed rank test with continuity correction data: w0 and w2 V = 8, p-value = 1.132e-09 alternative hypothesis: true location shift is not equal to 0 > pdf(file="pics/20320.pdf"); oldpar <- par(mar=c(2, 2, 0, 1)) > plot(extra ~ group, data=sleep) > par(oldpar); dev.off() pdf 2 > tapply(sleep$extra, sleep$group, Normality) # shipunov 1 2 "NORMAL" "NORMAL" > t.test(extra ~ group, data=sleep, paired=TRUE) Paired t-test data: extra by group t = -4.0621, df = 9, p-value = 0.002833 alternative hypothesis: true difference in means is not equal to 0 95 percent confidence interval: -2.4598858 -0.7001142 sample estimates: mean of the differences -1.58 > power.t.test(n=10, sig.level=0.05, d=1.58) Two-sample t test power calculation n = 10 delta = 1.58 sd = 1 sig.level = 0.05 power = 0.9160669 alternative = two.sided NOTE: n is number in *each* group > (aa <- 1:9) [1] 1 2 3 4 5 6 7 8 9 > (bb <- rep(5, 9)) [1] 5 5 5 5 5 5 5 5 5 > (xx <- 51:59) [1] 51 52 53 54 55 56 57 58 59 > wilcox.test(Ozone ~ Month, data=airquality, + subset = Month %in% c(5, 8), conf.int=TRUE) Warning in wilcox.test.default(x = c(41L, 36L, 12L, 18L, 28L, 23L, 19L, : cannot compute exact p-value with ties Warning in wilcox.test.default(x = c(41L, 36L, 12L, 18L, 28L, 23L, 19L, : cannot compute exact confidence intervals with ties Wilcoxon rank sum test with continuity correction data: Ozone by Month W = 127.5, p-value = 0.0001208 alternative hypothesis: true location shift is not equal to 0 95 percent confidence interval: -52.99999 -14.99998 sample estimates: difference in location -31.99996 > pdf(file="pics/20890.pdf"); oldpar <- par(mar=c(2, 2, 0, 1)) > boxplot(Ozone ~ Month, 
data=airquality, + subset=Month %in% c(5, 8), notch=TRUE) > par(oldpar); dev.off() pdf 2 > ar <- read.table("data/argon.txt") > unstack(ar, form=V2 ~ V1) air chemical 1 2.31017 2.30143 2 2.30986 2.29890 3 2.31010 2.29816 4 2.31001 2.30182 5 2.31024 2.29869 6 2.31010 2.29940 7 2.31028 2.29849 8 NA 2.29869 > pdf(file="pics/28580.pdf"); oldpar <- par(mar=c(2, 3, 0, 1)) > means <- tapply(ar$V2, ar$V1, mean, na.rm=TRUE) > oldpar <- par(mfrow=1:2) > boxplot(V2 ~ V1, data=ar) > barplot(means) > par(oldpar) > par(oldpar); dev.off() pdf 2 > # \subsection{Effect sizes} > > wilcox.test(1:10, 1:10 + 0.001, paired=TRUE) Warning in wilcox.test.default(1:10, 1:10 + 0.001, paired = TRUE) : cannot compute exact p-value with ties Wilcoxon signed rank test with continuity correction data: 1:10 and 1:10 + 0.001 V = 0, p-value = 0.005355 alternative hypothesis: true location shift is not equal to 0 > wilcox.test(1:10, 1:10 + 0.0001, paired=TRUE) Warning in wilcox.test.default(1:10, 1:10 + 1e-04, paired = TRUE) : cannot compute exact p-value with ties Wilcoxon signed rank test with continuity correction data: 1:10 and 1:10 + 1e-04 V = 0, p-value = 0.004237 alternative hypothesis: true location shift is not equal to 0 > library(effsize) > cohen.d(extra ~ group, data=sleep) Cohen's d d estimate: -0.8321811 (large) 95 percent confidence interval: lower upper -1.8115649 0.1472027 > cliff.delta(1:10, 1:10+0.001) Cliff's Delta delta estimate: -0.1 (negligible) 95 percent confidence interval: lower upper -0.5344426 0.3762404 > cliff.delta(Ozone ~ Month, data=airquality, + subset = Month %in% c(5, 8)) Cliff's Delta delta estimate: -0.2991453 (small) 95 percent confidence interval: lower upper -0.6255598 0.1163964 > K(extra ~ group, data=sleep) # shipunov 0.3462627 > summary(K(extra ~ group, data=sleep)) Lubischew's K Effect P 0.35 Weak 0.34 > summary(K(aa*3, aa*10)) # shipunov Lubischew's K Effect P 1.5 Fairly moderate 0.19 > cliff.delta(aa*3, aa*10) Cliff's Delta delta estimate: -0.7777778 
(large) 95 percent confidence interval: lower upper -0.9493811 -0.2486473 > summary(K(1:10, 1:10+0.001, mad=TRUE)) # shipunov Lubischew's K Effect P 0 Extremely weak 0.5 > (dd <- K(Ozone ~ Month, + data=airquality[airquality$Month %in% c(5, 8), ], + mad=TRUE)) # shipunov 0.6141992 > summary(dd) Lubischew's K Effect P 0.61 Fairly weak 0.29 > # \section{If there are more than two samples: ANOVA} > > # \subsection{One way} > > pdf(file="pics/27420.pdf"); oldpar <- par(mar=c(2, 4, 0, 1)) > str(hwc) # shipunov 'data.frame': 90 obs. of 3 variables: $ COLOR : Factor w/ 3 levels "black","blond",..: 1 1 1 1 1 1 1 1 1 1 ... $ WEIGHT: int 80 82 79 80 81 79 82 83 78 80 ... $ HEIGHT: int 166 170 170 171 169 171 169 170 167 166 ... > boxplot(WEIGHT ~ COLOR, data=hwc, ylab="Weight, kg") > par(oldpar); dev.off() pdf 2 > sapply(hwc[sapply(hwc, is.numeric)], Normality) # shipunov WEIGHT HEIGHT "NORMAL" "NORMAL" > tapply(hwc$WEIGHT, hwc$COLOR, var) black blond brown 8.805747 9.219540 8.896552 > wc.aov <- aov(WEIGHT ~ COLOR, data=hwc) > summary(wc.aov) Df Sum Sq Mean Sq F value Pr(>F) COLOR 2 435.1 217.54 24.24 4.29e-09 *** Residuals 87 780.7 8.97 --- Signif. 
codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 > df1 <- 2 > df2 <- 87 > group.size <- 30 > (sq.between <- sum(tapply(hwc$WEIGHT, hwc$COLOR, + function(.x) (mean(.x) - mean(hwc$WEIGHT))^2))*group.size) [1] 435.0889 > (mean.sq.between <- sq.between/df1) [1] 217.5444 > (sq.within <- sum(tapply(hwc$WEIGHT, hwc$COLOR, + function(.x) sum((.x - mean(.x))^2)))) [1] 780.7333 > (mean.sq.within <- sq.within/df2) [1] 8.973946 > (f.value <- mean.sq.between/mean.sq.within) [1] 24.24178 > (p.value <- (1 - pf(f.value, df1, df2))) [1] 4.285683e-09 > aggregate(hwc[, -1], by=list(COLOR=hwc[, 1]), var) COLOR WEIGHT HEIGHT 1 black 8.805747 9.154023 2 blond 9.219540 8.837931 3 brown 8.896552 9.288506 > bartlett.test(WEIGHT ~ COLOR, data=hwc) Bartlett test of homogeneity of variances data: WEIGHT by COLOR Bartlett's K-squared = 0.016654, df = 2, p-value = 0.9917 > fligner.test(WEIGHT ~ COLOR, data=hwc) Fligner-Killeen test of homogeneity of variances data: WEIGHT by COLOR Fligner-Killeen:med chi-squared = 1.1288, df = 2, p-value = 0.5687 > Normality(wc.aov$residuals) [1] "NORMAL" > Eta2 <- function(aov) + { + summary.lm(aov)$r.squared + } > (ewc <- Eta2(wc.aov)) [1] 0.3578557 > Mag(ewc) # shipunov [1] "high" > Normality(chickwts$weight) [1] "NORMAL" > bartlett.test(weight ~ feed, data=chickwts) Bartlett test of homogeneity of variances data: weight by feed Bartlett's K-squared = 3.2597, df = 5, p-value = 0.66 > boxplot(weight ~ feed, data=chickwts) > chicks.aov <- aov(weight ~ feed, data=chickwts) > summary(chicks.aov) Df Sum Sq Mean Sq F value Pr(>F) feed 5 231129 46226 15.37 5.94e-10 *** Residuals 65 195556 3009 --- Signif. 
codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 > Eta2(chicks.aov) [1] 0.5416855 > Mag(Eta2(chicks.aov)) # shipunov [1] "very high" > pairwise.t.test(hwc$WEIGHT, hwc$COLOR) Pairwise comparisons using t tests with pooled SD data: hwc$WEIGHT and hwc$COLOR black blond blond 1.7e-08 - brown 8.4e-07 0.32 P value adjustment method: holm > TukeyHSD(wc.aov) Tukey multiple comparisons of means 95% family-wise confidence level Fit: aov(formula = WEIGHT ~ COLOR, data = hwc) $COLOR diff lwr upr p adj blond-black -5.0000000 -6.844335 -3.155665 0.0000000 brown-black -4.2333333 -6.077668 -2.388999 0.0000013 brown-blond 0.7666667 -1.077668 2.611001 0.5843745 > pairwise.Eff(hwc$WEIGHT, hwc$COLOR, eff="cohen.d") # shipunov black blond brown black blond 1.67 (large) brown 1.42 (large) -0.25 (small) > Normality(PlantGrowth$weight) [1] "NORMAL" > bartlett.test(weight ~ group, data=PlantGrowth) Bartlett test of homogeneity of variances data: weight by group Bartlett's K-squared = 2.8786, df = 2, p-value = 0.2371 > plants.aov <- aov(weight ~ group, data=PlantGrowth) > summary(plants.aov) Df Sum Sq Mean Sq F value Pr(>F) group 2 3.766 1.8832 4.846 0.0159 * Residuals 27 10.492 0.3886 --- Signif. 
codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 > Eta2(plants.aov) [1] 0.2641483 > Mag(Eta2(plants.aov)) # shipunov [1] "high" > boxplot(weight ~ group, data=PlantGrowth) > with(PlantGrowth, pairwise.t.test(weight, group)) Pairwise comparisons using t tests with pooled SD data: weight and group ctrl trt1 trt1 0.194 - trt2 0.175 0.013 P value adjustment method: holm > boxplot(WEIGHT ~ COLOR, data=hwc2) # shipunov > sapply(hwc2[, 2:3], Normality) # shipunov WEIGHT HEIGHT "NORMAL" "NORMAL" > tapply(hwc2$WEIGHT, hwc2$COLOR, var) black blond brown 62.27126 23.45862 31.11379 > bartlett.test(WEIGHT ~ COLOR, data=hwc2) Bartlett test of homogeneity of variances data: WEIGHT by COLOR Bartlett's K-squared = 7.4914, df = 2, p-value = 0.02362 > oneway.test(WEIGHT ~ COLOR, data=hwc2) One-way analysis of means (not assuming equal variances) data: WEIGHT and COLOR F = 7.0153, num df = 2.000, denom df = 56.171, p-value = 0.001907 > (e2 <- Eta2(aov(WEIGHT ~ COLOR, data=hwc2))) [1] 0.1626432 > Mag(e2) [1] "medium" > pairwise.t.test(hwc2$WEIGHT, hwc2$COLOR) # most applicable post hoc Pairwise comparisons using t tests with pooled SD data: hwc2$WEIGHT and hwc2$COLOR black blond blond 0.00047 - brown 0.00797 0.32349 P value adjustment method: holm > Normality(InsectSprays$count) # shipunov [1] "NOT NORMAL" > Normality(sqrt(InsectSprays$count)) [1] "NORMAL" > bartlett.test(sqrt(count) ~ spray, data=InsectSprays)$p.value [1] 0.5855673 > boxplot(WEIGHT ~ COLOR, data=hwc3) > sapply(hwc3[, 2:3], Normality) # shipunov WEIGHT HEIGHT "NOT NORMAL" "NOT NORMAL" > kruskal.test(WEIGHT ~ COLOR, data=hwc3) Kruskal-Wallis rank sum test data: WEIGHT by COLOR Kruskal-Wallis chi-squared = 32.859, df = 2, p-value = 7.325e-08 > Epsilon2 <- function(kw, n) # n is the number of cases + { + unname(kw$statistic/((n^2 - 1)/(n+1))) + } > kw <- kruskal.test(WEIGHT ~ COLOR, data=hwc3) > Epsilon2(kw, nrow(hwc3)) [1] 0.3691985 > Mag(Epsilon2(kw, nrow(hwc3))) # shipunov [1] "high" > boxplot(WEIGHT ~ COLOR, 
data=hwc3) > pairwise.wilcox.test(hwc3$WEIGHT, hwc3$COLOR) Warning in wilcox.test.default(xi, xj, paired = paired, ...) : cannot compute exact p-value with ties Warning in wilcox.test.default(xi, xj, paired = paired, ...) : cannot compute exact p-value with ties Warning in wilcox.test.default(xi, xj, paired = paired, ...) : cannot compute exact p-value with ties Pairwise comparisons using Wilcoxon rank sum test with continuity correction data: hwc3$WEIGHT and hwc3$COLOR black blond blond 1.1e-06 - brown 1.6e-05 0.056 P value adjustment method: holm > library(dunn.test) > dunn.test(hwc3$WEIGHT, hwc3$COLOR, method="holm", altp=TRUE) Kruskal-Wallis rank sum test data: x and group Kruskal-Wallis chi-squared = 32.8587, df = 2, p-value = 0 Comparison of x by group (Holm) Col Mean-| Row Mean | black blond ---------+---------------------- blond | 5.537736 | 0.0000* | brown | 4.051095 -1.486640 | 0.0001* 0.1371 alpha = 0.05 Reject Ho if p <= alpha > pdf(file="pics/anovas_to_crop.pdf") > library(plotrix) > plot(c(-1, 1), c(-1, 1), type="n", xlab="", ylab="", axes=FALSE) > draw.circle(0, .58, .3) > draw.circle(0, .29, .6) > draw.circle(0, 0, .9) > text(0, .75, "normal, \nvariances\nsimilar:", font=3) > text(0, .48, "aov()\nTukeyHSD()", font=2) > text(0, .12, "normal, \nvariances different:", font=3) > text(0, -.12, "oneway.test()\npairwise.t.test()", font=2) > text(0, -.5, "not normal:", font=3) > text(0, -.72, "kruskal.test()\npairwise.wilcox.test()\n or dunn.test()", font=2) > dev.off() pdf 2 > # \subsection{More than one way} > > pdf(file="pics/49740.pdf"); oldpar <- par(mar=c(2, 4, 0, 1)) > with(ToothGrowth, interaction.plot(supp, dose, len)) > par(oldpar); dev.off() pdf 2 > # \section{Is there an association? 
Analysis of tables} > > # \subsection{Contingency tables} > > tea <- read.table("data/tea.txt", h=TRUE) > head(tea) GUESS REALITY 1 Milk Milk 2 Milk Milk 3 Milk Milk 4 Milk Tea 5 Tea Tea 6 Tea Tea > (tea.t <- table(tea)) REALITY GUESS Milk Tea Milk 3 1 Tea 1 3 > xtabs(~ GUESS + REALITY, data=tea) REALITY GUESS Milk Tea Milk 3 1 Tea 1 3 > ftable(Titanic) Survived No Yes Class Sex Age 1st Male Child 0 5 Adult 118 57 Female Child 0 1 Adult 4 140 2nd Male Child 0 11 Adult 154 14 Female Child 0 13 Adult 13 80 3rd Male Child 35 13 Adult 387 75 Female Child 17 14 Adult 89 76 Crew Male Child 0 0 Adult 670 192 Female Child 0 0 Adult 3 20 > d <- rep(LETTERS[1:3], 10) > is.na(d) <- 3:4 > d [1] "A" "B" NA NA "B" "C" "A" "B" "C" "A" "B" "C" "A" "B" "C" "A" "B" "C" "A" [20] "B" "C" "A" "B" "C" "A" "B" "C" "A" "B" "C" > table(d, useNA="ifany") d A B C 9 10 9 2 > pdf(file="pics/21660.pdf"); oldpar <- par(mar=c(2, 2, 0, 1)) > titanic <- apply(Titanic, c(1, 4), sum) > titanic Survived Class No Yes 1st 122 203 2nd 167 118 3rd 528 178 Crew 673 212 > mosaicplot(titanic, col=c("#485392", "#204F15"), + main="", cex.axis=1) > par(oldpar); dev.off() pdf 2 > comfort <- ifelse(airquality$Temp < 64 | airquality$Temp > 86, + "uncomfortable", "comfortable") > comf.month <- table(comfort, airquality$Month) > comf.month comfort 5 6 7 8 9 comfortable 18 25 24 21 24 uncomfortable 13 5 7 10 6 > pdf(file="pics/31120.pdf"); oldpar <- par(mar=c(2, 4, 1, 2)) > spineplot(t(comf.month)) > par(oldpar); dev.off() pdf 2 > # \subsection{Table tests} > > chisq.test(comf.month) Pearson's Chi-squared test data: comf.month X-squared = 6.6499, df = 4, p-value = 0.1556 > df <- 4 > (expected <- outer(rowSums(comf.month), + colSums(comf.month), "*")/sum(comf.month)) 5 6 7 8 9 comfortable 22.69281 21.960784 22.69281 22.69281 21.960784 uncomfortable 8.30719 8.039216 8.30719 8.30719 8.039216 > (chi.squared <- sum((comf.month - expected)^2/expected)) [1] 6.649898 > (p.value <- 1 - pchisq(chi.squared, df)) [1] 0.1555872 > 
pdf(file="pics/21960.pdf"); oldpar <- par(mar=c(2, 2, 0, 1)) > (HE <- margin.table(HairEyeColor, 1:2)) Eye Hair Brown Blue Hazel Green Black 68 20 15 5 Brown 119 84 54 29 Red 26 17 14 14 Blond 7 94 10 16 > assocplot(HE) > par(oldpar); dev.off() pdf 2 > chisq.test(HE) Pearson's Chi-squared test data: HE X-squared = 138.29, df = 9, p-value < 2.2e-16 > chisq.test(titanic) Pearson's Chi-squared test data: titanic X-squared = 190.4, df = 3, p-value < 2.2e-16 > pairwise.Table2.test(titanic) # shipunov Pairwise comparisons using Pearson's Chi-squared test data: titanic 1st 2nd 3rd 2nd 4.7e-07 - - 3rd < 2e-16 8.3e-07 - Crew < 2e-16 3.9e-08 0.6 P value adjustment method: BH > tox <- read.table("data/poisoning.txt", h=TRUE) > head(tox) ILL CHEESE CRABDIP CRISPS BREAD CHICKEN RICE CAESAR TOMATO ICECREAM CAKE 1 1 1 1 1 2 1 1 1 1 1 1 2 2 1 1 1 2 1 2 2 2 1 1 3 1 2 2 1 2 1 2 1 2 1 1 4 1 1 2 1 1 1 2 1 2 1 1 5 1 1 1 1 2 1 1 1 1 2 1 6 1 1 1 1 1 1 2 1 1 2 1 JUICE WINE COFFEE 1 1 1 1 2 1 1 2 3 2 1 2 4 2 1 2 5 1 1 1 6 1 2 2 > tox.1 <- lapply(tox[, -1], function(.x) table(tox[, 1], .x)) > tox.2 <- array(unlist(tox.1), + dim=c(dim(tox.1[[1]]), length(tox.1))) # or simply c(2, 2, 13) > dimnames(tox.2) <- list(c("ill","not ill"), + c("took","didn't take"), names(tox.1)) > tox.2[,,"TOMATO"] took didn't take ill 24 5 not ill 6 10 > pdf(file="pics/33940_to_crop.pdf") > fourfoldplot(tox.2, conf.level=0, col=c("yellow","black")) > dev.off() pdf 2 > cbind(apply(tox.2, 3, function(.x) chisq.test(.x)$p.value)) Warning in chisq.test(.x) : Chi-squared approximation may be incorrect Warning in chisq.test(.x) : Chi-squared approximation may be incorrect Warning in chisq.test(.x) : Chi-squared approximation may be incorrect Warning in chisq.test(.x) : Chi-squared approximation may be incorrect Warning in chisq.test(.x) : Chi-squared approximation may be incorrect Warning in chisq.test(.x) : Chi-squared approximation may be incorrect Warning in chisq.test(.x) : Chi-squared approximation may be incorrect 
[,1] CHEESE 0.8408996794 CRABDIP 0.9493138514 CRISPS 1.0000000000 BREAD 0.3498177243 CHICKEN 0.3114822175 RICE 0.5464344359 CAESAR 0.0002034102 TOMATO 0.0059125029 ICECREAM 0.5977125948 CAKE 0.8694796709 JUICE 1.0000000000 WINE 1.0000000000 COFFEE 0.7265552461 > p.adjust(c(0.005, 0.05, 0.1), method="BH") [1] 0.015 0.075 0.100 > chisq.test(c(5474, 1850), p=c(3/4, 1/4)) Chi-squared test for given probabilities data: c(5474, 1850) X-squared = 0.26288, df = 1, p-value = 0.6081 > pdf(file="pics/55440.pdf"); oldpar <- par(mar=c(4, 6, 3, 1)) > sp <- read.table("data/species.txt", sep="\t") > species <- sp[, 2] > names(species) <- sp[, 1] > Dotchart(rev(sort(log10(species))), + xlab="Decimal logarithm of species number") > chisq.test(species) Chi-squared test for given probabilities data: species X-squared = 4771684, df = 7, p-value < 2.2e-16 > par(oldpar); dev.off() pdf 2 > fisher.test(tea.t) Fisher's Exact Test for Count Data data: tea.t p-value = 0.4857 alternative hypothesis: true odds ratio is not equal to 1 95 percent confidence interval: 0.2117329 621.9337505 sample estimates: odds ratio 6.408309 > fourfoldplot(tea.t) > ee <- read.table("data/teeth.txt", h=TRUE) > chisq.test(table(ee)) Warning in chisq.test(table(ee)) : Chi-squared approximation may be incorrect Pearson's Chi-squared test with Yates' continuity correction data: table(ee) X-squared = 1.1398, df = 1, p-value = 0.2857 > summary(table(ee)) # No correction in summary.table() Number of cases in table: 42 Number of factors: 2 Test for independence of all factors: Chisq = 2.3858, df = 1, p-value = 0.1224 Chi-squared approximation may be incorrect > chisq.test(table(ee), simulate.p.value=TRUE) Pearson's Chi-squared test with simulated p-value (based on 2000 replicates) data: table(ee) X-squared = 2.3858, df = NA, p-value = 0.1754 > sqrt(chisq.test(tea.t, correct=FALSE)$statistic/sum(tea.t)) Warning in chisq.test(tea.t, correct = FALSE) : Chi-squared approximation may be incorrect X-squared 0.5 > (x <- 
margin.table(Titanic, 1:2)) Sex Class Male Female 1st 180 145 2nd 179 106 3rd 510 196 Crew 862 23 > VTcoeffs(x) # shipunov coefficients values comments 1 Cramer's V 0.3987227 medium 2 Cramer's V (corrected) 0.3970098 medium 3 Tschuprow's T 0.3029637 4 Tschuprow's T (corrected) 0.3016622 > # \section{Answers to exercises} > > # \subsection{Exercises on two samples} > > aa <- c(1, 2, 3, 4, 5, 6, 7, 8, 9) > bb <- c(5, 5, 5, 5, 5, 5, 5, 5, 5) > dif <- aa - bb > pos.dif <- dif[dif > 0] > prop.test(length(pos.dif), length(dif)) Warning in prop.test(length(pos.dif), length(dif)) : Chi-squared approximation may be incorrect 1-sample proportions test with continuity correction data: length(pos.dif) out of length(dif), null probability 0.5 X-squared = 0, df = 1, p-value = 1 alternative hypothesis: true p is not equal to 0.5 95 percent confidence interval: 0.1534306 0.7734708 sample estimates: p 0.4444444 > ozone.month <- airquality[, c("Ozone", "Month")] > ozone.month.list <- unstack(ozone.month) > sapply(ozone.month.list, Normality) # shipunov 5 6 7 8 9 "NOT NORMAL" "NORMAL" "NORMAL" "NORMAL" "NOT NORMAL" > sapply(unstack(ar, form=V2 ~ V1), Normality) air chemical "NORMAL" "NOT NORMAL" > wilcox.test(jitter(V2) ~ V1, data=ar) Wilcoxon rank sum exact test data: jitter(V2) by V1 W = 56, p-value = 0.0003108 alternative hypothesis: true location shift is not equal to 0 > cashiers <- read.table("data/cashiers.txt", h=TRUE) > head(cashiers) CASHIER.1 CASHIER.2 1 3 12 2 12 12 3 13 9 4 5 6 5 4 2 6 11 9 > (cashiers.m <- sapply(cashiers, median)) CASHIER.1 CASHIER.2 8 9 > with(cashiers, wilcox.test(CASHIER.1, CASHIER.2, alt="greater")) Warning in wilcox.test.default(CASHIER.1, CASHIER.2, alt = "greater") : cannot compute exact p-value with ties Wilcoxon rank sum test with continuity correction data: CASHIER.1 and CASHIER.2 W = 236, p-value = 0.3523 alternative hypothesis: true location shift is greater than 0 > grades <- read.table("data/grades.txt") > classes <- split(grades$V1, 
grades$V2) > sapply(classes, Normality) # shipunov A1 A2 B1 "NOT NORMAL" "NOT NORMAL" "NOT NORMAL" > sapply(classes, median, na.rm=TRUE) A1 A2 B1 4 4 5 > wilcox.test(classes$A1, classes$A2, paired=TRUE, conf.int=TRUE) Warning in wilcox.test.default(classes$A1, classes$A2, paired = TRUE, conf.int = TRUE) : cannot compute exact p-value with ties Warning in wilcox.test.default(classes$A1, classes$A2, paired = TRUE, conf.int = TRUE) : cannot compute exact confidence interval with ties Warning in wilcox.test.default(classes$A1, classes$A2, paired = TRUE, conf.int = TRUE) : cannot compute exact p-value with zeroes Warning in wilcox.test.default(classes$A1, classes$A2, paired = TRUE, conf.int = TRUE) : cannot compute exact confidence interval with zeroes Wilcoxon signed rank test with continuity correction data: classes$A1 and classes$A2 V = 15.5, p-value = 0.8605 alternative hypothesis: true location shift is not equal to 0 95 percent confidence interval: -1.499923 1.500018 sample estimates: (pseudo)median 0 > wilcox.test(classes$B1, classes$A1, alt="greater", conf.int=TRUE) Warning in wilcox.test.default(classes$B1, classes$A1, alt = "greater", : cannot compute exact p-value with ties Warning in wilcox.test.default(classes$B1, classes$A1, alt = "greater", : cannot compute exact confidence intervals with ties Wilcoxon rank sum test with continuity correction data: classes$B1 and classes$A1 W = 306, p-value = 0.02382 alternative hypothesis: true location shift is greater than 0 95 percent confidence interval: 6.957242e-05 Inf sample estimates: difference in location 6.160018e-05 > cliff.delta(classes$A1, classes$A2) Cliff's Delta delta estimate: 0.03557312 (negligible) 95 percent confidence interval: lower upper -0.2744203 0.3388690 > cliff.delta(classes$B1, classes$A1) Cliff's Delta delta estimate: 0.3246753 (small) 95 percent confidence interval: lower upper 0.0003599976 0.5871917025 > aa <- read.table( + "http://ashipunov.me/shipunov/open/aegopodium.txt", h=TRUE) > 
aa$SUN <- factor(aa$SUN, labels=c("shade","sun")) > Str(aa) 'data.frame': 100 obs. of 5 variables: 1 PET.L : num 25.5 24.1 27.4 24.9 26.2 37.4 18 30.3 26.1 17.8 ... 2 TERM.L: num 8.9 5.8 8.2 6.8 8.2 13.1 7.2 7.8 7.1 5.8 ... 3 LEAF.L: num 34.4 29.9 35.6 31.7 34.4 50.5 25.2 38.1 33.2 23.6 ... 4 BLADES: int 5 7 7 7 7 4 7 6 7 7 ... 5 SUN : Factor w/ 2 levels "shade","sun": 1 1 1 1 1 1 1 1 1 1 ... > pdf(file="pics/47860.pdf"); oldpar <- par(mar=c(1, 2, 0, 1)) > aggregate(aa[, -5], list(light=aa[, 5]), Normality) # shipunov light PET.L TERM.L LEAF.L BLADES 1 shade NORMAL NORMAL NORMAL NOT NORMAL 2 sun NORMAL NOT NORMAL NORMAL NOT NORMAL > Linechart(aa[, 1:4], aa[, 5], xmarks=FALSE, lcolor=1, + se.lwd=2, mad=TRUE) # shipunov > par(oldpar); dev.off() pdf 2 > t.test(LEAF.L ~ SUN, data=aa) Welch Two Sample t-test data: LEAF.L by SUN t = 14.846, df = 63.691, p-value < 2.2e-16 alternative hypothesis: true difference in means is not equal to 0 95 percent confidence interval: 14.20854 18.62746 sample estimates: mean in group shade mean in group sun 35.534 19.116 > library(effsize) > cohen.d(LEAF.L ~ SUN, data=aa) Cohen's d d estimate: 2.969218 (large) 95 percent confidence interval: lower upper 2.393787 3.544650 > summary(K(LEAF.L ~ SUN, data=aa)) Lubischew's K Effect P 4.41 Strong 0.07 > # \subsection{Exercises on ANOVA} > > summary(aov(HEIGHT ~ COLOR, data=hwc)) Df Sum Sq Mean Sq F value Pr(>F) COLOR 2 2787.3 1393.6 153.3 <2e-16 *** Residuals 87 791.1 9.1 --- Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 > pairwise.t.test(hwc$HEIGHT, hwc$COLOR) Pairwise comparisons using t tests with pooled SD data: hwc$HEIGHT and hwc$COLOR black blond blond < 2e-16 - brown 1.7e-10 3.3e-16 P value adjustment method: holm > mm <- read.table( + "http://ashipunov.me/shipunov/open/melampyrum.txt", h=TRUE) > Str(mm) 'data.frame': 126 obs. of 9 variables: 1 LOC : int 1 1 1 1 1 1 1 1 1 1 ... 2 P.HEIGHT : int 246 235 270 260 300 250 205 190 275 215 ... 
3 NODES * int NA NA NA NA NA NA NA NA NA NA ... 4 V.NODES * int NA NA NA NA NA NA NA NA NA NA ... 5 LEAF.L * int 23 27 35 20 38 46 17 22 42 26 ... 6 LEAF.W * int 5 3 4 4 6 5 3 3 4 3 ... 7 LEAF.MAXW* int 3 5 5 4 6 11 5 4 5 3 ... 8 TEETH * int 3 2 2 6 2 4 4 5 4 3 ... 9 TOOTH.L * int 4 2 2 5 6 2 11 3 5 4 ... > pdf(file="pics/61660.pdf") > old.par <- par(mfrow=c(2, 1), mai=c(0.5, 0.5, 0.1, 0.1)) > boxplot(P.HEIGHT ~ LOC, data=mm, col=grey(0.8)) > boxplot(LEAF.L ~ LOC, data=mm, col=rgb(173, 204, 90, max=255)) > par(old.par) > dev.off() pdf 2 > sapply(mm[, c(2, 5)], Normality) P.HEIGHT LEAF.L "NORMAL" "NOT NORMAL" > bartlett.test(P.HEIGHT ~ LOC, data=mm) Bartlett test of homogeneity of variances data: P.HEIGHT by LOC Bartlett's K-squared = 17.014, df = 6, p-value = 0.00923 > oneway.test(P.HEIGHT ~ LOC, data=mm) One-way analysis of means (not assuming equal variances) data: P.HEIGHT and LOC F = 18.376, num df = 6.000, denom df = 49.765, p-value = 4.087e-11 > pairwise.t.test(mm$P.HEIGHT, mm$LOC) Pairwise comparisons using t tests with pooled SD data: mm$P.HEIGHT and mm$LOC 1 2 3 4 5 6 2 0.05381 - - - - - 3 0.00511 1.1e-08 - - - - 4 1.00000 0.00779 0.00511 - - - 5 1.00000 0.04736 0.00041 1.00000 - - 6 4.2e-05 1.8e-11 0.62599 2.3e-05 1.1e-06 - 7 0.28824 1.9e-05 0.39986 0.39986 0.09520 0.01735 P value adjustment method: holm > kruskal.test(LEAF.L ~ LOC, data=mm) Kruskal-Wallis rank sum test data: LEAF.L by LOC Kruskal-Wallis chi-squared = 22.6, df = 6, p-value = 0.0009422 > pairwise.wilcox.test(mm$LEAF.L, mm$LOC) Warning in wilcox.test.default(xi, xj, paired = paired, ...) : cannot compute exact p-value with ties Warning in wilcox.test.default(xi, xj, paired = paired, ...) : cannot compute exact p-value with ties Warning in wilcox.test.default(xi, xj, paired = paired, ...) : cannot compute exact p-value with ties Warning in wilcox.test.default(xi, xj, paired = paired, ...) : cannot compute exact p-value with ties Warning in wilcox.test.default(xi, xj, paired = paired, ...) 
: cannot compute exact p-value with ties Warning in wilcox.test.default(xi, xj, paired = paired, ...) : cannot compute exact p-value with ties Warning in wilcox.test.default(xi, xj, paired = paired, ...) : cannot compute exact p-value with ties Warning in wilcox.test.default(xi, xj, paired = paired, ...) : cannot compute exact p-value with ties Warning in wilcox.test.default(xi, xj, paired = paired, ...) : cannot compute exact p-value with ties Warning in wilcox.test.default(xi, xj, paired = paired, ...) : cannot compute exact p-value with ties Warning in wilcox.test.default(xi, xj, paired = paired, ...) : cannot compute exact p-value with ties Warning in wilcox.test.default(xi, xj, paired = paired, ...) : cannot compute exact p-value with ties Warning in wilcox.test.default(xi, xj, paired = paired, ...) : cannot compute exact p-value with ties Warning in wilcox.test.default(xi, xj, paired = paired, ...) : cannot compute exact p-value with ties Warning in wilcox.test.default(xi, xj, paired = paired, ...) : cannot compute exact p-value with ties Warning in wilcox.test.default(xi, xj, paired = paired, ...) : cannot compute exact p-value with ties Warning in wilcox.test.default(xi, xj, paired = paired, ...) : cannot compute exact p-value with ties Warning in wilcox.test.default(xi, xj, paired = paired, ...) : cannot compute exact p-value with ties Warning in wilcox.test.default(xi, xj, paired = paired, ...) : cannot compute exact p-value with ties Warning in wilcox.test.default(xi, xj, paired = paired, ...) : cannot compute exact p-value with ties Warning in wilcox.test.default(xi, xj, paired = paired, ...) 
: cannot compute exact p-value with ties Pairwise comparisons using Wilcoxon rank sum test with continuity correction data: mm$LEAF.L and mm$LOC 1 2 3 4 5 6 2 0.6249 - - - - - 3 1.0000 0.1538 - - - - 4 0.4999 0.0064 1.0000 - - - 5 1.0000 0.5451 1.0000 1.0000 - - 6 0.6599 1.0000 0.1434 0.0028 0.5451 - 7 1.0000 1.0000 1.0000 0.5904 1.0000 1.0000 P value adjustment method: holm > # \subsection{Exercises on tables} > > pr <- read.table("data/seedlings.txt", h=TRUE) > str(pr) 'data.frame': 80 obs. of 2 variables: $ CID : int 63 63 63 63 63 63 63 63 63 63 ... $ GERM.14: int 1 1 1 1 1 1 1 1 1 1 ... > pdf(file="pics/59370.pdf") > (pr.t <- table(pr)) GERM.14 CID 0 1 0 1 19 63 3 17 80 17 3 105 10 10 > Dotchart(t(pr.t)) > par(oldpar) > pdf(file="pics/47350_to_crop.pdf") > library(vcd) Loading required package: grid > assoc(pr.t, shade=TRUE, gp=shading_Friendly2, + gp_args=list(interpolate=c(1, 1.8))) > par(oldpar) > chisq.test(pr.t, simulate=TRUE) Pearson's Chi-squared test with simulated p-value (based on 2000 replicates) data: pr.t X-squared = 33.443, df = NA, p-value = 0.0004998 > pairwise.Table2.test(table(pr), exact=TRUE) Pairwise comparisons using Fisher's Exact Test data: table(pr) 0 63 80 63 0.6050 - - 80 2.4e-06 5.8e-05 - 105 0.0067 0.0489 0.0489 P value adjustment method: BH > tox <- read.table("data/poisoning.txt", h=TRUE) > tox.p.values <- apply(tox[, -1], 2, + function(.x) fisher.test(table(tox[, 1], .x))$p.value) > sort(round(p.adjust(tox.p.values, method="BH"), 3)) CAESAR TOMATO CHEESE CRABDIP BREAD CHICKEN RICE ICECREAM 0.001 0.021 0.985 0.985 0.985 0.985 0.985 0.985 CAKE COFFEE CRISPS JUICE WINE 0.985 0.985 1.000 1.000 1.000 > cc <-read.table( + "http://ashipunov.me/shipunov/open/cochlearia.txt", h=TRUE) > cc$LOC <- factor(cc$LOC, labels=paste0("loc", levels(cc$LOC))) > cc$IS.CREEPING <- factor(cc$IS.CREEPING, + labels=c("upright", "creeping")) > str(cc) 'data.frame': 174 obs. 
of 8 variables: $ LOC : Factor w/ 8 levels "loc1","loc2",..: 1 1 1 1 1 1 1 1 1 1 ... $ STEM : int 162 147 170 432 146 207 91 166 204 172 ... $ IS.CREEPING: Factor w/ 2 levels "upright","creeping": 1 1 1 1 1 1 1 2 2 2 ... $ LATERAL : int 0 0 0 0 0 0 0 0 0 0 ... $ PETAL.L : num NA NA NA NA NA NA NA 3 NA NA ... $ FRUIT.L : int 6 7 9 7 7 6 7 6 7 8 ... $ FRUIT.W : int 4 3 7 5 4 5 4 4 5 5 ... $ SEED.S : int 2 2 1 1 1 1 1 2 1 1 ... > pdf(file="pics/51170.pdf"); oldpar <- par(mar=c(4, 4, 1, 2)) > s.cols <- colorRampPalette(c("white", "forestgreen"))(5)[3:4] > spineplot(IS.CREEPING ~ LOC, data=cc, col=s.cols) > par(oldpar); dev.off() pdf 2 > (cc.lc <- xtabs(~ LOC + IS.CREEPING, data=cc)) IS.CREEPING LOC upright creeping loc1 15 24 loc2 39 0 loc3 0 1 loc4 5 0 loc5 15 0 loc6 46 0 loc7 15 0 loc8 7 7 > chisq.test(cc.lc, simulate.p.value=TRUE) Pearson's Chi-squared test with simulated p-value (based on 2000 replicates) data: cc.lc X-squared = 89.177, df = NA, p-value = 0.0004998 > VTcoeffs(cc.lc)[2, ] # shipunov Warning in chisq.test(table, correct = correct, ...) 
: Chi-squared approximation may be incorrect coefficients values comments 2 Cramer's V (corrected) 0.6872265 large > (betula.ll <- table(betula[betula$LOC < 3, c("LOC","LOBES")])) LOBES LOC 0 1 1 17 4 2 14 16 > pdf(file="pics/48360.pdf"); oldpar <- par(mar=c(4, 4, 0, 2)) > birch.cols <- colorRampPalette(c("black", "forestgreen"))(5)[3:4] > spineplot(betula.ll, col=birch.cols) > par(oldpar); dev.off() pdf 2 > prop.test(betula.ll) 2-sample test for equality of proportions with continuity correction data: betula.ll X-squared = 4.7384, df = 1, p-value = 0.0295 alternative hypothesis: two.sided 95 percent confidence interval: 0.05727638 0.62843791 sample estimates: prop 1 prop 2 0.8095238 0.4666667 > fisher.test(betula.ll) Fisher's Exact Test for Count Data data: betula.ll p-value = 0.01987 alternative hypothesis: true odds ratio is not equal to 1 95 percent confidence interval: 1.156525 23.904424 sample estimates: odds ratio 4.704463 > chisq.test(betula.ll) Pearson's Chi-squared test with Yates' continuity correction data: betula.ll X-squared = 4.7384, df = 1, p-value = 0.0295 > VTcoeffs(betula.ll)[2, ] # shipunov coefficients values comments 2 Cramer's V (corrected) 0.3159734 medium > (betula.lw <- table(betula[, c("LOBES", "WINGS")])) WINGS LOBES 0 1 0 61 69 1 50 45 > pdf(file="pics/50140.pdf") > fourfoldplot(betula.lw, col=birch.cols) > dev.off() pdf 2 > mcnemar.test(betula.lw) McNemar's Chi-squared test with continuity correction data: betula.lw McNemar's chi-squared = 2.7227, df = 1, p-value = 0.09893 > # \chapter{Two-dimensional data: models} > > # \section{Analysis of correlation} > > # \subsection{Plot it first} > > classic.desc <- function(.x) {c(mean=mean(.x, na.rm=TRUE), + var=var(.x, na.rm=TRUE))} > sapply(anscombe, classic.desc) x1 x2 x3 x4 y1 y2 y3 y4 mean 9 9 9 9 7.500909 7.500909 7.50000 7.500909 var 11 11 11 11 4.127269 4.127629 4.12262 4.123249 > pdf(file="pics/anscombe.pdf"); oldpar <- par(mar=c(4, 4, 2, 1)) > a.vars <- data.frame(i=c(1, 5), ii=c(2, 
6), + iii=c(3, 7), iv=c(4, 8)) > oldpar <- par(mfrow=c(2, 2), mar=c(4, 4, 1, 1)) > for (i in 1:4) { plot(anscombe[a.vars[, i]], pch=19, cex=1.2); + abline(lm(anscombe[rev(a.vars[, i])]), lty=2) } > par(oldpar); dev.off() pdf 2 > robust.desc <- function(.x) {c(median=median(.x, na.rm=TRUE), + IQR=IQR(.x, na.rm=TRUE), mad=mad(.x, na.rm=TRUE))} > sapply(anscombe, robust.desc) x1 x2 x3 x4 y1 y2 y3 y4 median 9.0000 9.0000 9.0000 8 7.580000 8.140000 7.110000 7.040000 IQR 5.0000 5.0000 5.0000 0 2.255000 2.255000 1.730000 2.020000 mad 4.4478 4.4478 4.4478 0 1.823598 1.467774 1.527078 1.897728 > # \subsection{Correlation} > > cor(5:15, 7:17) [1] 1 > cor(5:15, c(7:16, 23)) [1] 0.9375093 > cor(trees) Girth Height Volume Girth 1.0000000 0.5192801 0.9671194 Height 0.5192801 1.0000000 0.5982497 Volume 0.9671194 0.5982497 1.0000000 > with(trees, cor(Girth, Height)) [1] 0.5192801 > (v1 <- var(trees$Girth)) [1] 9.847914 > (v2 <- var(trees$Height)) [1] 40.6 > (v12 <- var(trees$Girth - trees$Height)) [1] 29.68125 > (pearson.r <- (v1 + v2 - v12)/(2*sqrt(v1)*sqrt(v2))) [1] 0.5192801 > with(trees, cov(Girth, Height)/(sd(Girth)*sd(Height))) [1] 0.5192801 > noquote(apply(cor(trees), 1:2, + function(.x) Mag(.x, squared=FALSE))) # shipunov Girth Height Volume Girth very high high very high Height high very high high Volume very high high very high > diag(cor(anscombe[, 1:4], anscombe[, 5:8])) [1] 0.8164205 0.8162365 0.8162867 0.8165214 > with(trees, cor(Girth, Height, method="spearman")) [1] 0.4408387 > diag(cor(anscombe[, 1:4], anscombe[, 5:8], method="s")) [1] 0.8181818 0.6909091 0.9909091 0.5000000 > with(trees, cor(Girth, Height, method="k")) [1] 0.3168641 > diag(cor(anscombe[, 1:4], anscombe[, 5:8], method="k")) [1] 0.6363636 0.5636364 0.9636364 0.4264014 > with(trees, cor.test(Girth, Height)) Pearson's product-moment correlation data: Girth and Height t = 3.2722, df = 29, p-value = 0.002758 alternative hypothesis: true correlation is not equal to 0 95 percent confidence interval: 
0.2021327 0.7378538 sample estimates: cor 0.5192801 > symnum(cor(longley)) GNP. GNP U A P Y E GNP.deflator 1 GNP B 1 Unemployed , , 1 Armed.Forces . . 1 Population B B , . 1 Year B B , . B 1 Employed B B . . B B 1 attr(,"legend") [1] 0 ‘ ’ 0.3 ‘.’ 0.6 ‘,’ 0.8 ‘+’ 0.9 ‘*’ 0.95 ‘B’ 1 > pdf(file="pics/22880_to_crop.pdf") > library(lattice) > mycorr <- abs(cor(iris[, -5])) > levelplot(mycorr, xlab="", ylab="") # axes names a redundant > par(oldpar); dev.off() pdf 2 > pdf(file="pics/23120_to_crop.pdf") > library(ellipse) Attaching package: ‘ellipse’ The following object is masked from ‘package:graphics’: pairs > colors <- cm.colors(7) > cor.l <- cor(longley) > plotcorr(cor.l, type="lower", col=colors[5*cor.l + 2]) > dev.off() pdf 2 > detach(package:ellipse) # because it masks base::pairs() > pdf(file="pics/35870.pdf") > tox.cor <- cor(tox, method="k") > Pleiad(tox.cor, corr=TRUE, lcol="black") # shipunov > dev.off() pdf 2 > Cor(tox, method="kendall", dec=2) # shipunov Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) 
: Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) 
: Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) 
: Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) 
: Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) 
: Cannot compute exact p-value with ties ILL CHEESE CRABDIP CRISPS BREAD CHICKEN RICE CAESAR TOMATO ILL - 0.08 0.06 -0.03 -0.19 0.21 0.14 0.6* 0.46* CHEESE 0.08 - -0.38* -0.11 0.34* 0.04 0.03 0.08 0.22 CRABDIP 0.06 -0.38* - 0.64* -0.3* 0.27 0.18 0.04 0.19 CRISPS -0.03 -0.11 0.64* - -0.05 0.33* 0.17 0.25 0.21 BREAD -0.19 0.34* -0.3* -0.05 - 0.05 0.09 0.03 -0.03 CHICKEN 0.21 0.04 0.27 0.33* 0.05 - 0.17 0.02 0.12 RICE 0.14 0.03 0.18 0.17 0.09 0.17 - 0.1 0.28 CAESAR 0.6* 0.08 0.04 0.25 0.03 0.02 0.1 - 0.64* TOMATO 0.46* 0.22 0.19 0.21 -0.03 0.12 0.28 0.64* - ICECREAM 0.13 -0.02 -0.13 -0.13 -0.09 0.21 0.04 -0.2 -0.13 CAKE 0.08 0.09 0.02 0.09 -0.26 0.08 -0.05 -0.08 0.11 JUICE -0.04 0.06 0.32* 0.16 0.11 0.07 0.08 -0.21 -0.17 WINE 0.01 0.14 -0.04 0.12 0.02 -0.03 -0.02 0.06 0.04 COFFEE -0.1 0.12 0.21 0.05 0.04 0.18 0.41* -0.13 0.03 ICECREAM CAKE JUICE WINE COFFEE ILL 0.13 0.08 -0.04 0.01 -0.1 CHEESE -0.02 0.09 0.06 0.14 0.12 CRABDIP -0.13 0.02 0.32* -0.04 0.21 CRISPS -0.13 0.09 0.16 0.12 0.05 BREAD -0.09 -0.26 0.11 0.02 0.04 CHICKEN 0.21 0.08 0.07 -0.03 0.18 RICE 0.04 -0.05 0.08 -0.02 0.41* CAESAR -0.2 -0.08 -0.21 0.06 -0.13 TOMATO -0.13 0.11 -0.17 0.04 0.03 ICECREAM - -0.13 -0.14 0.01 -0.1 CAKE -0.13 - 0.16 0.01 -0.16 JUICE -0.14 0.16 - -0.32* -0.01 WINE 0.01 0.01 -0.32* - -0.12 COFFEE -0.1 -0.16 -0.01 -0.12 - > Topm(tox.cor, level=0.4) # shipunov Var1 Var2 Value Magnitude 1 TOMATO CAESAR 0.6449020 high 2 CRISPS CRABDIP 0.6359727 high 3 CAESAR ILL 0.6039006 high 4 TOMATO ILL 0.4595725 medium 5 COFFEE RICE 0.4134925 medium > # \section{Analysis of regression} > > # \subsection{Single line} > > pdf(file="pics/23870.pdf"); oldpar <- par(mar=c(4, 4, 1, 1)) > women.lm <- lm(weight ~ height, data=women) > plot(weight ~ height, data=women, + xlab="Height, in", ylab="Weight, lb") > grid() > abline(women.lm, col="red") > Cladd(women.lm, data=women) # shipunov > legend("bottomright", col=2:1, lty=1:2, + legend=c("linear relationship", "95% confidence bands")) > par(oldpar); 
dev.off() pdf 2 > pdf(file="pics/56910.pdf"); oldpar <- par(mar=c(4, 4, 1, 1)) > plot(weight ~ height, data=women, pch=19, col="red") > abline(women.lm) > with(women, segments(height, fitted(women.lm), height, weight, + col="red")) > par(oldpar); dev.off() pdf 2 > summary(women.lm) Call: lm(formula = weight ~ height, data = women) Residuals: Min 1Q Median 3Q Max -1.7333 -1.1333 -0.3833 0.7417 3.1167 Coefficients: Estimate Std. Error t value Pr(>|t|) (Intercept) -87.51667 5.93694 -14.74 1.71e-09 *** height 3.45000 0.09114 37.85 1.09e-14 *** --- Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 Residual standard error: 1.525 on 13 degrees of freedom Multiple R-squared: 0.991, Adjusted R-squared: 0.9903 F-statistic: 1433 on 1 and 13 DF, p-value: 1.091e-14 > Mag(0.9903) # shipunov [1] "very high" > (atan(women.lm$coefficients[[2]]) * 180)/pi [1] 73.8355 > oldpar <- par(mfrow=c(3, 3)) > ## Uniform variation and no trend: > > for (i in 1:9) plot(1:50, rnorm(50), xlab="Fitted", + ylab="Residuals") > title("'Good' Residuals vs. Fitted", outer=TRUE, line=-2) > ## Non-uniform variation plus trend: > > for (i in 1:9) plot(1:50, ((1:50)*rnorm(50) + 50:1), + xlab="Fitted",ylab="Residuals") > title("'Bad' Residuals vs. Fitted", outer=TRUE, line=-2) > par(oldpar) > oldpar <- par(mfrow=c(3, 3)) > for (i in 1:9) { aa <- rnorm(50); + qqnorm(aa, main=""); qqline(aa) } > title("'Good' normality QQ plots", outer=TRUE, line=-2) > for (i in 1:9) { aa <- rnorm(50)^2; + qqnorm(aa, main=""); qqline(aa) } > title("'Bad' normality QQ plots", outer=TRUE, line=-2) > par(oldpar) > Normality(women.lm$residuals) # shipunov [1] "NORMAL" > pdf(file="pics/38550.pdf"); oldpar <- par(mar=c(4, 4, 2, 1)) > women.lm3 <- lm(weight ~ height + I(height^3), data=women) > summary(women.lm3) Call: lm(formula = weight ~ height + I(height^3), data = women) Residuals: Min 1Q Median 3Q Max -0.49318 -0.26391 -0.02908 0.28097 0.54213 Coefficients: Estimate Std. 
Error t value Pr(>|t|) (Intercept) 1.466e+02 1.589e+01 9.223 8.52e-07 *** height -1.981e+00 3.679e-01 -5.385 0.000164 *** I(height^3) 4.274e-04 2.890e-05 14.788 4.57e-09 *** --- Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 Residual standard error: 0.362 on 12 degrees of freedom Multiple R-squared: 0.9995, Adjusted R-squared: 0.9995 F-statistic: 1.282e+04 on 2 and 12 DF, p-value: < 2.2e-16 > plot(women.lm3, which=1) # just residuals vs. fitted > par(oldpar); dev.off() pdf 2 > eggs <- read.table("data/eggs.txt") > eggs.lm <- lm(V2 ~ V1, data=eggs) > plot(eggs.lm) > summary(eggs.lm) Call: lm(formula = V2 ~ V1, data = eggs) Residuals: Min 1Q Median 3Q Max -7.4934 -1.7508 -0.0615 1.8350 11.2049 Coefficients: Estimate Std. Error t value Pr(>|t|) (Intercept) 1.84827 0.49581 3.728 0.000213 *** V1 0.69823 0.01212 57.597 < 2e-16 *** --- Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 Residual standard error: 2.617 on 553 degrees of freedom Multiple R-squared: 0.8571, Adjusted R-squared: 0.8569 F-statistic: 3317 on 1 and 553 DF, p-value: < 2.2e-16 > confint(eggs.lm) 2.5 % 97.5 % (Intercept) 0.8743690 2.8221677 V1 0.6744162 0.7220408 > Mag(summary(eggs.lm)$adj.r.squared) [1] "very high" > ee <- read.table("http://ashipunov.me/data/exams.txt", h=TRUE) > str(ee) 'data.frame': 201 obs. of 3 variables: $ EXAM.N : int 3 3 3 3 3 3 3 3 3 3 ... $ ORDER : int 1 2 3 4 5 6 7 8 9 10 ... $ POINTS.50: int 42 23 30 32 27 19 37 30 36 28 ... > ee3 <- ee[ee$EXAM.N == 3,] > plot(POINTS.50 ~ ORDER, data=ee3) > ee3.lm <- lm(POINTS.50 ~ ORDER, data=ee3) > summary(ee3.lm) Call: lm(formula = POINTS.50 ~ ORDER, data = ee3) Residuals: Min 1Q Median 3Q Max -16.0118 -4.7561 0.4708 4.4344 13.4695 Coefficients: Estimate Std. Error t value Pr(>|t|) (Intercept) 33.35273 1.24634 26.761 <2e-16 *** ORDER -0.02005 0.02143 -0.936 0.352 --- Signif. 
codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 Residual standard error: 6.185 on 98 degrees of freedom Multiple R-squared: 0.008859, Adjusted R-squared: -0.001254 F-statistic: 0.876 on 1 and 98 DF, p-value: 0.3516 > plot(ee3.lm) > abline(ee3.lm) > Cladd(ee3.lm, data=ee3) > newcolor <- relevel(hwc$COLOR, "brown") > summary(lm(cbind(WEIGHT, HEIGHT) ~ newcolor, data=hwc)) Response WEIGHT : Call: lm(formula = WEIGHT ~ newcolor, data = hwc) Residuals: Min 1Q Median 3Q Max -9.2333 -2.0000 -0.2333 1.9417 7.0000 Coefficients: Estimate Std. Error t value Pr(>|t|) (Intercept) 75.0000 0.5469 137.129 < 2e-16 *** newcolorblack 4.2333 0.7735 5.473 4.2e-07 *** newcolorblond -0.7667 0.7735 -0.991 0.324 --- Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 Residual standard error: 2.996 on 87 degrees of freedom Multiple R-squared: 0.3579, Adjusted R-squared: 0.3431 F-statistic: 24.24 on 2 and 87 DF, p-value: 4.286e-09 Response HEIGHT : Call: lm(formula = HEIGHT ~ newcolor, data = hwc) Residuals: Min 1Q Median 3Q Max -6.300 -2.233 -0.050 1.750 9.133 Coefficients: Estimate Std. Error t value Pr(>|t|) (Intercept) 164.2333 0.5506 298.302 < 2e-16 *** newcolorblack 5.6333 0.7786 7.235 1.72e-10 *** newcolorblond -7.9333 0.7786 -10.189 < 2e-16 *** --- Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 Residual standard error: 3.016 on 87 degrees of freedom Multiple R-squared: 0.7789, Adjusted R-squared: 0.7738 F-statistic: 153.3 on 2 and 87 DF, p-value: < 2.2e-16 > # \subsection{Many lines} > > pdf(file="pics/32650.pdf"); oldpar <- par(mar=c(4, 4, 0, 1)) > ipo <- read.table("data/ipomopsis.txt", h=TRUE) > with(ipo, plot(Root, Fruit, pch=as.numeric(Grazing))) Warning in plot.xy(xy, type, ...) 
: NAs introduced by coercion > abline(lm(Fruit ~ Root, data=subset(ipo, Grazing=="Grazed"))) > abline(lm(Fruit ~ Root, data=subset(ipo, Grazing=="Ungrazed")), + lty=2) > legend("topleft", lty=1:2, legend=c("Grazed","Ungrazed")) > par(oldpar); dev.off() pdf 2 > ipo.lm <- lm(Fruit ~ Root * Grazing, data=ipo) > summary(ipo.lm) Call: lm(formula = Fruit ~ Root * Grazing, data = ipo) Residuals: Min 1Q Median 3Q Max -17.3177 -2.8320 0.1247 3.8511 17.1313 Coefficients: Estimate Std. Error t value Pr(>|t|) (Intercept) -125.173 12.811 -9.771 1.15e-11 *** Root 23.240 1.531 15.182 < 2e-16 *** GrazingUngrazed 30.806 16.842 1.829 0.0757 . Root:GrazingUngrazed 0.756 2.354 0.321 0.7500 --- Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 Residual standard error: 6.831 on 36 degrees of freedom Multiple R-squared: 0.9293, Adjusted R-squared: 0.9234 F-statistic: 157.6 on 3 and 36 DF, p-value: < 2.2e-16 > ipo.lm2 <- update(ipo.lm, . ~ . - Root:Grazing) > summary(ipo.lm2) Call: lm(formula = Fruit ~ Root + Grazing, data = ipo) Residuals: Min 1Q Median 3Q Max -17.1920 -2.8224 0.3223 3.9144 17.3290 Coefficients: Estimate Std. Error t value Pr(>|t|) (Intercept) -127.829 9.664 -13.23 1.35e-15 *** Root 23.560 1.149 20.51 < 2e-16 *** GrazingUngrazed 36.103 3.357 10.75 6.11e-13 *** --- Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 Residual standard error: 6.747 on 37 degrees of freedom Multiple R-squared: 0.9291, Adjusted R-squared: 0.9252 F-statistic: 242.3 on 2 and 37 DF, p-value: < 2.2e-16 > AIC(ipo.lm) [1] 273.0135 > AIC(ipo.lm2) [1] 271.1279 > AIC(women.lm) [1] 59.08158 > AIC(women.lm3) [1] 16.73861 > elections <- read.table("data/elections.txt", h=TRUE) > str(elections) 'data.frame': 153 obs. of 7 variables: $ DISTRICT: int 1 2 3 4 5 6 7 8 9 10 ... $ VOTER : int 329786 144483 709903 696114 717095 787593 696087 691688 730050 164275 ... $ INVALID : int 2623 859 5656 4392 3837 4715 10456 1819 2403 89 ... 
$ VALID : int 198354 97863 664195 619629 653133 655486 397797 632045 667994 161470 ... $ CAND.1 : int 24565 7884 30491 54999 36880 72166 43559 53046 60355 234 ... $ CAND.2 : int 11786 6364 11152 11519 10002 25204 28189 5504 2680 451 ... $ CAND.3 : int 142627 68573 599105 525814 559653 485669 267776 563502 599798 159496 ... > attach(elections) > PROP <- cbind(CAND.1, CAND.2, CAND.3) / VOTER > ATTEN <- (VALID + INVALID) / VOTER > pdf(file="pics/27760.pdf"); oldpar <- par(mar=c(4, 4, 0, 1)) > lm.1 <- lm(CAND.1/VOTER ~ ATTEN) > lm.2 <- lm(CAND.2/VOTER ~ ATTEN) > lm.3 <- lm(CAND.3/VOTER ~ ATTEN) > plot(CAND.3/VOTER ~ ATTEN, xlim=c(0, 1), ylim=c(0, 1), + xlab="Attendance", ylab="Percent voted for the candidate") > points(CAND.1/VOTER ~ ATTEN, pch=2) > points(CAND.2/VOTER ~ ATTEN, pch=3) > abline(lm.3) > abline(lm.2, lty=2) > abline(lm.1, lty=3) > legend("topleft", lty=c(3, 2, 1), + legend=c("Party 1", "Party 2", "Party 3")) > detach(elections) > par(oldpar); dev.off() pdf 2 > elections2 <- cbind(ATTEN, stack(data.frame(PROP))) > names(elections2) <- c("atten", "percn", "cand") > str(elections2) 'data.frame': 459 obs. of 3 variables: $ atten: num 0.609 0.683 0.944 0.896 0.916 ... $ percn: num 0.0745 0.0546 0.043 0.079 0.0514 ... $ cand : Factor w/ 3 levels "CAND.1","CAND.2",..: 1 1 1 1 1 1 1 1 1 1 ... > ancova.v <- lm(percn ~ atten * cand, data=elections2) > summary(ancova.v) Call: lm(formula = percn ~ atten * cand, data = elections2) Residuals: Min 1Q Median 3Q Max -0.116483 -0.015470 -0.000699 0.014825 0.102810 Coefficients: Estimate Std. Error t value Pr(>|t|) (Intercept) 0.117115 0.011973 9.781 < 2e-16 *** atten -0.070726 0.018266 -3.872 0.000124 *** candCAND.2 -0.023627 0.016933 -1.395 0.163591 candCAND.3 -0.547179 0.016933 -32.315 < 2e-16 *** atten:candCAND.2 0.004129 0.025832 0.160 0.873074 atten:candCAND.3 1.393336 0.025832 53.938 < 2e-16 *** --- Signif. 
codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 Residual standard error: 0.02591 on 453 degrees of freedom Multiple R-squared: 0.9824, Adjusted R-squared: 0.9822 F-statistic: 5057 on 5 and 453 DF, p-value: < 2.2e-16 > # \subsection{More than one way, again} > > ToothGrowth.1 <- with(ToothGrowth, + data.frame(len, supp, fdose=factor(dose))) > str(ToothGrowth.1) 'data.frame': 60 obs. of 3 variables: $ len : num 4.2 11.5 7.3 5.8 6.4 10 11.2 11.2 5.2 7 ... $ supp : Factor w/ 2 levels "OJ","VC": 2 2 2 2 2 2 2 2 2 2 ... $ fdose: Factor w/ 3 levels "0.5","1","2": 1 1 1 1 1 1 1 1 1 1 ... > Normality(ToothGrowth$len) [1] "NORMAL" > with(ToothGrowth, fligner.test(split(len, list(dose, supp)))) Fligner-Killeen test of homogeneity of variances data: split(len, list(dose, supp)) Fligner-Killeen:med chi-squared = 7.7488, df = 5, p-value = 0.1706 > summary(aov(len ~ supp * fdose, data=ToothGrowth.1)) Df Sum Sq Mean Sq F value Pr(>F) supp 1 205.4 205.4 15.572 0.000231 *** fdose 2 2426.4 1213.2 92.000 < 2e-16 *** supp:fdose 2 108.3 54.2 4.107 0.021860 * Residuals 54 712.1 13.2 --- Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 > AIC(aov(len ~ supp * fdose, data=ToothGrowth.1)) [1] 332.7056 > summary(aov(len ~ supp + fdose, data=ToothGrowth.1)) Df Sum Sq Mean Sq F value Pr(>F) supp 1 205.4 205.4 14.02 0.000429 *** fdose 2 2426.4 1213.2 82.81 < 2e-16 *** Residuals 56 820.4 14.7 --- Signif.
codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 > AIC(aov(len ~ supp + fdose, data=ToothGrowth.1)) [1] 337.2013 > Eta2(aov(len ~ supp * fdose, data=ToothGrowth.1)) [1] 0.7937246 > TukeyHSD(aov(len ~ supp * fdose, data=ToothGrowth.1)) Tukey multiple comparisons of means 95% family-wise confidence level Fit: aov(formula = len ~ supp * fdose, data = ToothGrowth.1) $supp diff lwr upr p adj VC-OJ -3.7 -5.579828 -1.820172 0.0002312 $fdose diff lwr upr p adj 1-0.5 9.130 6.362488 11.897512 0.0e+00 2-0.5 15.495 12.727488 18.262512 0.0e+00 2-1 6.365 3.597488 9.132512 2.7e-06 $`supp:fdose` diff lwr upr p adj VC:0.5-OJ:0.5 -5.25 -10.048124 -0.4518762 0.0242521 OJ:1-OJ:0.5 9.47 4.671876 14.2681238 0.0000046 VC:1-OJ:0.5 3.54 -1.258124 8.3381238 0.2640208 OJ:2-OJ:0.5 12.83 8.031876 17.6281238 0.0000000 VC:2-OJ:0.5 12.91 8.111876 17.7081238 0.0000000 OJ:1-VC:0.5 14.72 9.921876 19.5181238 0.0000000 VC:1-VC:0.5 8.79 3.991876 13.5881238 0.0000210 OJ:2-VC:0.5 18.08 13.281876 22.8781238 0.0000000 VC:2-VC:0.5 18.16 13.361876 22.9581238 0.0000000 VC:1-OJ:1 -5.93 -10.728124 -1.1318762 0.0073930 OJ:2-OJ:1 3.36 -1.438124 8.1581238 0.3187361 VC:2-OJ:1 3.44 -1.358124 8.2381238 0.2936430 OJ:2-VC:1 9.29 4.491876 14.0881238 0.0000069 VC:2-VC:1 9.37 4.571876 14.1681238 0.0000058 VC:2-OJ:2 0.08 -4.718124 4.8781238 1.0000000 > pdf(file="pics/64650.pdf"); oldpar <- par(mar=c(4, 7, 3, 1)) > plot(TukeyHSD(aov(len ~ supp * fdose, data=ToothGrowth.1)), las=1) > par(oldpar); dev.off() pdf 2 > # \section{Probability of the success: logistic regression} > > l <- read.table("data/logit.txt") > l$V2 <- as.factor(l$V2) > head(l) V1 V2 1 14 F 2 29 F 3 6 F 4 25 S 5 18 S 6 4 F > pdf(file="pics/40380.pdf"); oldpar <- par(mar=c(4, 4, 1, 2)) > cdplot(V2 ~ V1, data=l, + xlab="Experience, months", ylab="Success") > par(oldpar); dev.off() pdf 2 > l.logit <- glm(V2 ~ V1, family=binomial, data=l) > summary(l.logit) Call: glm(formula = V2 ~ V1, family = binomial, data = l) Deviance Residuals: Min 1Q Median 3Q Max 
-1.9987 -0.4584 -0.2245 0.4837 1.5005 Coefficients: Estimate Std. Error z value Pr(>|z|) (Intercept) -4.9638 2.4597 -2.018 0.0436 * V1 0.2350 0.1163 2.021 0.0432 * --- Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 (Dispersion parameter for binomial family taken to be 1) Null deviance: 18.249 on 13 degrees of freedom Residual deviance: 10.301 on 12 degrees of freedom AIC: 14.301 Number of Fisher Scoring iterations: 5 > tox.logit <- glm(formula=I(2-ILL) ~ CAESAR + TOMATO, + family=binomial, data=tox) > tox.logit2 <- update(tox.logit, . ~ . - TOMATO) > tox.logit3 <- update(tox.logit, . ~ . - CAESAR) > tox.logit$aic [1] 47.40782 > tox.logit2$aic [1] 45.94004 > tox.logit3$aic [1] 53.11957 > # \section{Answers to exercises} > > # \subsection{Correlation and linear models} > > traits <- read.table("data/traits.txt", h=TRUE, row.names=1) > Str(traits) 'data.frame': 21 obs. of 9 variables: 1 TONGUE : int 0 0 0 0 0 0 1 0 1 1 ... 2 EARLOBE: int 0 0 0 1 0 0 0 1 1 1 ... 3 PINKY : int 0 1 1 1 1 0 1 0 1 0 ... 4 ARM : int 0 1 0 1 1 0 0 1 1 0 ... 5 CHEEK : int 1 0 0 0 1 0 0 1 0 0 ... 6 CHIN : int 1 1 1 1 1 1 1 1 0 0 ... 7 THUMB : int 0 0 0 1 0 0 1 1 0 1 ... 8 TOE : int 0 1 0 0 1 1 1 0 0 0 ... 9 PEAK : int 0 1 1 1 0 0 1 1 0 0 ... row.names [1:21] "A" "B" "C" "D" "E" ... > Cor(traits, method="kendall", dec=1) # shipunov Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) 
: Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) 
: Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties TONGUE EARLOBE PINKY ARM CHEEK CHIN THUMB TOE PEAK TONGUE - 0.1 -0.2 -0.1 -0.1 -0.6* 0.5* 0 -0.2 EARLOBE 0.1 - -0.2 0.1 -0.1 -0.4 0.3 -0.3 0 PINKY -0.2 -0.2 - 0.2 0.1 0 -0.1 0 0.2 ARM -0.1 0.1 0.2 - 0.2 0 -0.1 0.1 0 CHEEK -0.1 -0.1 0.1 0.2 - -0.1 0 -0.1 -0.1 CHIN -0.6* -0.4 0 0 -0.1 - -0.5* 0 0.4 THUMB 0.5* 0.3 -0.1 -0.1 0 -0.5* - 0.1 0.1 TOE 0 -0.3 0 0.1 -0.1 0 0.1 - 0 PEAK -0.2 0 0.2 0 -0.1 0.4 0.1 0 - > traits.c <- cor(traits, method="kendall") > Topm(traits.c) # shipunov Var1 Var2 Value Magnitude 1 CHIN TONGUE -0.6264145 high 2 THUMB TONGUE 0.5393599 high 3 THUMB CHIN -0.4853627 medium > pdf(file="pics/cover_to_crop.pdf") > Pleiad(traits.c, corr=TRUE, lcol=1, legend=FALSE, off=1.12, + pch=21, bg="white", cex=1.1) # shipunov > dev.off() pdf 2 > pdf(file="pics/26450.pdf"); oldpar <- par(mar=c(4, 4, 0, 1)) > cor.test(hwc$WEIGHT, hwc$HEIGHT) Pearson's product-moment correlation data: hwc$WEIGHT and hwc$HEIGHT t = 5.0682, df = 88, p-value = 2.199e-06 alternative hypothesis: true correlation is not equal to 0 95 percent confidence interval: 0.2975308 0.6212688 sample estimates: cor 0.4753337 > w.h <- lm(WEIGHT ~ HEIGHT, data=hwc) > summary(w.h) Call: lm(formula = WEIGHT ~ HEIGHT, data = hwc) Residuals: Min 1Q Median 3Q Max -7.966 -2.430 0.305 2.344 5.480 Coefficients: Estimate Std. 
Error t value Pr(>|t|) (Intercept) 30.86387 8.94310 3.451 0.00086 *** HEIGHT 0.27707 0.05467 5.068 2.2e-06 *** --- Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 Residual standard error: 3.27 on 88 degrees of freedom Multiple R-squared: 0.2259, Adjusted R-squared: 0.2171 F-statistic: 25.69 on 1 and 88 DF, p-value: 2.199e-06 > plot(WEIGHT ~ HEIGHT, data=hwc, xlab="Height, cm", + ylab="Weight, kg") > abline(w.h) > Cladd(w.h, data=hwc) # shipunov > par(oldpar); dev.off() pdf 2 > ee <- read.table( + "http://ashipunov.me/shipunov/open/erophila.txt", h=TRUE) > Str(ee) # shipunov 'data.frame': 111 obs. of 5 variables: 1 LOC : int 1 1 1 1 1 1 1 1 1 1 ... 2 FRUIT.L : num 4.8 5.1 4.9 4.7 4.7 5 4.7 4.8 5.5 4.5 ... 3 FRUIT.W : num 1.8 2 2.3 2.1 2.4 2.8 2.8 2.8 2.8 1.8 ... 4 FRUIT.MAXW: num 3.5 3 3 2.5 2.8 3.3 1.5 2 3 2.5 ... 5 FRUIT.A : int 2 2 2 2 2 1 0 0 2 2 ... > sapply(ee[, 2:4], Normality) # shipunov FRUIT.L FRUIT.W FRUIT.MAXW "NORMAL" "NORMAL" "NOT NORMAL" > Topm(cor(ee[, 2:4], method="spearman")) # shipunov Var1 Var2 Value Magnitude 1 FRUIT.MAXW FRUIT.L 0.7109781 very high 2 FRUIT.W FRUIT.L 0.4642429 medium > with(ee, cor.test(FRUIT.L, FRUIT.MAXW, method="spearman")) Warning in cor.test.default(FRUIT.L, FRUIT.MAXW, method = "spearman") : Cannot compute exact p-value with ties Spearman's rank correlation rho data: FRUIT.L and FRUIT.MAXW S = 65874, p-value < 2.2e-16 alternative hypothesis: true rho is not equal to 0 sample estimates: rho 0.7109781 > pdf(file="pics/52690.pdf"); oldpar <- par(mar=c(4, 4, 0, 1)) > ee.lm <- lm(FRUIT.MAXW ~ FRUIT.L, data=ee) > plot(FRUIT.MAXW ~ FRUIT.L, data=ee, type="n") > Points(ee$FRUIT.L, ee$FRUIT.MAXW, scale=.5) # shipunov > Cladd(ee.lm, ee, ab.lty=1) # shipunov > par(oldpar); dev.off() pdf 2 > summary(ee.lm) Call: lm(formula = FRUIT.MAXW ~ FRUIT.L, data = ee) Residuals: Min 1Q Median 3Q Max -1.17383 -0.23371 -0.03309 0.27678 1.08666 Coefficients: Estimate Std. 
Error t value Pr(>|t|) (Intercept) -0.14037 0.30475 -0.461 0.646 FRUIT.L 0.59877 0.06091 9.830 <2e-16 *** --- Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 Residual standard error: 0.4196 on 109 degrees of freedom Multiple R-squared: 0.4699, Adjusted R-squared: 0.4651 F-statistic: 96.64 on 1 and 109 DF, p-value: < 2.2e-16 > plot(ee.lm, which=1) > Normality(ee.lm$residuals) [1] "NORMAL" > he <- read.table( + "http://ashipunov.me/shipunov/open/heterostyly.txt", h=TRUE) > he$SPECIES <- as.factor(he$SPECIES) > str(he) 'data.frame': 993 obs. of 6 variables: $ LOC : int 1 1 1 1 1 1 1 1 1 1 ... $ STYLE.L : num 7.5 14 8 6 10.5 16 7 7 8 13 ... $ STAMEN.L: num 9 10 17 13 10.5 10 17 12 16 10.5 ... $ TUBE.L : num 15 20 19.5 19 16 17 19 21 20 17.5 ... $ COLOR : chr "yl" "ww" "ww" "yl" ... $ SPECIES : Factor w/ 2 levels "veris","vulgaris": 2 2 2 2 2 2 2 2 2 2 ... > boxplot((STYLE.L-STAMEN.L) ~ (STYLE.L-STAMEN.L)>0, + names=c("short","long"), data=he) > pdf(file="pics/48090.pdf"); oldpar <- par(mar=c(4, 4, 0, 1)) > plot(STYLE.L ~ STAMEN.L, data=he, type="n", + xlab="Length of stamens, mm", ylab="Length of style, mm") > PPoints(he$SPECIES, he$STAMEN.L, he$STYLE.L, scale=.9, cols=1) > abline(lm(STYLE.L ~ STAMEN.L, data=he[he$SPECIES=="veris", ])) > abline(lm(STYLE.L ~ STAMEN.L, data=he[he$SPECIES=="vulgaris", ]), + lty=2) > legend("topright", pch=1:2, lty=1:2, legend=c("Primula veris", + "P. vulgaris"), text.font=3) > par(oldpar); dev.off() pdf 2 > summary(he.lm1 <- lm(STYLE.L ~ STAMEN.L * SPECIES, data=he)) Call: lm(formula = STYLE.L ~ STAMEN.L * SPECIES, data = he) Residuals: Min 1Q Median 3Q Max -10.2707 -1.7731 -0.2508 1.7541 10.2368 Coefficients: Estimate Std. Error t value Pr(>|t|) (Intercept) 19.13735 1.17900 16.232 < 2e-16 *** STAMEN.L -0.74519 0.09137 -8.156 1.05e-15 *** SPECIESvulgaris -1.84688 1.23060 -1.501 0.1337 STAMEN.L:SPECIESvulgaris 0.24272 0.09509 2.552 0.0108 * --- Signif. 
codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 Residual standard error: 2.921 on 989 degrees of freedom Multiple R-squared: 0.3077, Adjusted R-squared: 0.3056 F-statistic: 146.5 on 3 and 989 DF, p-value: < 2.2e-16 > plot(he.lm1, which=1:2) > AIC(he.lm1) [1] 4953.172 > summary(he.lm2 <- update(he.lm1, . ~ . - STAMEN.L:SPECIES)) Call: lm(formula = STYLE.L ~ STAMEN.L + SPECIES, data = he) Residuals: Min 1Q Median 3Q Max -10.3613 -2.0086 -0.1926 1.8496 10.2020 Coefficients: Estimate Std. Error t value Pr(>|t|) (Intercept) 16.34607 0.44188 36.992 < 2e-16 *** STAMEN.L -0.52109 0.02538 -20.536 < 2e-16 *** SPECIESvulgaris 1.18400 0.32400 3.654 0.000271 *** --- Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 Residual standard error: 2.93 on 990 degrees of freedom Multiple R-squared: 0.3031, Adjusted R-squared: 0.3017 F-statistic: 215.3 on 2 and 990 DF, p-value: < 2.2e-16 > AIC(he.lm2) [1] 4957.692 > with(he, interaction.plot(cut(STAMEN.L, quantile(STAMEN.L)), + SPECIES, STYLE.L)) > Str(drosera) # shipunov 'data.frame': 1165 obs. of 11 variables: 1 POP : Factor w/ 19 levels "A","B","C","D",..: 1 1 1 1 1 1 1 1 1 1 ... 2 YOUNG.L * int 0 0 0 1 0 1 0 1 0 1 ... 3 MATURE.L* int 3 2 5 5 3 5 4 4 3 3 ... 4 OLD.L * int 3 2 5 2 1 2 1 1 3 2 ... 5 INSECTS * int 3 2 10 5 0 5 7 3 1 1 ... 6 INFL.L * int 0 0 128 0 0 0 0 0 0 0 ... 7 STALK.L * int 0 0 100 0 0 0 0 0 0 0 ... 8 N.FLOW * int 0 0 3 0 0 0 0 0 0 0 ... 9 LEAF.L * num 4 4 6 5 3 5 6 4 3 4 ... 10 LEAF.W * num 4 4 6 6 4 6 7 5 4 5 ... 11 PET.L * int 10 7 16 15 4 5 13 14 11 10 ... 
> sapply(drosera[, -1], Normality) YOUNG.L MATURE.L OLD.L INSECTS INFL.L STALK.L "NOT NORMAL" "NOT NORMAL" "NOT NORMAL" "NOT NORMAL" "NOT NORMAL" "NOT NORMAL" N.FLOW LEAF.L LEAF.W PET.L "NOT NORMAL" "NOT NORMAL" "NOT NORMAL" "NOT NORMAL" > pdf(file="pics/51270.pdf") > dr.cor <- cor(drosera[, -1], method="spearman", use="pairwise") > Topm(dr.cor) # shipunov Var1 Var2 Value Magnitude 1 STALK.L INFL.L 0.9901613 very high 2 N.FLOW STALK.L 0.9774198 very high 3 N.FLOW INFL.L 0.9593589 very high 4 LEAF.W LEAF.L 0.8251841 very high 5 PET.L LEAF.W 0.7129433 very high 6 PET.L LEAF.L 0.6972402 high 7 PET.L INFL.L 0.4795218 medium 8 PET.L STALK.L 0.4661593 medium 9 INSECTS MATURE.L 0.4644699 medium > Pleiad(dr.cor, corr=TRUE, legtext=2, legpos="bottom", + leghoriz=TRUE, pch=19, cex=1.2) # shipunov > dev.off() pdf 2 > summary(dr.lm <- lm(INFL.L ~ STALK.L, data=drosera)) Call: lm(formula = INFL.L ~ STALK.L, data = drosera) Residuals: Min 1Q Median 3Q Max -100.015 0.046 0.046 0.046 74.026 Coefficients: Estimate Std. Error t value Pr(>|t|) (Intercept) -0.046294 0.212065 -0.218 0.827 STALK.L 1.139452 0.004637 245.719 <2e-16 *** --- Signif. 
codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 Residual standard error: 6.285 on 1158 degrees of freedom (5 observations deleted due to missingness) Multiple R-squared: 0.9812, Adjusted R-squared: 0.9812 F-statistic: 6.038e+04 on 1 and 1158 DF, p-value: < 2.2e-16 > plot(dr.lm, which=1:2) > (largest3 <- rev(sort(table(drosera[, 1])))[1:3]) Q1 L N1 211 201 144 > dr3 <- drosera[drosera$POP %in% names(largest3), ] > dr3$POP <- droplevels(dr3$POP) > boxplot(LEAF.L ~ POP, data=dr3) > tapply(dr3$LEAF.L, dr3$POP, mad, na.rm=TRUE) L N1 Q1 1.4826 1.4826 1.4826 > fligner.test(LEAF.L ~ POP, data=dr3) Fligner-Killeen test of homogeneity of variances data: LEAF.L by POP Fligner-Killeen:med chi-squared = 8.1408, df = 2, p-value = 0.01707 > kruskal.test(LEAF.L ~ POP, data=dr3) Kruskal-Wallis rank sum test data: LEAF.L by POP Kruskal-Wallis chi-squared = 97.356, df = 2, p-value < 2.2e-16 > pairwise.wilcox.test(dr3$LEAF.L, dr3$POP) Pairwise comparisons using Wilcoxon rank sum test with continuity correction data: dr3$LEAF.L and dr3$POP L N1 N1 5.2e-16 - Q1 0.74 < 2e-16 P value adjustment method: holm > # \subsection{Logistic regression} > > seeing <- read.table("data/seeing.txt") > attach(seeing) > seeing.logit <- glm(V3 ~ V2, family=binomial, data=seeing) > summary(seeing.logit) Call: glm(formula = V3 ~ V2, family = binomial, data = seeing) Deviance Residuals: Min 1Q Median 3Q Max -2.4029 -0.8701 0.4299 0.7825 1.5197 Coefficients: Estimate Std. Error z value Pr(>|z|) (Intercept) -1.6776 0.7923 -2.117 0.03423 * V2 0.9015 0.2922 3.085 0.00203 ** --- Signif. 
codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 (Dispersion parameter for binomial family taken to be 1) Null deviance: 62.687 on 49 degrees of freedom Residual deviance: 49.738 on 48 degrees of freedom AIC: 53.738 Number of Fisher Scoring iterations: 4 > pdf(file="pics/32450.pdf"); oldpar <- par(mar=c(2, 2, 0, 1)) > tries <- seq(1, 5, length=50) # exactly 50 numbers from 1 to 5 > seeing.p <- predict(seeing.logit, list(V2=tries), + type="response") > plot(V3 ~ jitter(V2, amount=.1), data=seeing, xlab="", ylab="") > lines(tries, seeing.p) > par(oldpar); dev.off() pdf 2 > jj <- read.table( + "http://ashipunov.me/shipunov/open/juniperus.txt", h=TRUE) > jj$LOC <- factor(jj$LOC, labels=paste0("loc", levels(jj$LOC))) > Str(jj) # shipunov 'data.frame': 61 obs. of 7 variables: 1 LOC : Factor w/ 3 levels "loc1","loc2",..: 1 1 1 1 1 1 1 1 1 1 ... 2 HEIGHT : num 90 55 20 80 80 65 25 40 55 40 ... 3 WIDTH : num 40 25 45 100 135 35 55 25 45 55 ... 4 NEEDLE.L: int 8 8 5 6 10 6 6 9 5 5 ... 5 PROTR : num 1 1 1.5 1 0 1 0 0.5 1 1 ... 6 STEM.D * num 2.4 2.3 3.5 6 2.6 4.5 3.2 0.5 NA 1.7 ... 7 PINE.N : int 1 0 2 2 0 2 3 2 0 3 ... 
> pdf(file="pics/57390.pdf"); oldpar <- par(mar=c(4, 0, 0, 1)) > j.cols <- colorRampPalette(c("steelblue", "white"))(5)[2:4] > Boxplots(jj[, 2:7], jj$LOC, legpos="top", + boxcols=j.cols) # shipunov > par(oldpar); dev.off() pdf 2 > pdf(file="pics/57560.pdf"); oldpar <- par(mar=c(4, 4, 1, 2)) > spineplot(LOC ~ NEEDLE.L, data=jj, col=j.cols) > par(oldpar); dev.off() pdf 2 > Normality(jj$NEEDLE.L) [1] "NORMAL" > tapply(jj$NEEDLE.L, jj$LOC, var) loc1 loc2 loc3 2.461905 3.607895 2.407895 > bartlett.test(NEEDLE.L ~ LOC, data=jj) Bartlett test of homogeneity of variances data: NEEDLE.L by LOC Bartlett's K-squared = 1.0055, df = 2, p-value = 0.6049 > oneway.test(NEEDLE.L ~ LOC, data=jj) One-way analysis of means (not assuming equal variances) data: NEEDLE.L and LOC F = 14.129, num df = 2.000, denom df = 38.232, p-value = 2.546e-05 > (eta.squared <- + summary(lm(NEEDLE.L ~ LOC, data=jj))$adj.r.squared) [1] 0.3337755 > pairwise.t.test(jj$NEEDLE.L, jj$LOC) Pairwise comparisons using t tests with pooled SD data: jj$NEEDLE.L and jj$LOC loc1 loc2 loc2 0.00031 - loc3 0.14564 3.1e-06 P value adjustment method: holm > is.sibirica <- with(jj, (NEEDLE.L < 8 & HEIGHT < 100)) > sibirica <- factor(is.sibirica, labels=c("communis", "sibirica")) > summary(sibirica) communis sibirica 24 37 > pdf(file="pics/58290.pdf"); oldpar <- par(mar=c(4, 4, 1, 2)) > cdplot(sibirica ~ PINE.N, data=jj, col=j.cols[c(1, 3)]) > summary(glm(sibirica ~ PINE.N, data=jj, family=binomial)) Call: glm(formula = sibirica ~ PINE.N, family = binomial, data = jj) Deviance Residuals: Min 1Q Median 3Q Max -1.8549 -1.0482 0.7397 1.0042 1.3123 Coefficients: Estimate Std. Error z value Pr(>|z|) (Intercept) -0.3117 0.4352 -0.716 0.4738 PINE.N 0.3670 0.1776 2.066 0.0388 * --- Signif. 
codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 (Dispersion parameter for binomial family taken to be 1) Null deviance: 81.772 on 60 degrees of freedom Residual deviance: 77.008 on 59 degrees of freedom AIC: 81.008 Number of Fisher Scoring iterations: 4 > par(oldpar); dev.off() pdf 2 > # \section{How to choose the right method} > > # \chapter{Draw} > > # \section{Pictographs} > > pdf(file="pics/27400.pdf"); oldpar <- par(mar=c(0, 0, 0, 0)) > eq8 <- read.table("data/eq8.txt", h=TRUE) > Str(eq8) # shipunov 'data.frame': 832 obs. of 9 variables: 1 SPECIES * chr "arvense" "arvense" "arvense" "arvense" ... 2 DL.R : num 424 339 321 509 462 350 405 615 507 330 ... 3 DIA.ST * num 2.3 2 2.5 3 2.5 1.8 2 2.2 2 1.8 ... 4 N.REB : int 13 11 15 14 12 9 14 11 10 8 ... 5 N.ZUB : int 12 12 14 14 13 9 15 11 10 8 ... 6 DL.OSN.Z* num 2 1 2 1.5 1.1 1.1 1 1 1 1 ... 7 DL.TR.V * num 5 4 5 5 4 4 4 4 5 5 ... 8 DL.BAZ : num 3 2.5 2.3 2.2 2.1 2 2 2 1.9 1 ... 9 DL.PER : num 25 13 13 23 12 15 13 14 10 9 ... > eq8m <- Aggregate1(eq8[, 2:9], eq8[, 1], + median, na.rm=TRUE) # shipunov > stars(eq8m, cex=1.2, lwd=1.2, col.stars=rep("darkseagreen", 8)) > par(oldpar); dev.off() pdf 2 > pdf(file="pics/27580_to_crop.pdf") > library(TeachingDemos) > faces(eq8m) > dev.off() pdf 2 > pdf(file="pics/06645.pdf"); oldpar <- par(mar=c(0, 4, 0, 0)) > image(scale(iris[, -5]), axes=FALSE) > axis(2, at=seq(0, 1, length.out=4), + labels=abbreviate(colnames(iris[, -5])), las=2) > par(oldpar); dev.off() pdf 2 > pdf(file="pics/34390.pdf"); oldpar <- par(mar=c(2, 2, 0, 1)) > library(MASS) > parcoord(iris[, -5], col=as.numeric(iris[, 5]), lwd=2) > legend("top", bty="n", lty=1, lwd=2, col=1:3, + legend=names(table(iris[, 5]))) > par(oldpar); dev.off() pdf 2 > # \section{Grouped plots} > > Boxplots(iris[, 1:4], iris[, 5], srt=0, adj=c(.5, 1), + legpos="topright") # shipunov > Linechart(iris[, 1:4], iris[, 5], mad=TRUE) # shipunov > matplot(iris[, 1:2], iris[, 3:4], pch=1, + xlab="Sepals", ylab="Petals") > 
legend("topleft", pch=1, col=1:2, legend=c("Length", "Width")) > pdf(file="pics/27180.pdf") > pairs(iris[, 1:4], pch=19, col=as.numeric(iris[, 5]), + oma=c(2, 2, 4, 2)) > oldpar <- par(xpd=TRUE) > legend(0, 1.06, horiz=TRUE, legend=levels(iris[, 5]), + pch=19, col=1:3, bty="n") > par(oldpar) > dev.off() pdf 2 > pdf(file="pics/27000_to_crop.pdf") > betula <- read.table( + "http://ashipunov.me/shipunov/open/betula.txt", h=TRUE) > library(lattice) > d.tmp <- do.call(make.groups, betula[, c(2:4, 7:8)]) > d.tmp$LOC <- betula$LOC > bwplot(data ~ factor(LOC) | which, data=d.tmp, ylab="") > dev.off() pdf 2 > pdf(file="pics/61060_to_crop.pdf"); oldpar <- par(mar=c(0, 0, 0, 0)) > eq.s <- stack(as.data.frame(scale(eq8m))) > eq.s$SPECIES <- row.names(eq8m) > dotplot(SPECIES ~ values | ind, eq.s, xlab="") > par(oldpar); dev.off() pdf 2 > pdf(file="pics/35270.pdf"); oldpar <- par(mar=c(0, 0, 0, 0)) > coplot(percn ~ atten | cand, data=elections2, col="red", + bg="pink", pch=21, bar.bg=c(fac="lightblue")) > par(oldpar); dev.off() pdf 2 > # \section{3D plots} > > pdf(file="pics/55170_to_crop.pdf") > library(vcd) > ternaryplot(scale(iris[, 2:4], center=FALSE), grid_color="black", + cex=0.3, col=iris[, 5], main="") > grid_legend(0.8, 0.7, pch=19, size=.5, col=1:3, levels(iris[, 5])) > dev.off() pdf 2 > pdf(file="pics/54020_to_crop.pdf") > library(scatterplot3d) > i3d <- scatterplot3d(iris[, 2:4], color=as.numeric(iris[, 5]), + type="h", pch=14 + as.numeric(iris[, 5]), xlab="Sepal width", + ylab="", zlab="Petal width") > dims <- par("usr") > x <- dims[1]+ 0.82*diff(dims[1:2]) > y <- dims[3]+ 0.1*diff(dims[3:4]) > text(x, y, "Petal length", srt=40) > legend(i3d$xyz.convert(3.8, 6.5, 1.5), col=1:3, pch=(14 + 1:3), + legend=levels(iris[, 5]), bg="white") > dev.off() pdf 2 > pdf(file="pics/54440_to_crop.pdf") > library(lattice) > p <- cloud(Sepal.Length ~ Petal.Length * Petal.Width, data=iris, + groups=Species, par.settings=list(clip=list(panel="off")), + auto.key=list(space="top", 
columns=3, points=TRUE)) > update(p[rep(1, 4)], layout=c(2, 2), function(..., screen) + panel.cloud(..., screen=list(z=c(-70, 110)[current.column()], + x=-70, y=c(140, 0)[current.row()]))) > dev.off() pdf 2 > # \chapter{Discover} > > # \section{Discovery with primary data} > > # \subsection{Shadows of hyper clouds: PCA} > > pdf(file="pics/90530.pdf"); oldpar <- par(mar=c(2, 2, 0, 1)) > library(png) # package to read PNG images as data > aa <- readPNG("data/pear2d_bw.png") > bb <- which(aa == 0, arr.ind=TRUE) # pixels to coordinates > ## plot together original (green) and PCA-rotated (gray): > > bbs <- scale(bb) > pps <- scale(prcomp(bb)$x[, 1:2]) # only two PCs > xx <- range(c(bbs[, 1], pps[, 1])) > yy <- range(c(bbs[, 2], pps[, 2])) > plot(pps, pch=".", col=adjustcolor("black", alpha=0.5), + xlim=xx, ylim=yy) > points(bbs, pch=".", col=adjustcolor("green", alpha=0.5)) > legend("bottomright", fill=adjustcolor(c("green", "black"), + alpha=0.5), legend=c("Original", "PCA-rotated"), + bty="n", border=0) > par(oldpar); dev.off() pdf 2 > ca <- read.table( + "http://ashipunov.me/shipunov/open/carex.txt", h=TRUE) > Str(ca) 'data.frame': 62 obs. of 5 variables: 1 LOC : int 1 1 1 1 1 1 1 1 1 1 ... 2 HEIGHT : int 157 103 64 77 21 27 19 35 43 92 ... 3 LEAF.W : num 2.5 2.5 2 2 1.5 1.5 2 1.5 2 2 ... 4 SPIKE.L: num 9.5 9 7.5 7 4 5 3.5 6 6 6.5 ... 5 SPIKE.W: num 6.5 6.5 6 5 4 4 3.5 5 5 5.5 ... 
> ca.pca <- princomp(scale(ca[, -1])) > pdf(file="pics/27890.pdf"); oldpar <- par(mar=c(2, 2, 0, 1)) > plot(ca.pca, main="") > par(oldpar); dev.off() pdf 2 > summary(ca.pca) Importance of components: Comp.1 Comp.2 Comp.3 Comp.4 Standard deviation 1.6448264 0.7759300 0.59318563 0.52544578 Proportion of Variance 0.6874514 0.1529843 0.08940939 0.07015485 Cumulative Proportion 0.6874514 0.8404358 0.92984515 1.00000000 > pdf(file="pics/28180.pdf"); oldpar <- par(mar=c(4, 4, 0, 1)) > ca.p <- ca.pca$scores[, 1:2] > plot(ca.p, type="n", xlab="PC1", ylab="PC2") > text(ca.p, labels=ca[, 1], col=ca[, 1]) > Hulls(ca.p, ca[, 1]) # shipunov > par(oldpar); dev.off() pdf 2 > ca.h <- Hulls(ca.p, ca[, 1], plot=FALSE) # shipunov > ca.o <- Overlap(ca.h) # shipunov > summary(ca.o) Overlaps for each hull, %: mean.overlap total.overlap 1 30.43 60.86 2 24.89 49.79 3 38.08 76.17 4 0.00 0.00 Mean of all overlaps 31.14 % > oldpar <- par() > pdf(file="pics/28360.pdf"); oldpar <- par(mar=c(4, 4, 2, 2)) > biplot(ca.pca, xlabs=ca[, 1]) > par(oldpar); dev.off() pdf 2 > par(oldpar) > loadings(ca.pca) Loadings: Comp.1 Comp.2 Comp.3 Comp.4 HEIGHT 0.518 0.845 0.125 LEAF.W 0.468 0.721 -0.263 -0.437 SPIKE.L 0.534 -0.432 0.724 SPIKE.W 0.476 -0.688 -0.178 -0.518 Comp.1 Comp.2 Comp.3 Comp.4 SS loadings 1.00 1.00 1.00 1.00 Proportion Var 0.25 0.25 0.25 0.25 Cumulative Var 0.25 0.50 0.75 1.00 > iris.pca <- prcomp(iris[, -5]) > iris.pca$rotation PC1 PC2 PC3 PC4 Sepal.Length 0.36138659 -0.65658877 0.58202985 0.3154872 Sepal.Width -0.08452251 -0.73016143 -0.59791083 -0.3197231 Petal.Length 0.85667061 0.17337266 -0.07623608 -0.4798390 Petal.Width 0.35828920 0.07548102 -0.54583143 0.7536574 > iris.p <- iris.pca$x[, 1:2] > plot(iris.p, type="n", xlab="PC1", ylab="PC2") > text(iris.p, labels=abbreviate(iris[, 5], 1, method="both.sides"), + col=as.numeric(iris[, 5])) > Ellipses(iris.p[, 1:2], as.numeric(iris[, 5])) # shipunov > pdf(file="pics/28750.pdf") > library(ade4) > iris.dudi <- dudi.pca(iris[, 1:4], 
scannf=FALSE) > s.class(iris.dudi$li, iris[, 5]) > dev.off() pdf 2 > iris.between <- bca(iris.dudi, iris[, 5], scannf=FALSE) > randtest(iris.between) Monte-Carlo test Call: randtest.between(xtest = iris.between) Observation: 0.7224358 Based on 999 replicates Simulated p-value: 0.001 Alternative hypothesis: greater Std.Obs Expectation Variance 6.912789e+01 1.377969e-02 1.050907e-04 > detach("package:ade4", unload=TRUE) # cca() clashes with vegan > library(vegan) > anosim(iris.p, iris$Species, permutations=99, distance="euclidean") Call: anosim(x = iris.p, grouping = iris$Species, permutations = 99, distance = "euclidean") Dissimilarity: euclidean ANOSIM statistic R: 0.8732 Significance: 0.01 Permutation: free Number of permutations: 99 > # \subsection{Correspondence} > > pdf(file="pics/37300.pdf"); oldpar <- par(mar=c(2, 2, 2, 2)) > library(MASS) > HE <- margin.table(HairEyeColor, 1:2) > HE.df <- Table2df(HE) # shipunov > biplot(corresp(HE.df, nf = 2), xpd=TRUE) > legend("topleft", fill=1:2, legend=c("hair","eyes")) > par(oldpar); dev.off() pdf 2 > pdf(file="pics/44460.pdf"); oldpar <- par(mar=c(2, 2, 1, 1)) > library(vegan) > alla <- read.table("data/lakesea_abio.txt", sep="\t", h=TRUE) > allc <- read.table("data/lakesea_bio.txt", sep="\t", h=TRUE) > oldnames <- names(alla) # names are too long for the plot > names(alla) <- LETTERS[1:ncol(alla)] > all.cca <- cca(allc, alla[, -14]) > plot(all.cca, display=c("sp", "cn")) > points(all.cca, display="sites", pch=ifelse(alla[, 14], 15, 0)) > legend("topleft", pch=c(15, 0, 3), col=c(1, 1, 2), + legend=c("lake sites", "sea sites", "plant species")) > legend("bottomright", pch=names(alla)[-14], col="blue", + legend=oldnames[-14]) > text(-1.6, -4.2, "Carex lasiocarpa", pos=4) > par(oldpar); dev.off() pdf 2 > # \subsection{Projections, unfolds, t-SNE and UMAP} > > if(0){ # to activate plot, comment out here and below + + pdf(file="pics/tapkee.pdf") + + library(tapkee) + + SR <- Gen.dr.data("swissroll") + + COL <- 
rainbow(1100)[1:1000] # separates colors better + + TM <- c("lle", "npe", "ltsa", "hlle", "la", "isomap", "spe", "pca") + + names(TM) <- c("Locally Linear Embedding", + "Neighborhood Preserving1 Embd.", + "Local Tangent Space Alignment", + "Hessian Locally Linear Embd.", + "Laplacian Eigenmaps", "Isomap", + "Stochastic Proximity Embd.", "PCA") + + oldpar <- par(mfrow=c(3, 3), xaxt="n", yaxt="n") + + scatterplot3d(SR, color=COL, pch=20, main="Swiss Roll", + cex.symbols=1.2, xlab="", ylab="", zlab="", + axis=FALSE, tick.marks=FALSE, + label.tick.marks=FALSE, mar=c(1, 1, 2, 1)) + + for (n in 1:length(TM)) plot(Tapkee(SR, method=TM[n]), + col=COL, pch=20, main=names(TM)[n]) + + par(oldpar) + + par(oldpar) + + } > if(0){ # to activate plot, comment out here and below + + pdf(file="pics/tapkee_umap.pdf") + + library(uwot) + + library(tapkee) + + ## library(Rtsne) + + SR <- Gen.dr.data("swissroll") + + COL <- rainbow(1100)[1:1000] # separates colors better + + oldpar <- par(mfrow=c(1, 2), mar=c(1, 1, 3, 1), xaxt="n", yaxt="n") + + ## plot(Rtsne(SR)$Y, col=COL, pch=20, main="t-SNE (Rtsne)") + + plot(Tapkee(SR, method="t-sne"), + col=COL, pch=20, main="t-SNE (tapkee)") + + plot(umap(SR, n_neighbors=15, approx_pow=TRUE, pca=50), + col=COL, pch=20, main="UMAP (uwot)") + + par(oldpar) + + dev.off() + + } > # \subsection{Non-negative matrix factorization} > > library(NMFN) > set.seed(1) > iris.nnmf <- nnmf(iris[, -5], k=2) # we want two factors Multiplicative Update Algorithm Iter = 50 relative error = 0.06931816 diff = 1311.549 eucl dist = 62.21723 Iter = 100 relative error = 0.03606683 diff = 114.538 eucl dist = 16.09118 Iter = 150 relative error = 0.03541534 diff = 4.631678 eucl dist = 15.7323 Iter = 200 relative error = 0.03527807 diff = 1.202054 eucl dist = 15.66371 Iter = 250 relative error = 0.03520953 diff = 0.6464263 eucl dist = 15.6331 Iter = 300 relative error = 0.03517994 diff = 0.4066489 eucl dist = 15.61535 Iter = 350 relative error = 0.03516875 diff = 0.2881102 
eucl dist = 15.60357 Iter = 400 relative error = 0.03516604 diff = 0.2140224 eucl dist = 15.59505 Iter = 450 relative error = 0.03516523 diff = 0.1651553 eucl dist = 15.58848 Iter = 500 relative error = 0.03516451 diff = 0.1248346 eucl dist = 15.58309 Iter = 550 relative error = 0.03516344 diff = 0.09692363 eucl dist = 15.57842 Iter = 600 relative error = 0.03516205 diff = 0.08006654 eucl dist = 15.57419 Iter = 650 relative error = 0.03516066 diff = 0.06978153 eucl dist = 15.57031 Iter = 700 relative error = 0.03515925 diff = 0.06437713 eucl dist = 15.56672 Iter = 750 relative error = 0.03515794 diff = 0.06170888 eucl dist = 15.56344 Iter = 800 relative error = 0.03515679 diff = 0.05935957 eucl dist = 15.56049 Iter = 850 relative error = 0.03515578 diff = 0.05688894 eucl dist = 15.55789 Iter = 900 relative error = 0.03515488 diff = 0.05400323 eucl dist = 15.55565 Iter = 950 relative error = 0.03515407 diff = 0.05063829 eucl dist = 15.55375 Iter = 1000 relative error = 0.03515329 diff = 0.04684556 eucl dist = 15.55217 > plot(iris.nnmf$W, col=iris$Species, xlab="F1", ylab="F2") > iris.nh <- Hulls(iris.nnmf$W, iris$Species) # shipunov > summary(Overlap(iris.nh)) # shipunov Overlaps for each hull, %: mean.overlap total.overlap setosa 0.00 0.00 versicolor 4.98 4.98 virginica 6.73 6.73 Mean of all overlaps 5.85 % > # \section{Discovery with distances} > > # \subsection{Distances} > > nd <- read.table("data/nd.txt", h=TRUE, sep="\t", row.names=1) > nd.d <- as.dist(nd) > str(nd.d) 'dist' int [1:28] 110 122 170 208 268 210 173 219 101 135 ... - attr(*, "Labels")= chr [1:8] "Minot" "Bismarck" "Williston" "Jamestown" ... - attr(*, "Size")= int 8 - attr(*, "call")= language as.dist.default(m = nd) - attr(*, "Diag")= logi FALSE - attr(*, "Upper")= logi FALSE > library(cluster) > iris.dist <- daisy(iris[, 1:4], metric="manhattan") > str(iris.dist) 'dissimilarity' num [1:11175] 0.7 0.8 1 0.2 1.2 ... 
- attr(*, "Size")= int 150 - attr(*, "Metric")= chr "manhattan" > iris.gower <- Gower.dist(iris) # shipunov > str(iris.gower) 'dist' num [1:11175] 0.0528 0.0506 0.0645 0.0139 0.0768 ... - attr(*, "Labels")= chr [1:150] "1" "2" "3" "4" ... - attr(*, "Size")= int 150 - attr(*, "call")= language as.dist.default(m = out) - attr(*, "Diag")= logi FALSE - attr(*, "Upper")= logi FALSE > library(MASS) > eqscplot(moldino_l$LON, moldino_l$LAT, + cex=round(log(moldino_l$SQUARE))-5.5, + axes=FALSE, xlab="", ylab="", xpd=TRUE) # shipunov > text(moldino_l$LON, moldino_l$LAT, labels=row.names(moldino_l), + pos=4, offset=1, cex=.9) > m1 <- t((moldino > 0) * 1) # convert to 0/1 and transpose > library(smirnov) > m1.Txy <- smirnov(m1) > m1.s <- (1 - m1.Txy) # convert similarity into dissimilarity > dimnames(m1.s) <- list(row.names(m1)) > symnum(m1.s) Bobrovyj + Ekslibris + + Gnutyj , + * Leda , + , * Malinovyj.Kruglyj + + + + + Slitnyj , , , , + * Tajnik , , , , + , * Verik + + + + , , + + Zakhar , + + , + + , + + attr(,"legend") [1] 0 ‘ ’ 0.3 ‘.’ 0.6 ‘,’ 0.8 ‘+’ 0.9 ‘*’ 0.95 ‘B’ 1 > m1.Txx <- diag(m1.Txy) > names(m1.Txx) <- row.names(m1) > rev(sort(round(m1.Txx, 3))) Verik Malinovyj.Kruglyj Ekslibris Bobrovyj 0.189 0.130 0.124 0.106 Zakhar Leda Slitnyj Gnutyj 0.105 0.092 0.088 0.087 Tajnik 0.078 > # \subsection{Making maps: multidimensional scaling} > > pdf(file="pics/56450_to_crop.pdf") > nd.d <- as.dist(nd) > nd.c <- cmdscale(nd.d) > new.names <- sub("y C", "y\nC", row.names(nd)) > library(MASS) > eqscplot(nd.c, type="n", axes=FALSE, xlab="", ylab="") > points(nd.c, pch=19) > text(nd.c, labels=new.names, xpd=TRUE, pos=3, cex=0.8) > dev.off() pdf 2 > pdf(file="pics/36340.pdf"); oldpar <- par(mar=c(4, 4, 0, 1)) > library(KernSmooth) KernSmooth 2.23 loaded Copyright M. P. Wand 1997-2009 > iris.c <- cmdscale(iris.dist) > est <- bkde2D(iris.c, bandwidth=c(.7, 1.5)) > plot(iris.c, type="n", xlab="Dim. 1", ylab="Dim. 
2") > text(iris.c, + labels=abbreviate(iris[, 5], 1, method="both.sides")) > contour(est$x1, est$x2, est$fhat, add=TRUE, + drawlabels=FALSE, lty=3) > par(oldpar); dev.off() pdf 2 > pdf(file="pics/94140_to_crop.pdf") > persp(est$x1, est$x2, est$fhat, theta=135, phi=45, + col="purple3", shade=0.75, border=NA, + xlab="Dim. 1", ylab="Dim. 2", zlab="Density") > dev.off() pdf 2 > iris.dist2 <- dist(iris[, 1:4], method="manhattan") > ## to remove zero distances: > > iris.dist2[iris.dist2 == 0] <- abs(jitter(0)) > library(MASS) > iris.m <- isoMDS(iris.dist2) initial value 5.444949 iter 5 value 4.117042 final value 4.075094 converged > cor(iris[, 1:4], iris.m$points) # variable importance ("loadings") [,1] [,2] Sepal.Length 0.9041779 -0.30455326 Sepal.Width -0.3996573 -0.87872539 Petal.Length 0.9956071 0.04209809 Petal.Width 0.9661085 -0.01138353 > (vv <- MDSv(iris.m$points)) # shipunov [1] 97.509738 2.490262 > xxlab <- paste0("Dim 1 (", round(vv[1], 2), "%)") > yylab <- paste0("Dim 2 (", round(vv[2], 2), "%)") > aabb <- abbreviate(iris$Species, 1, method="both.sides") > plot(iris.m$points, pch=aabb, xlab=xxlab, ylab=yylab) > Biarrows(iris.m$points, iris[, -5]) # shipunov > # \subsection{Making trees: hierarchical clustering} > > pdf(file="pics/29530.pdf"); oldpar <- par(mar=c(0, 2, 2, 1)) > aa <- t(atmospheres) # shipunov > aa.dist <- dist(aa) # planets are columns > aa.hclust <- hclust(aa.dist, method="ward.D") > plot(aa.hclust, xlab="", ylab="", sub="") > par(oldpar); dev.off() pdf 2 > cutree(aa.hclust, k=2) # we want 2 groups Mercury Venus Earth Mars Jupiter Saturn Uranus Neptune 1 1 1 1 2 2 2 2 > library(cluster) # advanced clustering package > iris.dist <- daisy(iris[, 1:4], metric="manhattan") # distances > iris.hclust <- hclust(iris.dist) > iris.3 <- cutree(iris.hclust, 3) # we want 3 groups > head(iris.3) # show cluster numbers [1] 1 1 1 1 1 1 > Misclass(iris.3, iris[, 5], best=TRUE) # shipunov Best classification table: obs pred setosa versicolor virginica 1 50 0 
0 2 0 50 16 3 0 0 34 Misclassification errors (%): setosa versicolor virginica 0 0 32 Mean misclassification error: 10.7% > Adj.Rand(iris.3, iris[, 5]) # shipunov [1] 0.7322981 > pdf(file="pics/95700.pdf") > library(cetcolor) > heatmap(aa, col=cet_pal(12, "coolwarm"), margins=c(9, 6)) > dev.off() pdf 2 > pdf(file="pics/57440.pdf"); oldpar <- par(mar=c(2, 0, 0, 0)) > Ploth(iris.hclust, col=as.numeric(iris[, 5]), + pch=16, col.edges=TRUE, horiz=TRUE, leaflab="none") # shipunov > legend("topleft", fill=1:nlevels(iris[, 5]), + legend=levels(iris[, 5])) > par(oldpar); dev.off() pdf 2 > oldpar <- par(mar=c(2, 0, 0, 4)) > tox.dist <- as.dist(1 - abs(tox.cor)) > Ploth(hclust(tox.dist, method="ward.D"), horiz=TRUE) # shipunov > par(oldpar) > # \subsection{How to know the best clustering method} > > PlotBest.hclust(as.dist(m1.s)) # shipunov > PlotBest.dist(m1) # shipunov > pdf(file="pics/141940.pdf") > set.seed(21) > palette(c("#377EB8", "#FF7F00", "#4DAF4A", "#F781BF", "#A65628", + "#984EA3", "#999999", "#E41A1C", "#DEDE00", "#000000")) > nsam <- 500 # how many points to use > K <- 3 # how many clusters to target > noisy.moons <- Gen.cl.data(type="moons", N=nsam, noise=0.05) > no.struct <- list(samples=cbind(runif(nsam), runif(nsam)), + labels=rep(1, nsam)) > varied <- Gen.cl.data(type="blobs", N=nsam, bnoise=c(1, 2.5, 0.5)) > oldpar <- par(mfrow=c(3, 3), mar=c(1, 1, 2, 1), xaxt="n", yaxt="n") > for (n in c("no.struct", "noisy.moons", "varied")) { + newlabels <- cutree(hclust(dist(get(n)$samples), + method="ward.D2"), k=K) + plot(get(n)$samples, col=newlabels, pch=19, + main=paste0("Ward: ", n)) + newlabels <- cutree(hclust(dist(get(n)$samples), + method="single"), k=K) + plot(get(n)$samples, col=newlabels, pch=19, + main=paste0("Single linkage: ", n)) + newlabels <- kmeans(get(n)$samples, centers=K)$cluster + plot(get(n)$samples, col=newlabels, pch=19, + main=paste0("k-means: ", n)) + } > par(oldpar) > palette("default") > set.seed(NULL) > dev.off() pdf 2 > # 
\subsection{How to compare clusterings} > > aa.d1 <- hclust(dist(aa)) > aa.d2 <- hclust(as.dist(1 - abs(cor(atmospheres, method="s"))), + method="ward.D") # shipunov > library(ape) > aa.ph1 <- unroot(as.phylo(aa.d1)) # convert > aa.ph2 <- unroot(as.phylo(aa.d2)) > dist.topo(aa.ph1, aa.ph2) tree1 tree2 2 > phangorn::treedist(aa.ph1, aa.ph2) symmetric.difference branch.score.difference path.difference 2.000000 109.201423 3.872983 quadratic.path.difference 598.444274 > pdf(file="pics/96960.pdf"); oldpar <- par(mar=c(0, 0, 0, 0)) > ass <- cbind(aa.ph1$tip.label, aa.ph1$tip.label) > aa.ph2r <- rotate(aa.ph2, c("Earth", "Neptune")) > cophyloplot(aa.ph1, aa.ph2r, assoc=ass, space=30, lty=2) > par(oldpar); dev.off() pdf 2 > plot(consensus(aa.ph1, aa.ph2r)) > aa12.match <- Hclust.match(aa.d1, aa.d2) # shipunov > library(cetcolor) > cols <- cet_pal(max(aa12.match), "blues") > heatmap(aa12.match, scale="none", col=cols) > aa.h <- hclust(dist(t(atmospheres))) > MRH(aa.h, method="branches") # shipunov [,1] [,2] [,3] [,4] [,5] [,6] [,7] Mercury 0 0 0 0 1 1 1 Venus 1 0 0 0 0 1 1 Earth 0 0 0 0 1 1 1 Mars 1 0 0 0 0 1 1 Jupiter 0 0 1 1 0 0 1 Saturn 0 0 1 1 0 0 1 Uranus 0 1 0 1 0 0 1 Neptune 0 1 0 1 0 0 1 > oldpar <- par() > m1.dist <- as.dist(m1.s) > m1.hclust <- hclust(m1.dist) > m1.c <- cmdscale(m1.dist) > library(vegan3d) > orditree3d(m1.c, m1.hclust, text=attr(m1.dist, "Labels"), type="t") > par(oldpar) Warning in par(oldpar) : graphical parameter "cin" cannot be set Warning in par(oldpar) : graphical parameter "cra" cannot be set Warning in par(oldpar) : graphical parameter "csi" cannot be set Warning in par(oldpar) : graphical parameter "cxy" cannot be set Warning in par(oldpar) : graphical parameter "din" cannot be set Warning in par(oldpar) : graphical parameter "page" cannot be set > # \subsection{How good are resulted clusters} > > pdf(file="pics/58350.pdf"); oldpar <- par(mar=c(1, 3, 1, 1)) > (m1.j <- Jclust(m1, 3, iter=1000)) # shipunov 
........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ Bootstrap support for 3 clusters, 1000 iterations: support cluster members 91.65 1 Bobrovyj, Zakhar 81.90 2 Ekslibris, Gnutyj, Leda, Slitnyj, Tajnik 62.40 3 Malinovyj.Kruglyj, Verik > plot(m1.j, rect.lty=2, sub="") > par(oldpar); dev.off() pdf 2 > pdf(file="pics/29730.pdf"); oldpar <- par(mar=c(0, 2, 0, 0)) > library(pvclust) > m1.pvclust <- pvclust(t(m1), method.dist="manhattan", + nboot=100, parallel=TRUE) Creating a temporary cluster...done: socket cluster with 3 nodes on host ‘localhost’ Multiscale bootstrap... Done. > plot(m1.pvclust, col.pv=c(au="red", bp="darkgreen", edge=0), main="") > par(oldpar); dev.off() pdf 2 > m1.ba <- BootA(m1, FUN=function(.x) # shipunov + as.phylo(hclust(dist(.x, method="minkowski"), + method="average")), iter=100, mc.cores=4) Running parallel bootstraps... done. Calculating bootstrap values... done. 
> plot(m1.ba$boot.tree, show.node.label=TRUE) > plot(m1.ba$cons.tree) # majority-rule consensus > aa.bcl <- Bclust(t(atmospheres), iter=100) # shipunov .................................................................................................... > plot(aa.bcl$hclust) > Bclabels(aa.bcl$hclust, aa.bcl$values, pos=3, offset=0.1) # shipunov > # \subsection{Making groups: $k$-means and friends} > > iris.km <- kmeans(iris[, -5], nstart=5, centers=3) # 3 clusters > str(iris.km$cluster) int [1:150] 2 2 2 2 2 2 2 2 2 2 ... > Misclass(iris$Species, iris.km$cluster, best=TRUE) Best classification table: obs pred 2 3 1 setosa 50 0 0 versicolor 0 48 2 virginica 0 14 36 Misclassification errors (%): 2 3 1 0.0 22.6 5.3 Mean misclassification error: 9.3% > pdf(file="pics/92950.pdf"); oldpar <- par(mar=c(0, 0, 0, 0)) > library(kernlab) Attaching package: ‘kernlab’ The following object is masked from ‘package:permute’: how The following object is masked from ‘package:ggplot2’: alpha > data(spirals) > set.seed(1) > sc <- specc(spirals, centers=2) > plot(spirals, col=sc, xlab="", ylab="") > par(oldpar); dev.off() pdf 2 > library(cluster) > iris.f <- fanny(iris[, 1:4], 3) > head(data.frame(sp=iris[, 5], iris.f$membership)) sp X1 X2 X3 1 setosa 0.9142273 0.03603116 0.04974153 2 setosa 0.8594576 0.05854637 0.08199602 3 setosa 0.8700857 0.05463714 0.07527719 4 setosa 0.8426296 0.06555926 0.09181118 5 setosa 0.9044503 0.04025288 0.05529687 6 setosa 0.7680227 0.09717445 0.13480286 > Misclass(iris.f$clustering, iris[, 5], best=TRUE) # shipunov Best classification table: obs pred setosa virginica versicolor 1 50 0 0 2 0 41 4 3 0 9 46 Misclassification errors (%): setosa virginica versicolor 0 18 8 Mean misclassification error: 8.7% > # \subsection{How to know cluster numbers} > > pdf(file="pics/59100.pdf"); oldpar <- par(mar=c(5, 0, 3, 0)) > library(cluster) > eq.a <- agnes(eq[, -1]) > plot(eq.a, which=1, col=c(0, "maroon")) > par(oldpar); dev.off() pdf 2 > library(mclust) Package 
'mclust' version 5.4.6 Type 'citation("mclust")' for citing this R package in publications. > iris.mclust <- Mclust(iris[, -5]) > summary(iris.mclust) # 2 clusters ---------------------------------------------------- Gaussian finite mixture model fitted by EM algorithm ---------------------------------------------------- Mclust VEV (ellipsoidal, equal shape) model with 2 components: log-likelihood n df BIC ICL -215.726 150 26 -561.7285 -561.7289 Clustering table: 1 2 50 100 > iris.mclust$classification [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 [38] 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 [75] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 [112] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 [149] 2 2 > library(dbscan) > kNNdistplot(iris[, -5], k = 5) # look on the knee > abline(h=.5, col = "red", lty=2) > (iris.dbscan <- dbscan(iris[, -5], eps = .5, minPts = 5)) DBSCAN clustering for 150 objects. Parameters: eps = 0.5, minPts = 5 The clustering contains 2 cluster(s) and 17 noise points. 
0 1 2 17 49 84 Available fields: cluster, eps, minPts > Misclass(iris.dbscan$cluster, iris$Species, best=TRUE, ignore="0") Best classification table: setosa versicolor virginica 1 49 0 0 2 0 44 40 0 0 0 Misclassification errors (%): setosa versicolor virginica 0 0 100 Mean misclassification error: 33.3% Note: data contains NAs > plot(iris.p, type="n", xlab="", ylab="") > text(iris.p, labels=abbreviate(iris[, 5], 1, + method="both.sides"), col=iris.dbscan$cluster+1) > iris.hdbscan <- hdbscan(iris[, -5], minPts=10) > plot(iris.hdbscan$hc) > library(MeanShift) Loading required package: parallel Loading required package: wavethresh WaveThresh: R wavelet software, release 4.6.8, installed Copyright Guy Nason and others 1993-2016 Note: nlevels has been renamed to nlevelsWT > bandwidth <- quantile(dist(iris[, -5]), 0.25) > (bmsClustering(t(iris[, -5]), h=bandwidth)) Running blurring mean-shift algorithm... Blurring mean-shift algorithm ran successfully. Finding clusters... The algorithm found 3 clusters. $components mode1 mode2 mode3 Sepal.Length 5.0007162 6.179920 7.104628 Sepal.Width 3.4178891 2.869793 3.070013 Petal.Length 1.4701932 4.800869 5.957860 Petal.Width 0.2442342 1.640072 2.046728 $labels [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 [38] 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 [75] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3 2 2 2 2 2 [112] 2 2 2 2 2 2 3 3 2 2 2 3 2 2 2 2 2 2 2 2 3 2 2 2 3 2 2 2 2 2 2 2 2 2 2 2 2 [149] 2 2 > aa <- prcomp(iris[, -5])$x[, 1] # only PC1 > Histr(aa, overlay="density", breaks=10) > bb <- c(dist(iris[, -5])) # converts "distance" object into vector > Histr(bb, overlay="density", breaks=10) > pdf(file=NULL); oldpar <- par() > library(NbClust) > iris.nb <- NbClust(iris[, -5], method="ward.D") # wait! *** : The Hubert index is a graphical method of determining the number of clusters. 
In the plot of Hubert index, we seek a significant knee that corresponds to a significant increase of the value of the measure i.e the significant peak in Hubert index second differences plot. *** : The D index is a graphical method of determining the number of clusters. In the plot of D index, we seek a significant knee (the significant peak in Dindex second differences plot) that corresponds to a significant increase of the value of the measure. ******************************************************************* * Among all indices: * 9 proposed 2 as the best number of clusters * 10 proposed 3 as the best number of clusters * 3 proposed 6 as the best number of clusters * 1 proposed 10 as the best number of clusters * 1 proposed 15 as the best number of clusters ***** Conclusion ***** * According to the majority rule, the best number of clusters is 3 ******************************************************************* > par(oldpar); dev.off() Warning in par(oldpar) : graphical parameter "cin" cannot be set Warning in par(oldpar) : graphical parameter "cra" cannot be set Warning in par(oldpar) : graphical parameter "csi" cannot be set Warning in par(oldpar) : graphical parameter "cxy" cannot be set Warning in par(oldpar) : graphical parameter "din" cannot be set Warning in par(oldpar) : graphical parameter "page" cannot be set pdf 2 > # \subsection{Use projection pursuit for clustering} > > library(PPCI) Loading required package: rARPACK > iris.mcdr <- mcdr(iris[, -5], p=3) # dimension reduction > plot(iris.mcdr$fitted, col=iris$Species) > iris.mcdc <- mcdc(iris[, -5], K=3) # clustering > plot(iris.mcdc, labels=iris$Species) > Misclass(iris.mcdc$cluster, iris$Species, best=TRUE) # shipunov Best classification table: obs pred virginica setosa versicolor 1 49 0 2 2 0 50 0 3 1 0 48 Misclassification errors (%): virginica setosa versicolor 2 0 4 Mean misclassification error: 2% > # \subsection{How to compare different ordinations} > > pdf(file="pics/93900.pdf"); oldpar 
<- par(mar=c(0, 0, 0, 0)) > set.seed(42) > library(Rtsne) > iris.unique <- unique(iris) > tsne.out <- Rtsne(as.matrix(iris.unique[, -5])) > irisu.pca <- prcomp(iris.unique[, -5]) > irisu.p <- irisu.pca$x[, 1:2] > library(vegan) > irisu.pr <- procrustes(irisu.p, tsne.out$Y) > plot(irisu.pr, ar.col=iris.unique$Species, pch=" ", + xlab="", ylab="", main="") # arrows point to target (PCA) > points(irisu.pr$Yrot, col=iris.unique$Species, pch=21, bg="white") > with(iris.unique, legend("topright", lty=1, col=1:nlevels(Species), + legend=levels(Species), bty="n")) > legend("bottomright", lty=2:1, legend=c("PCA", "t-SNE"), bty="n") > set.seed(NULL) > par(oldpar); dev.off() pdf 2 > # \section{Answers to exercises} > > s50 <- read.table("data/stars.txt", h=TRUE, sep="\t", + as.is=TRUE, quote="") > str(s50) 'data.frame': 50 obs. of 13 variables: $ STAR.N : int 1 2 3 4 5 6 7 8 9 10 ... $ NAME : chr "Sirius" "Canopus" "Arcturus" "Rigil Kentaurus" ... $ IDENT : chr "9Alpha" "Alpha" "16Alpha" "Alpha1" ... $ CONSTEL.A: chr "CMa" "Car" "Boo" "Cen" ... $ CONSTEL : chr "Canis Major" "Carina" "Bootes" "Centaurus" ... $ VMAG : num -1.46 -0.72 -0.04 -0.01 0.03 0.08 0.12 0.38 0.46 0.5 ... $ DIST : num 8.6 312.6 36.7 4.4 25.3 ... $ PREC : num 0.4 5.1 0.8 0.2 0.4 1.2 19.2 0.3 2.5 21.5 ... $ AMAG : num 1.43 -5.63 -0.3 4.34 0.58 -0.48 -6.75 2.66 -2.76 -5.09 ... $ SPEC : chr "A1Vm" "F0II" "K1.5IIIFe-0.5" "G2V" ... $ RA : chr "06 45 08.9" "06 23 57.1" "14 15 39.7" "14 39 35.9" ... $ DEC : chr "-16 42 58" "-52 41 45" "+19 10 57" "-60 50 07" ... $ HIP : int 32349 30438 69673 71683 91262 24608 24436 37279 7588 27989 ... 
> RA10 <- as.numeric(substr(s50$RA, 1, 2)) + # hours + as.numeric(substr(s50$RA, 4, 5))/60 + # minutes + as.numeric(substr(s50$RA, 7, 10))/3600 # seconds > DEC10 <- sign(as.numeric(substr(s50$DEC, 1, 3))) * + (as.numeric(substr(s50$DEC, 2, 3)) + + as.numeric(substr(s50$DEC, 5, 6))/60 + + as.numeric(substr(s50$DEC, 8, 9))/3600) > coo <- cbind(RA10, DEC10) > oldpar <- par(bg="black", fg="white", mar=rep(0, 4)) > plot(coo, pch="*", cex=(3 - s50$VMAG)) # constellation plot > Hulls(coo, as.numeric(factor(s50$CONSTEL)), # shipunov + usecolors=rep("white", nlevels(factor(s50$CONSTEL)))) > points(runif(100, min(RA10), max(RA10)), + runif(100, min(DEC10), max(DEC10)), pch=".") # random "stars" > par(oldpar) > plot(coo, type="n") # constellation names > text(coo, s50$CONSTEL.A) > plot(coo, type="n") # star names > text(coo, s50$NAME) > library(dbscan) > for (eps in 1:20) cat(c(eps, ":", + names(table(dbscan(coo, eps=eps)$cluster))), "\n") 1 : 0 2 : 0 3 : 0 4 : 0 1 5 : 0 1 6 : 0 1 7 : 0 1 8 : 0 1 2 9 : 0 1 2 3 10 : 0 1 2 3 11 : 0 1 2 12 : 0 1 2 13 : 0 1 2 14 : 0 1 15 : 0 1 16 : 0 1 17 : 0 1 18 : 0 1 19 : 0 1 20 : 0 1 > pdf(file="pics/108660.pdf") > s50.db <- dbscan(coo, eps=9) > oldpar <- par(bg="black", fg="white", mar=rep(0, 4)) > plot(coo, pch=8, cex=(3 - s50$VMAG)) > Hulls(coo, s50.db$cluster, # shipunov + usecolors=c("black", "white", "white", "white")) > points(runif(100, min(RA10), max(RA10)), + runif(100, min(DEC10), max(DEC10)), pch=".") > par(oldpar) > dev.off() pdf 2 > Adj.Rand(as.numeric(factor(s50$CONSTEL)), s50.db$cluster) # shipunov [1] 0.1061416 > beer <- read.table("data/beer.txt", sep="\t", h=TRUE) > head(beer) C01 C02 C03 C04 C05 C06 C07 C08 C09 C10 C11 C12 C13 C14 C15 C16 Afanas.light 0 1 0 1 0 0 0 0 1 0 1 0 1 0 0 0 Baltika.3 0 1 0 0 0 0 1 0 1 1 1 0 0 0 0 1 Baltika.6 0 1 1 1 1 0 1 0 1 1 1 0 1 0 0 1 Baltika.9 0 1 1 1 1 0 1 0 1 1 1 0 1 0 0 1 Bochk.light 0 1 0 0 1 1 0 1 1 1 1 1 0 0 0 1 Budweiser 0 1 1 0 1 0 1 0 1 1 0 1 0 0 0 0 C17 C18 C19 C20 Afanas.light 0 0 
0 0 Baltika.3 0 0 0 0 Baltika.6 0 0 0 1 Baltika.9 0 0 0 1 Bochk.light 0 0 0 0 Budweiser 1 0 0 1 > pdf(file="pics/38270.pdf"); oldpar <- par(mar=c(0, 2, 0, 1)) > library(vegan) > beer.d <- vegdist(beer, "jaccard") > plot(hclust(beer.d, method="ward.D"), main="", xlab="", sub="") > par(oldpar); dev.off() pdf 2 > pdf(file="pics/62550.pdf"); oldpar <- par(mar=c(0, 0, 0, 0)) > kubricks <- read.table("data/kubricks.txt", h=TRUE, row.names=1) > kubricks.d <- dist(kubricks, method="binary") > kubricks.c <- cmdscale(kubricks.d) > plot(kubricks.c, type="n", axes=FALSE) > text(kubricks.c, labels=row.names(kubricks), cex=2) > library(vegan) > lines(spantree(kubricks.d), kubricks.c[, 1:2], lty=2) > par(oldpar); dev.off() pdf 2 > library(phangorn) Attaching package: ‘phangorn’ The following objects are masked from ‘package:vegan’: diversity, treedist > k <- as.phyDat(data.frame(t(kubricks)), type="USER", + levels = c(0, 1)) > kd <- dist.hamming(k) # Hamming distance for morphological data > kdnj <- NJ(kd) # neighbor-joining tree for starter > kp <- optim.parsimony(kdnj, k) # find most parsimonous tree Final p-score 13 after 1 nni operations > ktree <- root(kp, outgroup="H", resolve.root=TRUE) # re-root > plot(ktree, cex=2) > # \chapter{Learn} > > iris.train <- iris[seq(1, nrow(iris), 5), ] > iris.unknown <- iris[-seq(1, nrow(iris), 5), ] > # \section{Learning with regression} > > # \subsection{Linear discriminant analysis} > > library(MASS) > iris.lda <- lda(Species ~ . 
, data=iris.train) > iris.predicted <- predict(iris.lda, iris.unknown[, 1:4]) > Misclass(iris.predicted$class, iris.unknown[, 5]) # shipunov Classification table: obs pred setosa versicolor virginica setosa 40 0 0 versicolor 0 40 7 virginica 0 0 33 Misclassification errors (%): setosa versicolor virginica 0.0 0.0 17.5 Mean misclassification error: 5.8% > ldam <- manova(as.matrix(iris.unknown[, 1:4]) ~ + iris.predicted$class) > summary(ldam, test="Wilks") Df Wilks approx F num Df den Df Pr(>F) iris.predicted$class 2 0.026878 145.34 8 228 < 2.2e-16 *** Residuals 117 --- Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 > summary(aov(as.matrix(iris.unknown[, 1:4]) ~ + iris.predicted$class)) Response Sepal.Length : Df Sum Sq Mean Sq F value Pr(>F) iris.predicted$class 2 51.047 25.5237 108.9 < 2.2e-16 *** Residuals 117 27.423 0.2344 --- Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 Response Sepal.Width : Df Sum Sq Mean Sq F value Pr(>F) iris.predicted$class 2 9.9452 4.9726 46.072 1.749e-15 *** Residuals 117 12.6278 0.1079 --- Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 Response Petal.Length : Df Sum Sq Mean Sq F value Pr(>F) iris.predicted$class 2 341.30 170.650 826.29 < 2.2e-16 *** Residuals 117 24.16 0.207 --- Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 Response Petal.Width : Df Sum Sq Mean Sq F value Pr(>F) iris.predicted$class 2 62.270 31.1350 828.54 < 2.2e-16 *** Residuals 117 4.397 0.0376 --- Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 > library(vegan) > adonis(as.matrix(iris.unknown[, 1:4]) ~ iris.predicted$class) Call: adonis(formula = as.matrix(iris.unknown[, 1:4]) ~ iris.predicted$class) Permutation: free Number of permutations: 999 Terms added sequentially (first to last) Df SumsOfSqs MeanSqs F.Model R2 Pr(>F) iris.predicted$class 2 1.85808 0.92904 437.53 0.88206 0.001 *** Residuals 117 0.24843 0.00212 0.11794 Total 119 2.10651 1.00000 --- Signif. 
codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 > pdf(file="pics/30320.pdf"); oldpar <- par(mar=c(4, 4, 0, 1)) > iris.lda2 <- lda(iris[, 1:4], iris[, 5]) > iris.ldap2 <- predict(iris.lda2, dimen=2)$x > plot(iris.ldap2, type="n", xlab="LD1", ylab="LD2") > text(iris.ldap2, labels=abbreviate(iris[, 5], 1, + method="both.sides")) > Ellipses(iris.ldap2, as.numeric(iris[, 5]), + centers=TRUE) # shipunov > par(oldpar); dev.off() pdf 2 > set.seed(16881) > iris.sample2 <- sample(nrow(iris), 30) > set.seed(NULL) # set random number generator default > iris.train2 <- iris[iris.sample2, ] > iris.unknown2 <- iris[-iris.sample2, ] > iris.lda2 <- lda(Species ~ . , data=iris.train2) > iris.predicted2 <- predict(iris.lda2, iris.unknown2[, 1:4]) > Misclass(iris.predicted2$class, iris.unknown2[, 5]) # shipunov Classification table: obs pred setosa versicolor virginica setosa 44 0 0 versicolor 0 32 12 virginica 0 0 32 Misclassification errors (%): setosa versicolor virginica 0.0 0.0 27.3 Mean misclassification error: 9.1% > table(iris.train2$Species) setosa versicolor virginica 6 18 6 > sam <- Class.sample(iris[, 5], nsam=10) # shipunov > nrow(iris[sam, ]) [1] 30 > RSS <- function() { + sam <- Class.sample(iris[, 5], 5) # 5 samples per class + iris.lda <- lda(Species ~ ., data=iris[sam, ]) + iris.ldp <- predict(iris.lda, iris[!sam, -5]) + iris.ldm <- Misclass(iris.ldp$class, iris[!sam, 5], quiet=TRUE) + mean(((CS <- colSums(iris.ldm)) - diag(iris.ldm))/CS) * 100 + } > res <- replicate(100, RSS()) > quantile(res, c(0.025, 0.5, 0.975)) 2.5% 50% 97.5% 1.833333 4.444444 10.370370 > hist(res) > # \subsection{Recursive partitioning} > > pdf(file="pics/30520.pdf"); oldpar <- par(mar=c(0, 1, 0, 1)) > library(tree) > iris.tree <- tree(Species ~ ., data=iris) > plot(iris.tree) > text(iris.tree) > par(oldpar); dev.off() pdf 2 > iris.tree2 <- tree(Species ~ ., data=iris.train) > iris.tp2 <- predict(iris.tree2, iris.unknown[, -5], type="class") > Misclass(iris.tp2, iris.unknown[, 5]) # 
shipunov Classification table: obs pred setosa versicolor virginica setosa 40 0 0 versicolor 0 38 6 virginica 0 2 34 Misclassification errors (%): setosa versicolor virginica 0 5 15 Mean misclassification error: 6.7% > pdf(file="pics/69320_to_crop.pdf"); oldpar <- par(mar=c(0, 0, 0, 0)) > library(party) Loading required package: mvtnorm Attaching package: ‘mvtnorm’ The following object is masked from ‘package:mclust’: dmvnorm Loading required package: modeltools Loading required package: stats4 Attaching package: ‘modeltools’ The following object is masked from ‘package:kernlab’: prior Loading required package: strucchange Loading required package: zoo Attaching package: ‘zoo’ The following objects are masked from ‘package:base’: as.Date, as.Date.numeric Loading required package: sandwich Attaching package: ‘party’ The following object is masked from ‘package:ape’: where > SA <- abbreviate(iris$Species, 1, method="both.sides") > iris.ct <- ctree(factor(SA) ~ ., data=iris[, 1:4]) > plot(iris.ct) > Misclass(predict(iris.ct), SA) # shipunov Classification table: obs pred a s v a 49 0 5 s 0 50 0 v 1 0 45 Misclassification errors (%): a s v 2 0 10 Mean misclassification error: 4% > par(oldpar); dev.off() pdf 2 > library(OneR) Attaching package: ‘OneR’ The following object is masked from ‘package:PPCI’: breastcancer > iris.one <- OneR(Species ~ ., data=iris.train) Warning in OneR.data.frame(x = data, ties.method = ties.method, verbose = verbose, : data contains unused factor levels > summary(iris.one) Call: OneR.formula(formula = Species ~ ., data = iris.train) Rules: If Petal.Width = (0.198,0.66] then Species = setosa If Petal.Width = (0.66,1.12] then Species = versicolor If Petal.Width = (1.12,1.58] then Species = versicolor If Petal.Width = (1.58,2.04] then Species = virginica If Petal.Width = (2.04,2.5] then Species = virginica Accuracy: 28 of 30 instances classified correctly (93.33%) Contingency table: Petal.Width Species (0.198,0.66] (0.66,1.12] (1.12,1.58] 
(1.58,2.04] (2.04,2.5] Sum setosa * 10 0 0 0 0 10 versicolor 0 * 2 * 6 2 0 10 virginica 0 0 0 * 3 * 7 10 Sum 10 2 6 5 7 30 --- Maximum in each column: '*' Pearson's Chi-squared test: X-squared = 52.8, df = 8, p-value = 1.179e-08 > iris.pre <- predict(iris.one, iris.unknown[, -5]) > Misclass(as.character(iris.pre), iris.unknown$Species, + ignore="UNSEEN", useNA="ifany", best=TRUE) # shipunov Best classification table: obs pred setosa versicolor virginica setosa 35 0 0 versicolor 0 37 3 virginica 0 3 37 5 0 0 Misclassification errors (%): setosa versicolor virginica 12.5 7.5 7.5 Mean misclassification error: 9.2% Note: data contains NAs > # \section{Ensemble learning} > > # \subsection{Random Forest} > > library(randomForest) randomForest 4.6-14 Type rfNews() to see new features/changes/bug fixes. Attaching package: ‘randomForest’ The following object is masked from ‘package:ggplot2’: margin > set.seed(17) # Random Forest is stochastic method > iris.rf <- randomForest(Species ~ ., data=iris.train) > iris.rfp <- predict(iris.rf, iris.unknown[, -5]) > Misclass(iris.rfp, iris.unknown[, 5]) # shipunov Classification table: obs pred setosa versicolor virginica setosa 40 0 0 versicolor 0 39 7 virginica 0 1 33 Misclassification errors (%): setosa versicolor virginica 0.0 2.5 17.5 Mean misclassification error: 6.7% > pdf(file="pics/30890.pdf"); oldpar <- par(mar=c(2, 2, 0, 1)) > set.seed(17) > iris.urf <- randomForest(iris[, -5]) # no response variable > iris.rfc <- cmdscale(1 - iris.urf$proximity) > plot(iris.rfc, xlab="", ylab="", + pch=abbreviate(iris[, 5], 1, method="both.sides")) > Hulls(iris.rfc, as.numeric(iris[, 5]), outliers=FALSE, + coef=0.8, usecolor=rep(1, 3), lty=2) # shipunov > Biarrows(iris.rfc, iris[, -5], shift=c(-0.1, 0)) # shipunov > par(oldpar); dev.off() pdf 2 > # \subsection{Gradient boosting} > > library(gbm) Loaded gbm 2.1.8 > set.seed(4) > iris.gbm <- gbm(Species ~ ., + data=rbind(iris.train, iris.train)) # to make training bigger Distribution not 
specified, assuming multinomial ... Warning: Setting `distribution = "multinomial"` is ill-advised as it is currently broken. It exists only for backwards compatibility. Use at your own risk. > iris.gbm.p1 <- predict(iris.gbm, iris.unknown[, -5], + n.trees=iris.gbm$n.trees) > iris.gbm.p2 <- apply(iris.gbm.p1, 1, # membership trick + function(.x) colnames(iris.gbm.p1)[which.max(.x)]) > Misclass(iris.gbm.p2, iris.unknown[, 5]) # shipunov Classification table: obs pred setosa versicolor virginica setosa 40 0 0 versicolor 0 38 6 virginica 0 2 34 Misclassification errors (%): setosa versicolor virginica 0 5 15 Mean misclassification error: 6.7% > # \section{Learning with proximity} > > library(class) > iris.knn.pred <- knn(train=iris.train[, -5], + test=iris.unknown[, -5], cl=iris.train[, 5], k=5) > Misclass(iris.knn.pred, iris.unknown[, 5]) # shipunov Classification table: obs pred setosa versicolor virginica setosa 40 0 0 versicolor 0 40 11 virginica 0 0 29 Misclassification errors (%): setosa versicolor virginica 0.0 0.0 27.5 Mean misclassification error: 9.2% > pdf(file="pics/97860.pdf"); oldpar <- par(mar=c(0, 0, 0, 0)) > iris.p <- prcomp(iris[, -5])$x[, 1:2] > iris.p1 <- iris.p[seq(1, nrow(iris.p), 5), ] > iris.p2 <- iris.p[-seq(1, nrow(iris.p), 5), ] > library(tripack) > iris.v <- voronoi.mosaic(iris.p1[, 1], iris.p1[, 2], + duplicate="remove") > plot(iris.v, do.points=FALSE, main="", sub="") > points(iris.p1[, 1:2], col=iris.train$Species, pch=16, cex=2) > points(iris.p2[, 1:2], col=iris.unknown$Species) > par(oldpar); dev.off() pdf 2 > library(stringdist) > library(english) > iris.en <- apply(round(iris[, -5]), 1, + function(.x) paste(as.character(as.english(.x)), collapse=" ")) > head(iris.en) [1] "five four one zero" "five three one zero" "five three one zero" [4] "five three two zero" "five four one zero" "five four two zero" > iris.end <- stringdistmatrix(iris.en) > cl1 <- iris$Species > sam <- Class.sample(iris[, 5], nsam=10, uniform=TRUE) > cl1[!sam] <- 
NA # DNN wants unknown classes as NAs > iris.pred <- DNN(dst=iris.end, cl=cl1, k=5) > Misclass(iris.pred, iris$Species[is.na(cl1)]) # shipunov Classification table: obs pred setosa versicolor virginica setosa 40 3 2 versicolor 0 34 5 virginica 0 3 33 Misclassification errors (%): setosa versicolor virginica 0.0 15.0 17.5 Mean misclassification error: 10.8% > library(ddalpha) Loading required package: robustbase Loading required package: sfsmisc Loading required package: geometry > iris.dd <- ddalpha.train(Species ~ ., data=iris.train) Selected columns: Species, Sepal.Length, Sepal.Width, Petal.Length, Petal.Width > iris.p <- predict(iris.dd, iris.unknown[, -5]) # default method > Misclass(unlist(iris.p), iris.unknown[, 5]) # shipunov Classification table: obs pred setosa versicolor virginica setosa 40 0 0 versicolor 0 40 7 virginica 0 0 33 Misclassification errors (%): setosa versicolor virginica 0.0 0.0 17.5 Mean misclassification error: 5.8% > iris.pp <- predict(iris.dd, iris.unknown[, -5], + outsider.method="Ignore") > sapply(iris.pp, as.character) # shows points outside train clouds [1] "Ignored" "Ignored" "Ignored" "Ignored" "Ignored" [6] "Ignored" "Ignored" "Ignored" "Ignored" "Ignored" [11] "Ignored" "Ignored" "Ignored" "setosa" "Ignored" [16] "Ignored" "Ignored" "Ignored" "Ignored" "Ignored" [21] "Ignored" "setosa" "setosa" "Ignored" "Ignored" [26] "Ignored" "Ignored" "setosa" "Ignored" "Ignored" [31] "Ignored" "setosa" "Ignored" "Ignored" "Ignored" [36] "Ignored" "Ignored" "Ignored" "Ignored" "setosa" [41] "versicolor" "Ignored" "Ignored" "Ignored" "Ignored" [46] "Ignored" "Ignored" "Ignored" "Ignored" "Ignored" [51] "Ignored" "Ignored" "Ignored" "Ignored" "Ignored" [56] "Ignored" "Ignored" "Ignored" "Ignored" "versicolor" [61] "Ignored" "Ignored" "versicolor" "Ignored" "Ignored" [66] "Ignored" "Ignored" "Ignored" "Ignored" "Ignored" [71] "Ignored" "versicolor" "versicolor" "versicolor" "Ignored" [76] "versicolor" "versicolor" "versicolor" "Ignored" 
"versicolor" [81] "Ignored" "virginica" "Ignored" "Ignored" "Ignored" [86] "Ignored" "Ignored" "Ignored" "Ignored" "Ignored" [91] "Ignored" "Ignored" "Ignored" "Ignored" "Ignored" [96] "Ignored" "Ignored" "Ignored" "Ignored" "Ignored" [101] "Ignored" "Ignored" "Ignored" "Ignored" "Ignored" [106] "Ignored" "Ignored" "Ignored" "Ignored" "Ignored" [111] "Ignored" "Ignored" "Ignored" "Ignored" "virginica" [116] "Ignored" "Ignored" "Ignored" "Ignored" "Ignored" > # \section{Learning with rules} > > library(e1071) > iris.nb <- naiveBayes(Species ~ ., data=iris.train) > iris.nb Naive Bayes Classifier for Discrete Predictors Call: naiveBayes.default(x = X, y = Y, laplace = laplace) A-priori probabilities: Y setosa versicolor virginica 0.3333333 0.3333333 0.3333333 Conditional probabilities: Sepal.Length Y [,1] [,2] setosa 5.16 0.2988868 versicolor 5.96 0.6257440 virginica 6.94 0.5059644 Sepal.Width Y [,1] [,2] setosa 3.47 0.4423423 versicolor 2.87 0.4270051 virginica 3.10 0.1490712 Petal.Length Y [,1] [,2] setosa 1.49 0.1663330 versicolor 4.32 0.3966527 virginica 5.77 0.4762119 Petal.Width Y [,1] [,2] setosa 0.26 0.0843274 versicolor 1.34 0.2366432 virginica 2.19 0.2282786 > iris.nbp <- predict(iris.nb, iris.unknown[, -5]) > Misclass(iris.nbp, iris.unknown[, 5]) # shipunov Classification table: obs pred setosa versicolor virginica setosa 40 0 0 versicolor 0 39 13 virginica 0 1 27 Misclassification errors (%): setosa versicolor virginica 0.0 2.5 32.5 Mean misclassification error: 11.7% > library(arulesCBA) Loading required package: Matrix Loading required package: arules Attaching package: ‘arules’ The following object is masked from ‘package:modeltools’: info The following object is masked from ‘package:wavethresh’: support The following object is masked from ‘package:kernlab’: size The following objects are masked from ‘package:base’: abbreviate, write Attaching package: ‘arulesCBA’ The following object is masked from ‘package:party’: response > irisd <- 
as.data.frame(lapply(iris[1:4], discretize, breaks=9)) > irisd$Species <- iris$Species > irisd.train <- irisd[seq(1, nrow(irisd), 5), ] > irisd.unknown <- irisd[-seq(1, nrow(irisd), 5), ] > irisd.cba <- CBA(Species ~ ., irisd.train, supp=0.05, conf=0.8) > inspect(irisd.cba$rules) lhs rhs support confidence [1] {Petal.Length=[1.5,2.63)} => {Species=setosa} 0.2000000 1.0000000 [2] {Petal.Length=[5.7,6.9]} => {Species=virginica} 0.2000000 1.0000000 [3] {Petal.Width=[2.14,2.5]} => {Species=virginica} 0.2000000 1.0000000 [4] {Petal.Width=[0.2,0.211)} => {Species=setosa} 0.2000000 1.0000000 [5] {Petal.Width=[0.867,1.3)} => {Species=versicolor} 0.1333333 1.0000000 [6] {Petal.Width=[0.211,0.867)} => {Species=setosa} 0.1333333 1.0000000 [7] {Petal.Length=[4.5,4.9)} => {Species=versicolor} 0.1333333 1.0000000 [8] {Petal.Length=[4.1,4.5)} => {Species=versicolor} 0.1333333 1.0000000 [9] {} => {Species=virginica} 0.3333333 0.3333333 coverage lift count size coveredTransactions totalErrors [1] 0.2000000 3 6 2 6 14 [2] 0.2000000 3 6 2 6 8 [3] 0.2000000 3 6 2 3 5 [4] 0.2000000 3 6 2 2 3 [5] 0.1333333 3 4 2 4 3 [6] 0.1333333 3 4 2 2 1 [7] 0.1333333 3 4 2 4 1 [8] 0.1333333 3 4 2 2 0 [9] NA 1 30 1 1 0 > irisd.cbap <- predict(irisd.cba, irisd.unknown) > Misclass(irisd.cbap, irisd.unknown$Species) # shipunov Classification table: obs pred setosa versicolor virginica setosa 37 0 0 versicolor 0 31 3 virginica 3 9 37 Misclassification errors (%): setosa versicolor virginica 7.5 22.5 7.5 Mean misclassification error: 12.5% > # \section{Learning from the black boxes} > > # \subsection{Support Vector Machines} > > library(e1071) > iris.svm <- svm(Species ~ ., data=iris.train) > iris.svmp <- predict(iris.svm, iris.unknown[, -5]) > Misclass(iris.svmp, iris.unknown[, 5]) # shipunov Classification table: obs pred setosa versicolor virginica setosa 40 0 0 versicolor 0 40 14 virginica 0 0 26 Misclassification errors (%): setosa versicolor virginica 0 0 35 Mean misclassification error: 11.7% > 
pdf(file="pics/70010.pdf"); oldpar <- par(mar=c(2, 2, 0, 1)) > iris.p <- prcomp(iris[, -5])$x[, 1:2] > iris.svm.pca <- svm(Species ~ ., data=cbind(iris[5], iris.p)) > plot(iris.p, type="n", xlab="", ylab="") > Gradd(iris.svm.pca, iris.p, cex=0.4) # shipunov > text(iris.p, col=as.numeric(iris[, 5]), + labels=abbreviate(iris[, 5], 1, method="both.sides")) > par(oldpar); dev.off() pdf 2 > # \subsection{Neural Networks} > > library(nnet) > iris.nn <- nnet(Species ~ . , data=iris.train, size=4) # weights: 35 initial value 38.235938 iter 10 value 0.305649 iter 20 value 0.019828 iter 30 value 0.001140 iter 40 value 0.000331 iter 50 value 0.000155 final value 0.000086 converged > iris.nnp <- predict(iris.nn, iris.unknown[, -5], type="class") > Misclass(iris.nnp, iris.unknown$Species) # shipunov Classification table: obs pred setosa versicolor virginica setosa 40 1 0 versicolor 0 31 10 virginica 0 8 30 Misclassification errors (%): setosa versicolor virginica 0.0 22.5 25.0 Mean misclassification error: 15.8% > pdf(file="pics/112300_to_crop.pdf") > library(NeuralNetTools) > oldpar <- par(mar=c(0, 2, 0, 1), xpd=TRUE) > plotnet(iris.nn) > par(oldpar) > dev.off() pdf 2 > Perceptron <- function(x, y, eta, n) { + ## x dataset, y classes, eta learning speed, n cycles + ## make initial weights randomly + weight <- runif(ncol(x) + 1) + ## make initial errors vector: we need it for visualization + errors <- rep(0, n) + ## loop over number of epochs (learning cycles) n + for (jj in 1:n) { + ## loop through training data set + for (ii in 1:length(y)) { + ## main equation: "w1*x + w2*y + ... 
+ b" (_w_eights and _b_ias) + z <- sum(as.numeric(x[ii, ]) * weight[-1]) + weight[1] + ## decision step: Heaviside activation + ypred <- as.numeric(z > 0) # all > 0 become 1 + ## update weight, it changes if the predicted value is incorrect + weightdiff <- eta * (y[ii] - ypred) * c(1, as.numeric(x[ii, ])) + weight <- weight + weightdiff + ## update error vector + if ((y[ii] - ypred) != 0) errors[jj] <- errors[jj] + 1 + } + } + ## name coefficients + names(weight) <- c("bias", names(x)) + ## output will be a list + list(weight=weight, errors=errors) + } > ## new "species": "setosa" and "non-setosa" (1 or 0) > > y <- as.numeric(iris[, 5] == "setosa") > ## make sample (10% of each class) > > sam <- Class.sample(y, prop=0.1, uniform=TRUE) # shipunov > ## training classes > > y.trn <- y[sam] > ## select two characters (but more characters is usually better) > > x <- iris[, c(1, 3)] > ## check if everything is separable > > plot(x, col=y+1) > ## training data > > x.trn <- x[sam, ] > (P <- Perceptron(x=x.trn, y=y.trn, eta=0.5, n=10)) $weight bias Sepal.Length Petal.Length 0.7122841 1.3388011 -2.2789806 $errors [1] 1 3 1 0 0 0 0 0 0 0 > ## make testing sample > > x.tst <- x[!sam, ] > ## select weights from perceptron object > > W <- P$weight > ## predict each row of x.tst with the main Perceptron() equation > > z.tst <- apply(x.tst, 1, function(.x) sum(.x * W[-1] + W[1])) > ## decision > > z.tst <- as.numeric(z.tst > 0) > ## testing classes to compare with prediction > > y.tst <- y[!sam] > Misclass(z.tst, y.tst) # shipunov Classification table: obs pred 0 1 0 74 0 1 16 45 Misclassification errors (%): 0 1 17.8 0.0 Mean misclassification error: 8.9% > pdf(file="pics/63190_to_crop.pdf") > library(NeuralNetTools) > oldpar <- par(mar=c(0, 2, 0, 1), xpd=TRUE) > plotnet(W, struct=c(2, 1), x_names=names(x), y_names="setosa") > par(oldpar) > dev.off() pdf 2 > # \section{Semi-supervised learning} > > library(SSL) Registered S3 methods overwritten by 'klaR': method from predict.rda 
vegan print.rda vegan plot.rda vegan > iris.10 <- seq(1, nrow(iris), 10) # only 10 labeled points! > iris.sslt1 <- sslSelfTrain(iris[iris.10, -5], iris[iris.10, 5], + iris[-iris.10, -5], nrounds=20, n=5) > iris.sslt2 <- levels(iris$Species)[iris.sslt1] > Misclass(iris.sslt2, iris[-iris.10, 5]) # shipunov Classification table: obs pred setosa versicolor virginica setosa 45 0 0 versicolor 0 43 6 virginica 0 2 39 Misclassification errors (%): setosa versicolor virginica 0.0 4.4 13.3 Mean misclassification error: 5.9% > set.seed(0) > library(kpeaks) > nclusters <- max(findk(iris[, -5])$pcounts) # step 1 > iris.cl <- kmeans(iris[, -5], centers=nclusters)$cluster # step 2 > sel <- Class.sample(iris.cl, 10) # shipunov > iris.svm <- svm(as.factor(iris.cl[sel]) ~ ., data=iris[sel, -5]) > iris.svmp <- predict(iris.svm, iris[!sel, -5]) # step 3 > Misclass(iris.svmp, iris[!sel, 5], best=TRUE) # shipunov Best classification table: obs pred virginica setosa versicolor 1 28 0 2 2 0 40 0 3 10 0 40 Misclassification errors (%): virginica setosa versicolor 26.3 0.0 4.8 Mean misclassification error: 10.4% > set.seed(NULL) > iris.dms <- Classproj(iris[, -5], iris$Species) # shipunov > plot(iris.dms$proj, col=iris$Species) > text(iris.dms$centers, levels(iris$Species), col=1:3) > sam <- Class.sample(iris$Species, 10) # shipunov > newclasses <- iris$Species > newclasses[!sam] <- NA > iris.dms <- Classproj(iris[, -5], newclasses) # shipunov > plot(iris.dms$proj, col=iris$Species, pch=ifelse(sam, 19, 1)) > text(iris.dms$centers, levels(iris$Species), col=1:3) > library(ksharp) > iris.km <- kmeans(iris[, -5], centers=3) > Misclass(iris.km$cluster, iris$Species, best=TRUE) # shipunov Best classification table: obs pred setosa versicolor virginica 1 50 0 0 2 0 48 14 3 0 2 36 Misclassification errors (%): setosa versicolor virginica 0 4 28 Mean misclassification error: 10.7% > names(iris.km$cluster) <- row.names(iris) # ksharp requirement > iris.ksharp <- ksharp(iris.km, data=iris[, -5]) > 
plot(prcomp(iris[, -5])$x, col=iris$Species, + pch=ifelse(iris.ksharp$cluster == 0, -1, 14)+iris.km$cluster) > legend("topright", col=c(1:3, rep(1, 4)), + pch=c(rep(19, 3), 15:17, 1), + legend=c("I. setosa", "I. versicolor", "I. virginica", + "Cluster I", "Cluster II", "Cluster III", "'noise'")) > Misclass(iris.ksharp$cluster, iris$Species, ignore=0, best=TRUE) Best classification table: obs pred setosa versicolor virginica 1 50 0 0 2 0 44 7 3 0 0 34 Misclassification errors (%): setosa versicolor virginica 0.0 0.0 17.1 Mean misclassification error: 5.7% Note: data contains NAs > aad <- dist(t(atmospheres)) > aadu <- Updist(aad, unlink=list(c("Earth", "Mercury"))) # shipunov > plot(hclust(aadu)) > # \section{How to choose the right method} > > # \section{Answers to exercises} > > pdf(file="pics/37980.pdf"); oldpar <- par(mar=c(0, 1, 0, 1)) > eq.tree <- tree(eq[, 1] ~ ., eq[, -1]) # shipunov > plot(eq.tree); text(eq.tree) > par(oldpar); dev.off() pdf 2 > # \chapter{Example of \R session} > > # \section{Starting...} > > dir("data") [1] "argon_c.txt" "argon.txt" "beer_c.txt" [4] "beer.txt" "bigaln2.txt" "bigaln_c.txt" [7] "bigaln.tps" "bigaln.txt" "bpress_c.txt" [10] "bpress.txt" "bugs_c.txt" "bugs.txt" [13] "cashiers_c.txt" "cashiers.txt" "com12_c.txt" [16] "com12.rd" "com80_c.txt" "com80.txt" [19] "dact_c.txt" "dact.txt" "d_c.txt" [22] "distances.nts" "d.txt" "eggs_c.txt" [25] "eggs.txt" "elections_c.txt" "elections.txt" [28] "eq8_c.txt" "eq8.txt" "errors_c.txt" [31] "errors.txt" "exams_c.txt" "exams.txt" [34] "grades_c.txt" "grades.txt" "ipomopsis_c.txt" [37] "ipomopsis.txt" "kubricks_c.txt" "kubricks.txt" [40] "lakesea_abio.txt" "lakesea_bio_abio_c.txt" "lakesea_bio.txt" [43] "logit_c.txt" "logit.txt" "mydata1.txt" [46] "mydata2.txt" "mydata3.txt" "mydata_c.txt" [49] "mydata.txt" "nd_c.txt" "nd.txt" [52] "pear2d_bw.png" "phanerozoic_c.txt" "phanerozoic.txt" [55] "poisoning_c.txt" "poisoning.txt" "seedlings_c.txt" [58] "seedlings.txt" "seeing_c.txt" "seeing.txt" 
[61] "species_c.txt" "species.txt" "spur_c.txt" [64] "spur.txt" "stars_c.txt" "stars.txt" [67] "sundew_c.txt" "sundew.txt" "tea_c.txt" [70] "tea.txt" "teeth_c.txt" "teeth.txt" [73] "traits_c.txt" "traits.txt" > data <- read.table("data/bugs.txt", h=TRUE) > data <- read.table("http://ashipunov.me/data/bugs.txt", h=TRUE) > head(data) SEX COLOR WEIGHT LENGTH 1 0 1 10.68 9.43 2 1 1 10.02 10.66 3 0 2 10.18 10.41 4 1 1 8.01 9.00 5 0 3 10.23 8.98 6 1 3 9.70 9.71 > str(data) 'data.frame': 10 obs. of 4 variables: $ SEX : int 0 1 0 1 0 1 1 0 1 1 $ COLOR : int 1 1 2 1 3 3 2 3 1 2 $ WEIGHT: num 10.68 10.02 10.18 8.01 10.23 ... $ LENGTH: num 9.43 10.66 10.41 9 8.98 ... > data.f <- data[data$SEX == 0, ] > data.m.big <- data[data$SEX == 1 & data$LENGTH > 10, ] > data$WEIGHT.R <- data$WEIGHT/data$LENGTH > write.table(data, file="data/bugs_new.txt", quote=FALSE) > # \section{Describing...} > > summary(data) SEX COLOR WEIGHT LENGTH Min. :0.0 Min. :1.00 Min. : 8.010 Min. : 8.970 1st Qu.:0.0 1st Qu.:1.00 1st Qu.: 9.707 1st Qu.: 9.023 Median :1.0 Median :2.00 Median :10.100 Median : 9.330 Mean :0.6 Mean :1.90 Mean :10.041 Mean : 9.582 3rd Qu.:1.0 3rd Qu.:2.75 3rd Qu.:10.568 3rd Qu.:10.182 Max. :1.0 Max. :3.00 Max. :11.450 Max. :10.660 WEIGHT.R Min. :0.8900 1st Qu.:0.9832 Median :1.0475 Mean :1.0496 3rd Qu.:1.1263 Max. :1.2156 > data1 <- data > data1$SEX <- factor(data1$SEX, labels=c("female", "male")) > data1$COLOR <- factor(data1$COLOR, + labels=c("red", "blue", "green")) > summary(data$WEIGHT) Min. 1st Qu. Median Mean 3rd Qu. Max. 
8.010 9.707 10.100 10.041 10.568 11.450 > min(data$WEIGHT) [1] 8.01 > max(data$WEIGHT) [1] 11.45 > median(data$WEIGHT) [1] 10.1 > mean(data$WEIGHT) [1] 10.041 > colMeans(data) SEX COLOR WEIGHT LENGTH WEIGHT.R 0.600000 1.900000 10.041000 9.582000 1.049648 > round(colMeans(data), 1) SEX COLOR WEIGHT LENGTH WEIGHT.R 0.6 1.9 10.0 9.6 1.0 > data2 <- data > data2[3, 3] <- NA > mean(data2$WEIGHT) [1] NA > mean(data2$WEIGHT, na.rm=TRUE) [1] 10.02556 > data2.o <- na.omit(data2) > sum(data$WEIGHT) [1] 100.41 > sum(data[2, ]) [1] 23.61996 > apply(data, 1, sum) [1] 22.24256 23.61996 23.56791 19.90000 23.34920 24.40897 22.89041 24.66560 [9] 21.18453 25.89735 > table(data$SEX) 0 1 4 6 > table(data$COLOR) 1 2 3 4 3 3 > 100*prop.table(table(data$SEX)) 0 1 40 60 > sd(data$WEIGHT) [1] 0.994501 > sapply(data[, 3:4], sd) WEIGHT LENGTH 0.9945010 0.6585641 > sapply(data2[, 3:4], sd, na.rm=TRUE) WEIGHT LENGTH 1.0535548 0.6585641 > 100*sd(data$WEIGHT)/mean(data$WEIGHT) [1] 9.904402 > tapply(data$WEIGHT, data$SEX, mean) 0 1 10.577500 9.683333 > table(data$COLOR, data$SEX) 0 1 1 1 3 2 1 2 3 2 1 > 100*prop.table(table(data$COLOR, data$SEX)) 0 1 1 10 30 2 10 20 3 20 10 > tapply(data$WEIGHT, list(data$SEX, data$COLOR), mean) 1 2 3 0 10.680000 10.18 10.725 1 9.073333 10.59 9.700 > # \section{Plotting...} > > hist(data$WEIGHT, breaks=3) > boxplot(data$LENGTH) > boxplot(data$LENGTH ~ data$SEX) > qqnorm(data$WEIGHT); qqline(data$WEIGHT) > plot(data$LENGTH, data$WEIGHT, type="p") > plot(data$LENGTH, data$WEIGHT, type="p", cex=0.5) > plot(data$LENGTH, data$WEIGHT, type="p", cex=2) > pdf(file="pics/47690.pdf"); oldpar <- par(mar=c(2, 2, 0, 1)) > Ex.points() # shipunov > par(oldpar); dev.off() pdf 2 > Ex.lines() # shipunov [1] 0 1 2 3 4 5 6 > Ex.cols() # shipunov > Ex.fonts() # shipunov > Ex.types() # shipunov > plot(data$LENGTH, data$WEIGHT, type="p", pch=2) > plot(data$LENGTH, data$WEIGHT, type="n") > text(data$LENGTH, data$WEIGHT, labels=data$SEX) > plot(data$LENGTH, data$WEIGHT, 
pch=as.character(data$SEX)) > plot(data$LENGTH, data$WEIGHT, type="n") > text(data$LENGTH, data$WEIGHT, labels=data$SEX, col=data$SEX+1) > plot(data$LENGTH, data$WEIGHT, type="n") > points(data$LENGTH, data$WEIGHT, pch=data$SEX) > pdf(file="pics/52050.pdf"); oldpar <- par(mar=c(4, 4, 1, 1)) > plot(data$LENGTH^3, data$WEIGHT, type="n", + xlab=expression("Volume (cm"^3*")"), ylab="Weight") > text(data$LENGTH^3, data$WEIGHT, + labels=ifelse(data$SEX, "\\MA", "\\VE"), + vfont=c("serif","plain"), cex=1.5) > par(oldpar); dev.off() pdf 2 > plot(data$LENGTH, data$WEIGHT, type="n") > points(data$LENGTH, data$WEIGHT, pch=data$SEX*3, col=data$SEX+1) > legend("bottomright", legend=c("male", "female"), + pch=c(0, 3), col=1:2) > dev.copy(pdf, "graph.pdf") pdf 5 > dev.off() pdf 2 > pdf("graph.pdf") > plot(data$LENGTH, data$WEIGHT, type="n") > points(data$LENGTH, data$WEIGHT, pch=data$SEX*3, col=data$SEX+1) > legend("bottomright", legend=c("male", "female"), + pch=c(0, 3), col=1:2) > dev.off() pdf 2 > # \section{Testing...} > > t.test(data$WEIGHT, data$LENGTH, paired=TRUE) Paired t-test data: data$WEIGHT and data$LENGTH t = 1.529, df = 9, p-value = 0.1606 alternative hypothesis: true difference in means is not equal to 0 95 percent confidence interval: -0.2201116 1.1381116 sample estimates: mean of the differences 0.459 > t.test(data$WEIGHT, data$LENGTH, paired=FALSE) Welch Two Sample t-test data: data$WEIGHT and data$LENGTH t = 1.2169, df = 15.62, p-value = 0.2417 alternative hypothesis: true difference in means is not equal to 0 95 percent confidence interval: -0.3421944 1.2601944 sample estimates: mean of x mean of y 10.041 9.582 > t.test(data$WEIGHT ~ data$SEX) Welch Two Sample t-test data: data$WEIGHT by data$SEX t = 1.7277, df = 7.2425, p-value = 0.1262 alternative hypothesis: true difference in means is not equal to 0 95 percent confidence interval: -0.3213711 2.1097044 sample estimates: mean in group 0 mean in group 1 10.577500 9.683333 > data[, c("WEIGHT", "SEX")] WEIGHT 
SEX 1 10.68 0 2 10.02 1 3 10.18 0 4 8.01 1 5 10.23 0 6 9.70 1 7 9.73 1 8 11.22 0 9 9.19 1 10 11.45 1 > data3 <- unstack(data[, c("WEIGHT", "SEX")]) > t.test(data3[[1]], data3[[2]]) Welch Two Sample t-test data: data3[[1]] and data3[[2]] t = 1.7277, df = 7.2425, p-value = 0.1262 alternative hypothesis: true difference in means is not equal to 0 95 percent confidence interval: -0.3213711 2.1097044 sample estimates: mean of x mean of y 10.577500 9.683333 > wilcox.test(data$WEIGHT, data$LENGTH, paired=TRUE) Warning in wilcox.test.default(data$WEIGHT, data$LENGTH, paired = TRUE) : cannot compute exact p-value with ties Wilcoxon signed rank test with continuity correction data: data$WEIGHT and data$LENGTH V = 40.5, p-value = 0.202 alternative hypothesis: true location shift is not equal to 0 > wilcox.test(data$WEIGHT ~ data$SEX) Wilcoxon rank sum exact test data: data$WEIGHT by data$SEX W = 20, p-value = 0.1143 alternative hypothesis: true location shift is not equal to 0 > pairwise.t.test(data$WEIGHT, data$COLOR, p.adj="bonferroni") Pairwise comparisons using t tests with pooled SD data: data$WEIGHT and data$COLOR 1 2 2 0.7 - 3 0.8 1.0 P value adjustment method: bonferroni > kruskal.test(data$WEIGHT ~ data$COLOR) Kruskal-Wallis rank sum test data: data$WEIGHT by data$COLOR Kruskal-Wallis chi-squared = 1.6545, df = 2, p-value = 0.4372 > pairwise.wilcox.test(data$WEIGHT, data$COLOR) Pairwise comparisons using Wilcoxon rank sum exact test data: data$WEIGHT and data$COLOR 1 2 2 1 - 3 1 1 P value adjustment method: holm > chisq.test(data$COLOR, data$SEX) Warning in chisq.test(data$COLOR, data$SEX) : Chi-squared approximation may be incorrect Pearson's Chi-squared test data: data$COLOR and data$SEX X-squared = 1.3194, df = 2, p-value = 0.517 > prop.test(sum(data$SEX), length(data$SEX), 0.5) 1-sample proportions test with continuity correction data: sum(data$SEX) out of length(data$SEX), null probability 0.5 X-squared = 0.1, df = 1, p-value = 0.7518 alternative hypothesis: 
true p is not equal to 0.5 95 percent confidence interval: 0.2736697 0.8630694 sample estimates: p 0.6 > cor.test(data$WEIGHT, data$LENGTH, method="pearson") Pearson's product-moment correlation data: data$WEIGHT and data$LENGTH t = 1.2276, df = 8, p-value = 0.2545 alternative hypothesis: true correlation is not equal to 0 95 percent confidence interval: -0.3089373 0.8217631 sample estimates: cor 0.3981316 > cor.test(data$WEIGHT, data$LENGTH, method="spearman") Spearman's rank correlation rho data: data$WEIGHT and data$LENGTH S = 104, p-value = 0.2956 alternative hypothesis: true rho is not equal to 0 sample estimates: rho 0.369697 > summary(lm(data$LENGTH ~ data$SEX)) Call: lm(formula = data$LENGTH ~ data$SEX) Residuals: Min 1Q Median 3Q Max -0.6583 -0.5369 -0.1825 0.5542 1.0317 Coefficients: Estimate Std. Error t value Pr(>|t|) (Intercept) 9.5125 0.3478 27.349 3.44e-09 *** data$SEX 0.1158 0.4490 0.258 0.803 --- Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 Residual standard error: 0.6956 on 8 degrees of freedom Multiple R-squared: 0.00825, Adjusted R-squared: -0.1157 F-statistic: 0.06655 on 1 and 8 DF, p-value: 0.8029 > aov(lm(data$LENGTH ~ data$SEX)) Call: aov(formula = lm(data$LENGTH ~ data$SEX)) Terms: data$SEX Residuals Sum of Squares 0.032202 3.871158 Deg. 
of Freedom 1 8 Residual standard error: 0.6956255 Estimated effects may be unbalanced > # \section{Finishing...} > > # \section{Answers to exercises} > > # \chapter{Ten Years Later, or use \R script} > > pdf(file="pics/gridmoon.pdf", width=12); oldpar <- par(mar=c(0, 0, 0, 0)) > Gridmoon() # shipunov > par(oldpar); dev.off() pdf 2 > # \section{How to make your \R script} > > # \section{My \R script does not work!} > > plot(log(trees$Volume), 1:nrow(trees)) > log(trees$Volume) [1] 2.332144 2.332144 2.322388 2.797281 2.933857 2.980619 2.747271 2.901422 [9] 3.117950 2.990720 3.186353 3.044522 3.063391 3.058707 2.949688 3.100092 [17] 3.520461 3.310543 3.246491 3.214868 3.540959 3.456317 3.591818 3.645450 [25] 3.751854 4.014580 4.019980 4.065602 3.941582 3.931826 4.343805 > trees$Volume [1] 10.3 10.3 10.2 16.4 18.8 19.7 15.6 18.2 22.6 19.9 24.2 21.0 21.4 21.3 19.1 [16] 22.2 33.8 27.4 25.7 24.9 34.5 31.7 36.3 38.3 42.6 55.4 55.7 58.3 51.5 51.0 [31] 77.0 > trees Girth Height Volume 1 8.3 70 10.3 2 8.6 65 10.3 3 8.8 63 10.2 4 10.5 72 16.4 5 10.7 81 18.8 6 10.8 83 19.7 7 11.0 66 15.6 8 11.0 75 18.2 9 11.1 80 22.6 10 11.2 75 19.9 11 11.3 79 24.2 12 11.4 76 21.0 13 11.4 76 21.4 14 11.7 69 21.3 15 12.0 75 19.1 16 12.9 74 22.2 17 12.9 85 33.8 18 13.3 86 27.4 19 13.7 71 25.7 20 13.8 64 24.9 21 14.0 78 34.5 22 14.2 80 31.7 23 14.5 74 36.3 24 16.0 72 38.3 25 16.3 77 42.6 26 17.3 81 55.4 27 17.5 82 55.7 28 17.9 80 58.3 29 18.0 80 51.5 30 18.0 80 51.0 31 20.6 87 77.0 > 1:nrow(trees) [1] 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 [26] 26 27 28 29 30 31 > nrow(trees) [1] 31 > if(0) { + + ## print here anything _syntactically correct_ + + } > abc <- function(x) + { + for (i in 1:10) x <- x+1 + x + } > abc(5) [1] 15 > abc <- function(x) + { + for (i in 1:10) { x <- x+1; print(x) } + x + } > abc(5) [1] 6 [1] 7 [1] 8 [1] 9 [1] 10 [1] 11 [1] 12 [1] 13 [1] 14 [1] 15 [1] 15 > # \section{Common pitfalls in \R scripting} > > # \subsection{Advices} > > trees[, c("Height", 
"Volume")] Height Volume 1 70 10.3 2 65 10.3 3 63 10.2 4 72 16.4 5 81 18.8 6 83 19.7 7 66 15.6 8 75 18.2 9 80 22.6 10 75 19.9 11 79 24.2 12 76 21.0 13 76 21.4 14 69 21.3 15 75 19.1 16 74 22.2 17 85 33.8 18 86 27.4 19 71 25.7 20 64 24.9 21 78 34.5 22 80 31.7 23 74 36.3 24 72 38.3 25 77 42.6 26 81 55.4 27 82 55.7 28 80 58.3 29 80 51.5 30 80 51.0 31 87 77.0 > trees[, trees %-% c("Height", "Volume")] # shipunov [1] 8.3 8.6 8.8 10.5 10.7 10.8 11.0 11.0 11.1 11.2 11.3 11.4 11.4 11.7 12.0 [16] 12.9 12.9 13.3 13.7 13.8 14.0 14.2 14.5 16.0 16.3 17.3 17.5 17.9 18.0 18.0 [31] 20.6 > Pull(trees, Girth < 11) # shipunov Girth Height Volume 1 8.3 70 10.3 2 8.6 65 10.3 3 8.8 63 10.2 4 10.5 72 16.4 5 10.7 81 18.8 6 10.8 83 19.7 > # \subsection{The Case-book of Advanced \R user} > > mode(trees) [1] "list" > trees2 <- trees[, 2:3] > mode(trees2) [1] "list" > trees1 <- trees2[, 2] > mode(trees1) [1] "numeric" > trees.new <- trees[trees[, 1] < 0, ] > str(trees.new) 'data.frame': 0 obs. of 3 variables: $ Girth : num $ Height: num $ Volume: num > 1:nrow(trees.new) [1] 1 0 > seq_len(nrow(trees.new)) integer(0) > aa <- c(1, NA, 3) > aa[aa != 1] # bad idea [1] NA 3 > aa[aa != 1 & !is.na(aa)] # good idea [1] 3 > trees[2, 2] <- NA > trees[trees$Height == 80, "Girth"] # bad idea [1] NA 11.1 14.2 17.9 18.0 18.0 > trees[which(trees$Height == 80), "Girth"] # good idea [1] 11.1 14.2 17.9 18.0 18.0 > data(trees) # restores changed data > prop.test(3, 23) 1-sample proportions test with continuity correction data: 3 out of 23, null probability 0.5 X-squared = 11.13, df = 1, p-value = 0.0008492 alternative hypothesis: true p is not equal to 0.5 95 percent confidence interval: 0.03430797 0.34666883 sample estimates: p 0.1304348 > pval <- prop.test(3, 23)$pvalue > pval NULL > pval <- prop.test(3, 23)$p.value # correct identity! 
> pval [1] 0.0008492268 > aa <- sqrt(2) > aa * aa == 2 [1] FALSE > aa * aa - 2 [1] 4.440892e-16 > all.equal(aa * aa, 2) [1] TRUE > all.equal(aa * aa - 2, 0) [1] TRUE > Sum <- function(a, b) a + b > a <- 3 > Sum(a=5, b=4) [1] 9 > a [1] 3 > Sum(a <- 5, b=4) [1] 9 > a [1] 5 > c(factor(LETTERS[1:3]), factor(letters[1:3])) [1] 1 2 3 1 2 3 > c.factor <- function(..., recursive=TRUE) + unlist(list(...), recursive=recursive) > c(factor(LETTERS[1:3]), factor(letters[1:3])) [1] A B C a b c Levels: A B C a b c > ifelse("a" != "b", 1:10, c(0, "A", "B")) # the first surprise [1] 1 > ifelse(c(TRUE, FALSE, TRUE), 1, 10) # the second surprise [1] 1 10 1 > # \subsection{Good, Bad, and Not-too-bad} > > read.table("http://ashipunov.me/shipunov/open/wilting.txt", + h=TRUE, sep="\t") SPECIES TIME 1 Alnus glutinosa 165 2 Alnus glutinosa 225 3 Alnus glutinosa 265 4 Alnus glutinosa 300 5 Alnus glutinosa 325 6 Alnus incana 70 7 Alnus incana 110 8 Alnus incana 130 9 Alnus incana 195 10 Alnus incana 195 11 Alnus incana 210 12 Alnus incana 210 13 Alnus incana 290 14 Alnus incana 450 15 Alnus incana 470 16 Betula pendula 160 17 Betula pendula 270 18 Betula pendula 470 19 Betula pendula 510 20 Betula pendula 580 21 Betula pendula 610 22 Padus avium 120 23 Padus avium 160 24 Padus avium 185 25 Padus avium 200 26 Padus avium 220 27 Padus avium 225 28 Padus avium 255 29 Padus avium 270 30 Padus avium 280 31 Salix acutifolia 70 32 Salix acutifolia 170 33 Salix acutifolia 185 34 Salix acutifolia 195 35 Salix acutifolia 225 36 Salix acutifolia 270 37 Salix acutifolia 300 38 Salix acutifolia 325 39 Salix acutifolia 360 40 Salix acutifolia 500 41 Salix caprea 85 42 Salix caprea 145 43 Salix caprea 165 44 Salix caprea 200 45 Salix caprea 245 46 Salix caprea 256 47 Salix caprea 265 48 Salix caprea 325 49 Salix caprea 345 50 Salix caprea 360 51 Salix caprea 365 52 Salix caprea 485 53 Salix caprea 595 > # \section{Answers to exercises} > > pdf(file="pics/use_r_script.pdf", width=12); oldpar <- par(mar=c(0, 
0, 0, 0)) > Gridmoon(Nightsky=FALSE, Moon=FALSE, Stars=FALSE, # shipunov + Hillcol="forestgreen", Text="Use R script!", Textcol="yellow", + Textpos=c(0.35, 0.85), Textsize=96) > par(oldpar); dev.off() pdf 2 > # \chapter{\R fragments} > > # \section{\R and databases} > > head(eq_l) # locations N.POP WHERE SPECIES 1 1 Tver arvense 2 2 Tver arvense 3 3 Tver arvense 4 4 Tver arvense 5 5 Tver pratense 6 6 Tver palustre > head(eq_s) # measurements N.POP DL.R DIA.ST N.REB N.ZUB DL.OSN.Z DL.TR.V DL.BAZ DL.PER 1 1 424 2.3 13 12 2.0 5 3.0 25 2 1 339 2.0 11 12 1.0 4 2.5 13 3 1 321 2.5 15 14 2.0 5 2.3 13 4 1 509 3.0 14 14 1.5 5 2.2 23 5 1 462 2.5 12 13 1.1 4 2.1 12 6 1 350 1.8 9 9 1.1 4 2.0 15 > SP.NEW <- Recode(eq_s$N.POP, eq_l$N.POP, eq_l$SPECIES) > eq8 <- cbind(SPECIES=SP.NEW, eq_s[, -1]) > head(eq8) SPECIES DL.R DIA.ST N.REB N.ZUB DL.OSN.Z DL.TR.V DL.BAZ DL.PER 1 arvense 424 2.3 13 12 2.0 5 3.0 25 2 arvense 339 2.0 11 12 1.0 4 2.5 13 3 arvense 321 2.5 15 14 2.0 5 2.3 13 4 arvense 509 3.0 14 14 1.5 5 2.2 23 5 arvense 462 2.5 12 13 1.1 4 2.1 12 6 arvense 350 1.8 9 9 1.1 4 2.0 15 > mm <- c("Plantago major", "Littorella uniflora") > do.call(rbind, strsplit(mm, split=" ")) # exactly one space [,1] [,2] [1,] "Plantago" "major" [2,] "Littorella" "uniflora" > read.table(textConnection(mm), sep=" ") V1 V2 1 Plantago major 2 Littorella uniflora > nn <- c("a b", "c d\ne f") > read.table(textConnection(nn), sep=" ") V1 V2 1 a b 2 c d 3 e f > m.o [1] L S XL XXL S M L Levels: S < M < L < XL < XXL > model.matrix( ~ m.o - 1, data=data.frame(m.o)) m.oS m.oM m.oL m.oXL m.oXXL 1 0 0 1 0 0 2 1 0 0 0 0 3 0 0 0 1 0 4 0 0 0 0 1 5 1 0 0 0 0 6 0 1 0 0 0 7 0 0 1 0 0 attr(,"assign") [1] 1 1 1 1 1 attr(,"contrasts") attr(,"contrasts")$m.o [1] "contr.poly" > Tobin(m.o, convert.names=FALSE) # shipunov S M L XL XXL [1,] 0 0 1 0 0 [2,] 1 0 0 0 0 [3,] 0 0 0 1 0 [4,] 0 0 0 0 1 [5,] 1 0 0 0 0 [6,] 0 1 0 0 0 [7,] 0 0 1 0 0 > # \section{\R and time} > > dates.df <- 
data.frame(dates=c("2011-01-01","2011-01-02", + "2011-01-03","2011-01-04","2011-01-05")) > str(dates.df$dates) chr [1:5] "2011-01-01" "2011-01-02" "2011-01-03" "2011-01-04" "2011-01-05" > dates.1 <- as.Date(dates.df$dates, "%Y-%m-%d") > str(dates.1) Date[1:5], format: "2011-01-01" "2011-01-02" "2011-01-03" "2011-01-04" "2011-01-05" > d <- c(20130708, 19990203, 17650101) > as.Date(as.character(d), "%Y%m%d") [1] "2013-07-08" "1999-02-03" "1765-01-01" > ts(1:10, # sequence + frequency = 4, # by quartile + start = c(1959, 2)) # start in the second quartile 1959 Qtr1 Qtr2 Qtr3 Qtr4 1959 1 2 3 1960 4 5 6 7 1961 8 9 10 > z <- ts(matrix(rnorm(30), 10, 3), + start=c(1961, 1), # start in January 1961 + frequency=12) # by months > class(z) [1] "mts" "ts" "matrix" > pdf(file="pics/32340.pdf"); oldpar <- par(mar=c(2, 2, 0, 1)) > plot(z, + plot.type="single", # place all series on one plot + lty=1:3) > par(oldpar); dev.off() pdf 2 > leaf <- read.table("data/sundew.txt", h=TRUE, sep=";") > str(leaf) 'data.frame': 80 obs. of 2 variables: $ WET : int 2 1 1 2 1 1 1 1 1 1 ... $ SHAPE: int 1 1 1 2 2 2 2 2 2 2 ... > summary(leaf) WET SHAPE Min. :1.000 Min. :1.0 1st Qu.:1.000 1st Qu.:1.0 Median :1.000 Median :2.0 Mean :1.325 Mean :1.7 3rd Qu.:2.000 3rd Qu.:2.0 Max. :2.000 Max. :2.0 > shape <- ts(leaf$SHAPE, frequency=36) > str(shape) Time-Series [1:80] from 1 to 3.19: 1 1 1 2 2 2 2 2 2 2 ... 
> pdf(file="pics/33160.pdf"); oldpar <- par(mar=c(4, 4, 4, 1)) > (acf(shape, main=expression(italic("Drosera")*" leaf"))) Autocorrelations of series ‘shape’, by lag 0.0000 0.0278 0.0556 0.0833 0.1111 0.1389 0.1667 0.1944 0.2222 0.2500 0.2778 1.000 0.614 0.287 0.079 0.074 0.068 -0.038 -0.085 -0.132 -0.137 -0.024 0.3056 0.3333 0.3611 0.3889 0.4167 0.4444 0.4722 0.5000 0.5278 0.030 0.025 -0.082 -0.087 0.027 0.021 -0.043 -0.090 -0.137 > par(oldpar); dev.off() pdf 2 > pdf(file="pics/33310_to_crop.pdf") > plot(stl(shape, s.window="periodic")$time.series, main="") > dev.off() pdf 2 > # \section{\R and bootstrap} > > library(boot) Attaching package: ‘boot’ The following object is masked from ‘package:robustbase’: salinity The following object is masked from ‘package:lattice’: melanoma > ## Statistic to be bootstrapped: > > ave.b <- function (data, indices) + { + d <- data[indices] + return(mean(d)) + } > (result.b <- boot(data=trees$Height, statistic=ave.b, R=100)) ORDINARY NONPARAMETRIC BOOTSTRAP Call: boot(data = trees$Height, statistic = ave.b, R = 100) Bootstrap Statistics : original bias std. 
error t1* 76 0.0983871 1.069863 > pdf(file="pics/06074.pdf"); oldpar <- par(mar=c(4, 4, 1, 1)) > plot(result.b) > par(oldpar); dev.off() pdf 2 > boot.ci(result.b, type="bca") BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS Based on 100 bootstrap replicates CALL : boot.ci(boot.out = result.b, type = "bca") Intervals : Level BCa 95% (73.91, 77.76 ) Calculations and Intervals on Original Scale Some BCa intervals may be unstable > spur <- scan("data/spur.txt") Read 1511 items > library(bootstrap) Attaching package: ‘bootstrap’ The following object is masked from ‘package:mclust’: diabetes > result.b2 <- bootstrap(x=spur, 100, function(x) mean(x)) > ## Median of bootstrapped values: > > median(result.b2$thetastar) [1] 7.519159 > ## Confidence interval: > > quantile(result.b2$thetastar, probs=c(0.025, 0.975)) 2.5% 97.5% 7.444077 7.610356 > result.j <- jackknife(x=spur, function(x) mean(x)) > ## Median of jackknifed values: > > median(result.j$jack.values) [1] 7.516424 > ## Standard error: > > result.j$jack.se [1] 0.04258603 > ## Confidence interval: > > quantile(result.j$jack.values, probs=c(0.025, 0.975)) 2.5% 97.5% 7.513775 7.517748 > boot <- 100 > tt <- matrix(ncol=2, nrow=boot) > for (n in 1:boot) + + { + + spur.sample <- sample(spur, length(spur), replace=TRUE) + + tt[n, 1] <- mean(spur.sample) + + tt[n, 2] <- sd(spur.sample) + + } > (result <- data.frame(spur.mean=mean(spur), spur.sd=sd(spur), + boot.mean=mean(tt[, 1]), boot.sd=mean(tt[, 2]))) spur.mean spur.sd boot.mean boot.sd 1 7.516082 1.655386 7.517484 1.663059 > dact <- scan("data/dact.txt") Read 48 items > quantile(dact, c(0.025, 0.5, 0.975)) 2.5% 50% 97.5% 3.700 33.500 104.825 > apply(replicate(100, quantile(sample(dact, length(dact), + replace=TRUE), c(0.025, 0.5, 0.975))), 1, mean) 2.5% 50% 97.5% 4.9700 37.6600 102.0667 > sleep.K.rep <- replicate(100, K(extra ~ group, + data=sleep[sample(1:nrow(sleep), replace=TRUE), ])) > quantile(sleep.K.rep, c(0.025, 0.975)) 2.5% 97.5% 0.00304031 1.83676444 > library(coin) 
Loading required package: survival Attaching package: ‘survival’ The following object is masked from ‘package:boot’: aml The following object is masked from ‘package:robustbase’: heart Attaching package: ‘coin’ The following objects are masked from ‘package:arules’: size, support The following object is masked from ‘package:wavethresh’: support The following object is masked from ‘package:kernlab’: size > grades$V2 <- as.factor(grades$V2) > wilcox_test(V1 ~ V2, data=subset(grades, V2 %in% c("A1", "B1")), + conf.int=TRUE) Asymptotic Wilcoxon-Mann-Whitney Test data: V1 by V2 (A1, B1) Z = -1.9938, p-value = 0.04618 alternative hypothesis: true mu is not equal to 0 95 percent confidence interval: -1.000000e+00 -4.297215e-09 sample estimates: difference in location -5.132013e-09 > # \section{\R and shape} > > library(geomorph) Loading required package: RRPP Loading required package: rgl Attaching package: ‘rgl’ The following object is masked from ‘package:plotrix’: mtext3d > TangentSpace2 <- function(A) + { + x <- two.d.array(A) + pc.res <- prcomp(x) + pcdata <- pc.res$x + list(array=x, pc.summary=summary(pc.res), pc.scores=pcdata) + } > am <- read.table("data/bigaln.txt", sep=";", head=TRUE) > ag <- readland.tps("data/bigaln.tps", specID="imageID") No curves detected; all points appear to be fixed landmarks. 
Warning: not all specimens have scale adjustment (perhaps because they are already scaled); no rescaling will be performed in these cases > gpa.ag <- gpagen(ag) | | | 0% | |============== | 20% | |============================ | 40% | |========================================== | 60% | |======================================================================| 100% > ta.ag <- TangentSpace2(gpa.ag$coords) > screeplot(ta.ag$pc.summary) # importance of principal components > pca.ag <- ta.ag$pc.summary$x > pdf(file="pics/10668.pdf"); oldpar <- par(mar=c(4, 4, 0, 1)) > pca.ag.ids <- as.numeric(gsub(".png", "", row.names(pca.ag))) > branch <- cut(am$P.1, 3, labels=c("lower", "middle", "top")) > b.code <- as.numeric(Recode(pca.ag.ids, am$PIC, branch, + char=FALSE)) # shipunov > plot(pca.ag[, 1:2], xlab="PC1", ylab="PC2", pch=19, col=b.code) > legend("topright", legend=paste(levels(branch), "branches"), + pch=19, col=1:3) > par(oldpar); dev.off() pdf 2 > c.lower <- mshape(gpa.ag$coords[, , b.code == 1]) > c.top <- mshape(gpa.ag$coords[, , b.code == 3]) > all.mean <- mshape(gpa.ag$coords) > ag.links <- matrix(c(1, rep(c(2:7, 12:8), each=2), 1), + ncol=2, byrow=TRUE) > pdf(file="pics/10667_to_crop.pdf") > old.par <- par(mfrow=c(1, 2)) > GP <- gridPar(grid.col="grey", + tar.link.col="darkseagreen", tar.pt.bg=0) > plotRefToTarget(c.lower, all.mean, links=ag.links, gridPars=GP) > title(main="lower branches", line=-5, cex=0.8) > plotRefToTarget(c.top, all.mean, links=ag.links, gridPars=GP) > title(main="top branches", line=-5, cex=0.8) > par(old.par) > dev.off() pdf 2 > # \section{\R and Bayes} > > chickwts <- chickwts[chickwts$feed %in% c("horsebean", "linseed"), ] > chickwts$feed <- factor(chickwts$feed) > plot(weight ~ feed, data=chickwts, main="Chick weights") > t.test(weight ~ feed, data=chickwts, var.eq=TRUE) Two Sample t-test data: weight by feed t = -2.934, df = 20, p-value = 0.008205 alternative hypothesis: true difference in means is not equal to 0 95 percent confidence 
interval: -100.17618 -16.92382 sample estimates: mean in group horsebean mean in group linseed 160.20 218.75 > library(BayesFactor) Loading required package: coda Attaching package: ‘coda’ The following object is masked from ‘package:kernlab’: nvar ************ Welcome to BayesFactor 0.9.12-4.2. If you have questions, please contact Richard Morey (richarddmorey@gmail.com). Type BFManual() to open the manual. ************ Attaching package: ‘BayesFactor’ The following object is masked from ‘package:modeltools’: posterior > bf <- ttestBF(formula = weight ~ feed, data=chickwts) > bf Bayes factor analysis -------------- [1] Alt., r=0.707 : 5.975741 ±0% Against denominator: Null, mu1-mu2 = 0 --- Bayes factor type: BFindepSample, JZS > # \section{\R, DNA and evolution} > > # \section{\R and reporting} > > # \section{\R without graphics} > > library(txtplot) > with(mtcars, txtplot(cyl, mpg)) 35 +--+------------+-------------+-------------+------------+--+ | * | | | 30 + * + | * | 25 + * + | * | | * * | 20 + * + | * * | | * | 15 + * + | * | | * | 10 +--+------------+-------------+-------------+------------+--+ 4 5 6 7 8 > # \section{Answers to exercises} > > wet <- ts(leaf$WET, frequency=36) > str(wet) Time-Series [1:80] from 1 to 3.19: 2 1 1 2 1 1 1 1 1 1 ... > acf(wet) > plot(stl(wet, s.window="periodic")$time.series) > cu <- read.table( + "http://ashipunov.me/shipunov/open/cuscuta.txt", + h=TRUE, sep="\t") > str(cu) 'data.frame': 96 obs. of 3 variables: $ HOST : chr "Alchemilla" "Alchemilla" "Alchemilla" "Alchemilla" ... $ DEGREE: int 3 2 2 2 1 2 1 0 0 1 ... $ WHERE : int 3 3 3 3 3 1 1 0 0 1 ... 
> cu2 <- cu[cu$HOST %in% c("Alchemilla","Knautia"), ] > cu2$HOST <- factor(cu2$HOST) > cu2.s <- split(cu2$DEGREE, cu2$HOST) > boxplot(cu2.s) > sapply(cu2.s, median) Alchemilla Knautia 2 0 > cliff.delta(cu2.s$Alchemilla, cu2.s$Knautia) Cliff's Delta delta estimate: 0.5185185 (large) 95 percent confidence interval: lower upper -0.1043896 0.8492326 > wilcox.test(cu2.s$Alchemilla, cu2.s$Knautia) Warning in wilcox.test.default(cu2.s$Alchemilla, cu2.s$Knautia) : cannot compute exact p-value with ties Wilcoxon rank sum test with continuity correction data: cu2.s$Alchemilla and cu2.s$Knautia W = 41, p-value = 0.09256 alternative hypothesis: true location shift is not equal to 0 > pdf(file="pics/75380.pdf"); oldpar <- par(mar=c(2, 4, 2, 1)) > ansari.test(cu2.s$Alchemilla, cu2.s$Knautia) Warning in ansari.test.default(cu2.s$Alchemilla, cu2.s$Knautia) : cannot compute exact p-value with ties Ansari-Bradley test data: cu2.s$Alchemilla and cu2.s$Knautia AB = 40, p-value = 0.7127 alternative hypothesis: true ratio of scales is not equal to 1 > library(beeswarm) > la <- layout(matrix(c(1, 3, 2, 3), ncol=2, byrow=TRUE)) > for (i in 1:2) hist(cu2.s[[i]], main=names(cu2.s)[i], + xlab="", xlim=range(cu2.s)) > bxplot(cu2.s) ; beeswarm(cu2.s, cex=1.2, add=TRUE) > par(oldpar); dev.off() pdf 2 > Rro.test(cu2.s$Alchemilla, cu2.s$Knautia) # shipunov z p.value 1.9913639 0.0464409 > library(boot) > meddif.b <- function (data, ind) { d <- data[ind]; + median(d[cu2$HOST == "Alchemilla"]) - median( + d[cu2$HOST == "Knautia"]) } > meddif.boot <- boot(data=cu2$DEGREE, statistic=meddif.b, + strata=cu2$HOST, R=999) > boot.ci(meddif.boot, type="bca") BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS Based on 999 bootstrap replicates CALL : boot.ci(boot.out = meddif.boot, type = "bca") Intervals : Level BCa 95% ( 0, 2 ) Calculations and Intervals on Original Scale Some BCa intervals may be unstable > library(coin) > cu$HOST <- as.factor(cu$HOST) > kruskal_test(DEGREE ~ HOST, data=cu, 
distribution=approximate()) Approximative Kruskal-Wallis Test data: DEGREE by HOST (Alchemilla, Briza, Carex flava, Cirsium, Dactylis, Equisetum, Erodium, Galium, Hieracium, Hypericum, Knautia, Leontodon, Phleum, Plantago, Potentilla, Ranunculus acris, Stellaria, Trifolium, Vicia cracca) chi-squared = 24.063, p-value = 0.1175 > pairwise.Rro.test(cu$DEGREE, cu$HOST) # shipunov Pairwise comparisons using Robust rank order test data: cu$DEGREE and cu$HOST Alchemilla Briza Carex flava Cirsium Dactylis Equisetum Briza 0.0066 - - - - - Carex flava 0.3817 0.9310 - - - - Cirsium 0.8392 0.1680 0.3164 - - - Dactylis 0.3668 0.6628 0.8759 0.4834 - - Equisetum 0.8629 0.7886 0.8629 0.7460 0.9310 - Erodium 0.7485 0.0022 0.7485 0.8282 0.7959 0.8986 Galium 0.9427 0.0615 0.5541 0.8675 0.6320 0.8629 Hieracium 0.8715 0.0615 0.6628 0.7859 0.6780 0.8986 Hypericum 0.7859 < 2e-16 0.2910 0.9276 0.0269 0.8078 Knautia 0.2647 0.8629 0.9427 0.3164 0.8769 0.8715 Leontodon 0.9500 0.0005 0.3089 0.8629 0.3089 0.8078 Phleum 0.0411 0.9500 0.9310 0.2900 0.7886 0.8501 Plantago 0.3164 0.8629 0.9310 0.6113 0.9310 0.9310 Potentilla 0.3164 0.8629 0.9310 0.6113 0.9310 0.9310 Ranunculus acris 0.8629 8.4e-08 0.2418 0.9310 0.2910 0.6628 Stellaria 0.1248 0.9276 1.0000 0.3629 0.8629 0.8715 Trifolium 0.9276 0.1071 0.6171 0.8078 0.6780 0.8715 Vicia cracca 0.8629 0.7386 0.8629 0.8078 0.9310 0.9427 Erodium Galium Hieracium Hypericum Knautia Leontodon Phleum Briza - - - - - - - Carex flava - - - - - - - Cirsium - - - - - - - Dactylis - - - - - - - Equisetum - - - - - - - Erodium - - - - - - - Galium 0.8715 - - - - - - Hieracium 0.8715 0.9310 - - - - - Hypericum < 2e-16 0.8078 0.4834 - - - - Knautia 0.6113 0.3951 0.4864 0.0269 - - - Leontodon 0.7077 0.9310 0.8629 0.7859 0.1605 - - Phleum 0.1383 0.1844 0.1844 < 2e-16 0.9049 0.0112 - Plantago 0.7739 0.6320 0.6320 < 2e-16 0.9310 0.2910 0.8715 Potentilla 0.7739 0.6320 0.6320 < 2e-16 0.9310 0.2910 0.8715 Ranunculus acris 0.7739 0.8629 0.8078 1.0000 0.1093 0.8629 0.0014 
Stellaria 0.3668 0.3164 0.3164 < 2e-16 0.9385 0.0615 0.9310 Trifolium 0.9090 0.9310 0.9530 0.7393 0.4834 0.8769 0.2514 Vicia cracca 0.8715 0.8629 0.8715 0.7959 0.8629 0.8078 0.8078 Plantago Potentilla Ranunculus acris Stellaria Trifolium Briza - - - - - Carex flava - - - - - Cirsium - - - - - Dactylis - - - - - Equisetum - - - - - Erodium - - - - - Galium - - - - - Hieracium - - - - - Hypericum - - - - - Knautia - - - - - Leontodon - - - - - Phleum - - - - - Plantago - - - - - Potentilla 1.0000 - - - - Ranunculus acris 0.3089 0.3089 - - - Stellaria 0.9310 0.9310 0.0416 - - Trifolium 0.6732 0.6732 0.8078 0.3668 - Vicia cracca 0.9310 0.9310 0.6780 0.8629 0.8715 P value adjustment method: BH > # \chapter{Most essential \R commands} > > # \chapter{The short \R glossary} > > pdf(file="pics/76680.pdf"); oldpar <- par(mar=c(2, 2, 0, 1)) > plot(density(rnorm(10^5)), main="", xlab="", ylab="") > par(oldpar); dev.off() pdf 2 > pdf(file="pics/10643_to_crop.pdf") > library(plotrix) > plot(c(-1, 1), c(-1, 1), type="n", xlab="", ylab="", axes=FALSE) > for(n in seq(0.1, 0.9, 0.1)) draw.circle(0, 0, n) > set.seed(11); x <- rnorm(100, sd=.28); y <- rnorm(100, sd=.28) > points(x, y, pch=19) > dev.off() pdf 2 > # \chapter{References}