> # \chapterX{Foreword} > > library(shipunov) package 'shipunov', version 1.11 > # \chapter{The data} > > # \section{Origin of the data} > > # \section{Population and sample} > > # \section{How to obtain the data} > > # \section{What to find in the data} > > # \subsection{Why do we need the data analysis} > > # \subsection{What data analysis can do} > > # \subsection{What data analysis cannot do} > > # \section{Answers to exercises} > > # \chapter{How to process the data} > > # \section{General purpose software} > > # \section{Statistical software} > > # \subsection{Graphical systems} > > # \subsection{Statistical environments} > > # \section{The very short history of the \S and \R} > > # \section{Use, advantages and disadvantages of the \R} > > # \section{How to download and install \R} > > # \section{How to start with \R} > > # \subsection{Launching \R} > > # \subsection{First steps} > > round(1.5, digits=0) [1] 2 > round(1.5, d=0) [1] 2 > round(d=0, 1.5) [1] 2 > round(1.5, 0) [1] 2 > round(1.5,) [1] 2 > round(1.5) [1] 2 > round(0, 1.5) [1] 0 > args(round) function (x, digits = 0) NULL > args(q) function (save = "default", status = 0, runLast = TRUE) NULL > # \subsection{How to type} > > round (1.5, digits=0) [1] 2 > round(1.5, digits=0) [1] 2 > round ( 1.5 , digits = 0 ) [1] 2 > # \subsection{Overgrown calculator} > > 2+2 [1] 4 > 2+.2 [1] 2.2 > log(-1) Warning in log(-1) : NaNs produced [1] NaN > 100/0 [1] Inf > # \subsection{How to play with \R} > > bb <- matrix(1:9, ncol=3) > bb [,1] [,2] [,3] [1,] 1 4 7 [2,] 2 5 8 [3,] 3 6 9 > coordinates <- expand.grid(1:100, 1:100) > sampled.rows <- sample(1:nrow(coordinates), 100) > coordinates[sampled.rows, ] Var1 Var2 7672 72 77 8019 19 81 6675 75 67 69 69 1 6479 79 65 649 49 7 6514 14 66 4235 35 43 7554 54 76 8007 7 81 7108 8 72 8185 85 82 3377 77 34 6348 48 64 6024 24 61 2951 51 30 4097 97 41 6506 6 66 2518 18 26 4725 25 48 4800 100 48 5970 70 60 9438 38 95 415 15 5 6299 99 63 2329 29 24 1259 59 13 9657 57 97 2366 66 
24 5867 67 59 7934 34 80 6588 88 66 8569 69 86 1570 70 16 7815 15 79 6942 42 70 8976 76 90 7280 80 73 59 59 1 5311 11 54 2035 35 21 6195 95 62 1517 17 16 7634 34 77 5094 94 51 1013 13 11 8013 13 81 2699 99 27 9685 85 97 8090 90 81 9659 59 97 1175 75 12 1075 75 11 7430 30 75 5173 73 52 7599 99 76 904 4 10 394 94 4 9326 26 94 3180 80 32 2577 77 26 1805 5 19 7663 63 77 6754 54 68 2983 83 30 2701 1 28 3968 68 40 9276 76 93 2069 69 21 9839 39 99 6521 21 66 231 31 3 7661 61 77 4463 63 45 2764 64 28 2264 64 23 9226 26 93 6603 3 67 7780 80 78 1849 49 19 4185 85 42 3708 8 38 7467 67 75 657 57 7 2994 94 30 5685 85 57 2650 50 27 3032 32 31 5015 15 51 4251 51 43 6173 73 62 2834 34 29 5460 60 55 3881 81 39 9063 63 91 7997 97 80 5913 13 60 1449 49 15 5845 45 59 3951 51 40 > dice <- as.vector(outer(1:6, 1:6, paste)) > sample(dice, 4, replace=TRUE) [1] "4 3" "1 1" "1 5" "6 1" > sample(dice, 5, replace=TRUE) [1] "6 6" "2 4" "4 5" "5 1" "5 5" > cards <- paste(rep(c(6:10,"V","D","K","T"), 4), + c("Tr","Bu","Ch","Pi")) > sample(cards, 6) [1] "T Bu" "K Ch" "10 Tr" "K Pi" "V Ch" "D Pi" > sample(cards, 6) [1] "D Bu" "10 Tr" "T Ch" "10 Ch" "T Bu" "9 Pi" > # \section{\R and data} > > # \subsection{How to enter the data from within \R} > > c(1, 2, 3, 4, 5) [1] 1 2 3 4 5 > aa <- c(2, 3, 4, 5, 6, 7, 8) > aa [1] 2 3 4 5 6 7 8 > (aa <- c(1, 2, 3, 4, 5, 6, 7, 8, 9)) [1] 1 2 3 4 5 6 7 8 9 > aa [1] 1 2 3 4 5 6 7 8 9 > rep(1, 5) [1] 1 1 1 1 1 > rep(1:5, each=3) [1] 1 1 1 2 2 2 3 3 3 4 4 4 5 5 5 > seq(1, 5) [1] 1 2 3 4 5 > 1:5 [1] 1 2 3 4 5 > # \subsection{How to name your objects} > > # \subsection{How to load the text data} > > mydata <- read.table("data/mydata.txt", sep=";", head=TRUE) > str(mydata) 'data.frame': 3 obs. 
of 3 variables: $ a: int 1 4 7 $ b: int 2 5 8 $ c: int 3 6 9 > head(mydata) a b c 1 1 2 3 2 4 5 6 3 7 8 9 > # \subsection{How to load data from Internet} > > # \subsection{How to use \texttt{read.table()} properly} > > read.table("data/mydata1.txt", head=TRUE) a b c one 1 2 3 two 4 5 6 three 7 8 9 > read.table("data/mydata2.txt", sep="\t", quote="", head=TRUE) a b c one 1 2 3 two 4 5 6 three o'clock 7 8 9 > read.table("data/mydata3.txt", dec=",", se=";", h=T) a b c 1 1 2 3.5 2 4 5 6.0 3 7 8 9.0 > # \subsection{How to load binary data} > > xx <- "apple" > save(xx, file="xx.rd") # Save object "xx" > exists("xx") [1] TRUE > rm(xx) > exists("xx") [1] FALSE > dir() [1] "000todo" "data" [3] "Makefile" "old.zip" [5] "open" "pics" [7] "supp" "visual_statistics.aux" [9] "visual_statistics.log" "visual_statistics.out" [11] "visual_statistics.pdf" "visual_statistics.r" [13] "visual_statistics_rplots.pdf" "visual_statistics_rresults.txt" [15] "visual_statistics.tex" "visual_statistics.toc" [17] "xx.rd" > load("xx.rd") # Load object "xx" > xx [1] "apple" > # \subsection{How to load data from clipboard} > > # \subsection{How to edit data in \R} > > # \subsection{How to save the results} > > write.table(file="trees.txt", trees, row.names=FALSE, sep="\t", + quote=FALSE) > # \subsection{History and scripts} > > # \section{\R graphics} > > # \subsection{Graphical systems} > > pdf(file="pics/05430.pdf"); oldpar <- par(mar=c(4, 4, 2, 1)) > plot(1:20, main="Title") > legend("topleft", pch=1, legend="My wonderful points") > par(oldpar); dev.off() null device 1 > plot(1:20, type="n") > mtext("Title", line=1.5, font=2) > points(1:20) > legend("topleft", pch=1, legend="My wonderful points") > pdf(file="pics/05610.pdf"); oldpar <- par(mar=c(4, 4, 2, 1)) > plot(cars) > title(main="Cars from 1920s") > par(oldpar); dev.off() pdf 2 > plot(1:20, pch=3, col=6, main="Title") > pdf(file="pics/07420.pdf"); oldpar <- par(mar=c(0, 0, 0, 0)) > library(lattice) > xyplot(1:20 ~ 1:20, main="title") > 
par(oldpar); dev.off() pdf 2 > pdf(file="pics/05960.pdf"); oldpar <- par(mar=c(2, 2, 1, 1)) > library(ggplot2) > qplot(1:20, 1:20, main="title") > par(oldpar); dev.off() pdf 2 > # \subsection{Graphical devices} > > png(file="01_20.png", bg="transparent") > plot(1:20) > text(10, 20, "a") > dev.off() pdf 2 > pdf(file="01_20.pdf", width=8) > plot(1:20) > text(10, 20, "a") > dev.off() pdf 2 > # \subsection{Graphical options} > > pdf(file="pics/06060.pdf"); oldpar <- par(mar=c(2, 2, 0, 1)) > old.par <- par(mfrow=c(2, 1)) > hist(cars$speed, main="") > hist(cars$dist, main="") > par(old.par) > par(oldpar); dev.off() pdf 2 > hist(cars$speed, main="", xaxt="n", xlab="") > old.par <- par(new=TRUE) > hist(cars$dist, main="", axes=FALSE, xlab="", lty=2) > par(old.par) > # \subsection{Interactive graphics} > > # \section{Answers to exercises} > > eggs <- read.table("data/eggs.txt") > str(eggs) 'data.frame': 555 obs. of 3 variables: $ V1: int 51 48 44 48 46 43 48 46 49 45 ... $ V2: int 34 33 33 34 32 32 35 32 34 33 ... $ V3: chr "221" "221" "221" "221" ... 
> head(eggs) V1 V2 V3 1 51 34 221 2 48 33 221 3 44 33 221 4 48 34 221 5 46 32 221 6 43 32 221 > pdf(file="pics/09010.pdf"); oldpar <- par(mar=c(4, 4, 0, 1)) > plot(V2 ~ V1, data=eggs, xlab="Length, mm", ylab="Width, mm", + pch=21, bg="grey") > abline(line(eggs$V1, eggs$V2), lty=2, lwd=1.5) > lines(loess.smooth(eggs$V1, eggs$V2), col=2, lty=2, lwd=1.5) > legend("topleft", lty=2, col=1:2, lwd=1.5, legend=c( + "Tukey's median-median line", "LOESS curve")) > par(oldpar); dev.off() pdf 2 > pdf(file="pics/10320.pdf"); oldpar <- par(mar=c(2, 2, 2, 1)) > plot(1:20, col="green", xlab="", ylab="") > points(20:1, pch=0) > par(oldpar); dev.off() pdf 2 > # \chapter{Types of data} > > # \section{Degrees, hours and kilometers: measurement data} > > rnorm(10) [1] -0.856583317 -0.651605803 -0.007427443 -0.039881843 1.053649210 [6] 0.881785566 0.009527859 0.785361902 1.493052534 1.291616044 > runif(10) [1] 0.96716324 0.05713539 0.30067351 0.83834945 0.76931100 0.19325560 [7] 0.76148651 0.93578418 0.11441590 0.02459024 > pdf(file="pics/10490.pdf") > old.par <- par(mfrow=c(2, 1)) > hist(rnorm(100), main="Normal data") > hist(runif(100), main="Non-normal data") > par(old.par) > par(oldpar); dev.off() pdf 2 > pdf(file="pics/parametric_to_crop.pdf") > library(plotrix) > plot(c(-1, 1), c(-1, 1), type="n", xlab="", ylab="", axes=FALSE) > draw.circle(-.2, 0, .4) > draw.circle(.1, 0, .9) > text(-.2, 0, "parametric", cex=1.5) > text(.1, 0.6, "nonparametric", cex=1.5) > dev.off() pdf 2 > x <- c(174, 162, 188, 192, 165, 168, 172.5) > str(x) num [1:7] 174 162 188 192 165 ... 
> is.vector(x) [1] TRUE > is.numeric(x) [1] TRUE > sort(x) [1] 162.0 165.0 168.0 172.5 174.0 188.0 192.0 > rev(sort(x)) [1] 192.0 188.0 174.0 172.5 168.0 165.0 162.0 > pdf(file="pics/10247_to_crop.pdf"); oldpar <- par(mar=c(0, 0, 0, 0)) > library(vegan) Loading required package: permute This is vegan 2.5-6 > phanerozoic <- read.table("data/phanerozoic.txt") > with(phanerozoic, linestack(541-V2, labels=paste(V1, V2), cex=1)) > par(oldpar); dev.off() pdf 2 > # \section{Grades and t-shirts: ranked data} > > rr <- c(2, 1, 3, 3, 1, 1, 2) > str(rr) num [1:7] 2 1 3 3 1 1 2 > (hh <- cut(x, 3, labels=c(1:3), ordered_result=TRUE)) [1] 2 1 3 3 1 1 2 Levels: 1 < 2 < 3 > x [1] 174.0 162.0 188.0 192.0 165.0 168.0 172.5 > as.numeric(hh) [1] 2 1 3 3 1 1 2 > # \section{Colors, names and sexes: nominal data} > > # \subsection{Character vectors} > > sex <- c("male", "female", "male", "male", "female", "male", + "male") > is.character(sex) [1] TRUE > is.vector(sex) [1] TRUE > str(sex) chr [1:7] "male" "female" "male" "male" "female" "male" "male" > sex [1] "male" "female" "male" "male" "female" "male" "male" > sex[2:3] [1] "female" "male" > `[`(sex, 2:3) [1] "female" "male" > table(sex) sex female male 2 5 > # \subsection{Factors} > > sex.f <- factor(sex) > sex.f [1] male female male male female male male Levels: female male > pdf(file="pics/08830.pdf"); oldpar <- par(mar=c(2, 2, 1, 1)) > plot(sex.f) > par(oldpar); dev.off() pdf 2 > is.factor(sex.f) [1] TRUE > is.character(sex.f) [1] FALSE > str(sex.f) Factor w/ 2 levels "female","male": 2 1 2 2 1 2 2 > levels(sex.f) [1] "female" "male" > nlevels(sex.f) [1] 2 > as.numeric(sex.f) [1] 2 1 2 2 1 2 2 > w <- c(69, 68, 93, 87, 59, 82, 72) > pdf(file="pics/09290.pdf"); oldpar <- par(mar=c(4, 4, 0, 1)) > plot(x, w, pch=as.numeric(sex.f), col=as.numeric(sex.f), + xlab="Height, cm", ylab="Weight, kg") > legend("topleft", pch=1:2, col=1:2, legend=levels(sex.f)) > par(oldpar); dev.off() pdf 2 > palette() [1] "black" "#DF536B" "#61D04F" "#2297E6" 
"#28E2E5" "#CD0BBC" "#F5C710" [8] "gray62" > (ff <- factor(3:5)) [1] 3 4 5 Levels: 3 4 5 > as.numeric(ff) # incorrect! [1] 1 2 3 > as.numeric(as.character(ff)) # correct! [1] 3 4 5 > sex.f[5:6] [1] female male Levels: female male > sex.f[6:7] [1] male male Levels: female male > droplevels(sex.f[6:7]) [1] male male Levels: male > sex.f[6:7, drop=T] [1] male male Levels: male > factor(as.character(sex.f[6:7])) [1] male male Levels: male > m <- c("L", "S", "XL", "XXL", "S", "M", "L") > m.f <- factor(m) > m.f [1] L S XL XXL S M L Levels: L M S XL XXL > m.o <- ordered(m.f, levels=c("S", "M", "L", "XL", "XXL")) > m.o [1] L S XL XXL S M L Levels: S < M < L < XL < XXL > ls() [1] "aa" "bb" "cards" "coordinates" "dice" [6] "eggs" "ff" "hh" "m" "m.f" [11] "m.o" "mydata" "oldpar" "old.par" "phanerozoic" [16] "rr" "sampled.rows" "sex" "sex.f" "w" [21] "x" "xx" > Ls() # shipunov Name Mode Type Obs Vars Size 1 aa numeric vector 9 1 176 bytes 2 bb numeric matrix 3 3 264 bytes 3 cards character vector 36 1 2.3 Kb 4 coordinates list data.frame 10000 2 94 Kb 5 dice character vector 36 1 2.3 Kb 6 eggs list data.frame 555 3 10.6 Kb 7 ff numeric factor 3 1 648 bytes 8 hh numeric factor 7 1 728 bytes 9 m character vector 7 1 392 bytes 10 m.f numeric factor 7 1 792 bytes 11 m.o numeric factor 7 1 856 bytes 12 mydata list data.frame 3 3 1 Kb 13 oldpar list unknown - - 360 bytes 14 old.par list unknown - - 336 bytes 15 phanerozoic list data.frame 12 2 1.8 Kb 16 rr numeric vector 7 1 112 bytes 17 sampled.rows numeric vector 100 1 448 bytes 18 sex character vector 7 1 224 bytes 19 sex.f numeric factor 7 1 592 bytes 20 w numeric vector 7 1 112 bytes 21 x numeric vector 7 1 112 bytes 22 xx character vector 1 1 112 bytes > # \subsection{Logical vectors and binary data} > > (likes.pizza <- c(T, T, F, F, T, T, F)) [1] TRUE TRUE FALSE FALSE TRUE TRUE FALSE > is.vector(likes.pizza) [1] TRUE > is.factor(likes.pizza) [1] FALSE > is.character(likes.pizza) [1] FALSE > is.logical(likes.pizza) [1] TRUE > 
likes.pizza * 1 [1] 1 1 0 0 1 1 0 > as.logical(c(1, 1, 0)) [1] TRUE TRUE FALSE > as.numeric(likes.pizza) [1] 1 1 0 0 1 1 0 > Tobin(sex, convert.names=FALSE) female male [1,] 0 1 [2,] 1 0 [3,] 0 1 [4,] 0 1 [5,] 1 0 [6,] 0 1 [7,] 0 1 > (is.male <- sex == "male") [1] TRUE FALSE TRUE TRUE FALSE TRUE TRUE > (is.female <- sex == "female") [1] FALSE TRUE FALSE FALSE TRUE FALSE FALSE > x > 170 [1] TRUE FALSE TRUE TRUE FALSE FALSE TRUE > x[x > 170] [1] 174.0 188.0 192.0 172.5 > ((x < 180) | (w <= 70)) & (sex=="female" | m=="S") [1] FALSE TRUE FALSE FALSE TRUE FALSE FALSE > ifelse(sex=="female", "pink", "blue") [1] "blue" "pink" "blue" "blue" "pink" "blue" "blue" > if(sex[1]=="female") { + "pink" + } else { + "blue" + } [1] "blue" > # \section{Fractions, counts and ranks: secondary data} > > sex.t <- table(sex) > round(100*sex.t/sum(sex.t)) sex female male 29 71 > pdf(file="pics/09860.pdf"); oldpar <- par(mar=c(4, 2, 1, 1)) > load("data/com12.rd") > exists("com12") # check if our object is here [1] TRUE > com12.plot <- barplot(com12, names.arg="") > text(com12.plot, par("usr")[3]*2, srt=45, pos=2, + xpd=TRUE, labels=names(com12)) > par(oldpar); dev.off() pdf 2 > pdf(file="pics/10070.pdf"); oldpar <- par(mar=c(2, 2, 3, 1)) > dotchart(com12) > par(oldpar); dev.off() pdf 2 > pdf(file="pics/06105_to_crop.pdf") > com80 <- read.table("data/com80.txt") > library(wordcloud) Loading required package: RColorBrewer > set.seed(5) # freeze random number generator > wordcloud(words=com80[, 1], freq=com80[, 2], + colors=brewer.pal(8, "Dark2")) > dev.off() pdf 2 > set.seed(1); rnorm(1) [1] -0.6264538 > dotchart(log(com80[, 2]), labels=com80[, 1], cex=.6) > x.ranks <- x > names(x.ranks) <- rank(x) > x.ranks 5 1 6 7 2 3 4 174.0 162.0 188.0 192.0 165.0 168.0 172.5 > sort(x.ranks) # easier to spot 1 2 3 4 5 6 7 162.0 165.0 168.0 172.5 174.0 188.0 192.0 > x.ranks2 <- c(x, x[3]) # duplicate the 3rd item > names(x.ranks2) <- rank(x.ranks2) > sort(x.ranks2) 1 2 3 4 5 6.5 6.5 8 162.0 165.0 168.0 
172.5 174.0 188.0 188.0 192.0 > wilcox.test(x.ranks2) Warning in wilcox.test.default(x.ranks2) : cannot compute exact p-value with ties Wilcoxon signed rank test with continuity correction data: x.ranks2 V = 36, p-value = 0.01415 alternative hypothesis: true location is not equal to 0 > # \section{Missing data} > > (hh <- c(8, 10, NA, NA, 8, NA, 8)) [1] 8 10 NA NA 8 NA 8 > mean(hh) [1] NA > mean(hh, na.rm=TRUE) [1] 8.5 > mean(na.omit(hh)) [1] 8.5 > hh.old <- hh > hh.old [1] 8 10 NA NA 8 NA 8 > hh[is.na(hh)] <- mean(hh, na.rm=TRUE) > hh [1] 8.0 10.0 8.5 8.5 8.0 8.5 8.0 > # \section{Outliers, and how to find them} > > m1 <- read.table("data/mydata.txt", sep=";") # wrong! > str(m1) 'data.frame': 4 obs. of 3 variables: $ V1: chr "a" "1" "4" "7" $ V2: chr "b" "2" "5" "8" $ V3: chr "c" "3" "6" "9" > m2 <- read.table("data/mydata.txt", sep=";", h=TRUE) # correct! > str(m2) 'data.frame': 3 obs. of 3 variables: $ a: int 1 4 7 $ b: int 2 5 8 $ c: int 3 6 9 > # \section{Changing data: basics of transformations} > > scale(trees) Girth Height Volume [1,] -1.57685421 -0.9416472 -1.20885469 [2,] -1.48125614 -1.7263533 -1.20885469 [3,] -1.41752409 -2.0402357 -1.21493821 [4,] -0.87580169 -0.6277648 -0.83775985 [5,] -0.81206964 0.7847060 -0.69175532 [6,] -0.78020362 1.0985884 -0.63700362 [7,] -0.71647157 -1.5694121 -0.88642802 [8,] -0.71647157 -0.1569412 -0.72825645 [9,] -0.68460554 0.6277648 -0.46058149 [10,] -0.65273952 -0.1569412 -0.62483658 [11,] -0.62087350 0.4708236 -0.36324513 [12,] -0.58900747 0.0000000 -0.55791784 [13,] -0.58900747 0.0000000 -0.53358375 [14,] -0.49340940 -1.0985884 -0.53966727 [15,] -0.39781133 -0.1569412 -0.67350476 [16,] -0.11101712 -0.3138824 -0.48491557 [17,] -0.11101712 1.4124708 0.22077297 [18,] 0.01644698 1.5694121 -0.16857243 [19,] 0.14391108 -0.7847060 -0.27199230 [20,] 0.17577710 -1.8832945 -0.32066048 [21,] 0.23950915 0.3138824 0.26335763 [22,] 0.30324119 0.6277648 0.09301901 [23,] 0.39883927 -0.3138824 0.37286102 [24,] 0.87682962 -0.6277648 
0.49453146 [25,] 0.97242770 0.1569412 0.75612291 [26,] 1.29108793 0.7847060 1.53481372 [27,] 1.35481998 0.9416472 1.55306429 [28,] 1.48228408 0.6277648 1.71123586 [29,] 1.51415010 0.6277648 1.29755636 [30,] 1.51415010 0.6277648 1.26713875 [31,] 2.34266672 1.7263533 2.84885447 attr(,"scaled:center") Girth Height Volume 13.24839 76.00000 30.17097 attr(,"scaled:scale") Girth Height Volume 3.138139 6.371813 16.437846 > # \subsection{How to tell the kind of data} > > # \section{Inside \R} > > # \subsection{Matrices} > > m <- 1:4 > m [1] 1 2 3 4 > ma <- matrix(m, ncol=2, byrow=TRUE) > ma [,1] [,2] [1,] 1 2 [2,] 3 4 > str(ma) int [1:2, 1:2] 1 3 2 4 > str(m) int [1:4] 1 2 3 4 > mb <- m > mb [1] 1 2 3 4 > attr(mb, "dim") <- c(2, 2) > mb [,1] [,2] [1,] 1 3 [2,] 2 4 > ma[1, 2] [1] 2 > ma[, ] [,1] [,2] [1,] 1 2 [2,] 3 4 > (mi <- matrix(c(1, 1, 2, 2), ncol=2, byrow=TRUE)) [,1] [,2] [1,] 1 1 [2,] 2 2 > ma[mi] [1] 1 4 > (mn <- matrix(c(NA, 1, 2, 2), ncol=2, byrow=TRUE)) [,1] [,2] [1,] NA 1 [2,] 2 2 > is.na(mn) [,1] [,2] [1,] TRUE FALSE [2,] FALSE FALSE > mn[is.na(mn)] <- 0 > mn [,1] [,2] [1,] 0 1 [2,] 2 2 > mean(ma) [1] 2.5 > ma[1, 1] <- "a" > mean(ma) Warning in mean.default(ma) : argument is not numeric or logical: returning NA [1] NA > ma [,1] [,2] [1,] "a" "2" [2,] "3" "4" > m3 <- 1:8 > dim(m3) <- c(2, 2, 2) > m3 , , 1 [,1] [,2] [1,] 1 3 [2,] 2 4 , , 2 [,1] [,2] [1,] 5 7 [2,] 6 8 > # \subsection{Lists} > > l <- list("R", 1:3, TRUE, NA, list("r", 4)) > l [[1]] [1] "R" [[2]] [1] 1 2 3 [[3]] [1] TRUE [[4]] [1] NA [[5]] [[5]][[1]] [1] "r" [[5]][[2]] [1] 4 > fred <- list(name="Fred", wife.name="Mary", no.children=3, + child.ages=c(1, 5, 9)) > fred $name [1] "Fred" $wife.name [1] "Mary" $no.children [1] 3 $child.ages [1] 1 5 9 > names(w) <- c("Rick", "Amanda", "Peter", "Alex", "Kathryn", + "Ben", "George") > w Rick Amanda Peter Alex Kathryn Ben George 69 68 93 87 59 82 72 > row.names(ma) <- c("row1", "row2") > colnames(ma) <- c("col1", "col2") > ma col1 col2 row1 "a" "2" row2 "3" 
"4" > names(w) <- NULL > w [1] 69 68 93 87 59 82 72 > hh[3] [1] 8.5 > ma[2, 1] [1] "3" > l[1] [[1]] [1] "R" > str(l[1]) List of 1 $ : chr "R" > l[[1]] [1] "R" > str(l[[1]]) chr "R" > str(l[[5]]) List of 2 $ : chr "r" $ : num 4 > names(l) <- c("first", "second", "third", "fourth", "fifth") > l$first [1] "R" > str(l$first) chr "R" > l$fir [1] "R" > l$fi NULL > names(w) <- c("Rick", "Amanda", "Peter", "Alex", "Kathryn", + "Ben", "George") > w["Jenny"] NA > x2.wilcox <- wilcox.test(x.ranks2) Warning in wilcox.test.default(x.ranks2) : cannot compute exact p-value with ties > str(x2.wilcox) List of 7 $ statistic : Named num 36 ..- attr(*, "names")= chr "V" $ parameter : NULL $ p.value : num 0.0141 $ null.value : Named num 0 ..- attr(*, "names")= chr "location" $ alternative: chr "two.sided" $ method : chr "Wilcoxon signed rank test with continuity correction" $ data.name : chr "x.ranks2" - attr(*, "class")= chr "htest" > x2.wilcox$p.value [1] 0.0141474 > # \subsection{Data frames} > > d <- data.frame(weight=w, height=x, size=m.o, sex=sex.f) > row.names(d) <- c("Rick", "Amanda", "Peter", "Alex", "Kathryn", + "Ben", "George") > d weight height size sex Rick 69 174.0 L male Amanda 68 162.0 S female Peter 93 188.0 XL male Alex 87 192.0 XXL male Kathryn 59 165.0 S female Ben 82 168.0 M male George 72 172.5 L male > d <- read.table("data/d.txt", h=TRUE, stringsAsFactors=TRUE) > d$size <- ordered(d$size, levels=c("S", "M", "L", "XL", "XXL")) > str(d) 'data.frame': 7 obs. of 4 variables: $ weight: int 69 68 93 87 59 82 72 $ height: num 174 162 188 192 165 ... 
$ size : Ord.factor w/ 5 levels "S"<"M"<"L"<"XL"<..: 3 1 4 5 1 2 3 $ sex : Factor w/ 2 levels "female","male": 2 1 2 2 1 2 2 > d[, 1] [1] 69 68 93 87 59 82 72 > d[[1]] [1] 69 68 93 87 59 82 72 > d$weight [1] 69 68 93 87 59 82 72 > d[, "weight"] [1] 69 68 93 87 59 82 72 > d[["weight"]] [1] 69 68 93 87 59 82 72 > identical(d$weight, d[, 1]) [1] TRUE > d[, 2:4] # matrix method height size sex Rick 174.0 L male Amanda 162.0 S female Peter 188.0 XL male Alex 192.0 XXL male Kathryn 165.0 S female Ben 168.0 M male George 172.5 L male > d[, c("height", "size", "sex")] height size sex Rick 174.0 L male Amanda 162.0 S female Peter 188.0 XL male Alex 192.0 XXL male Kathryn 165.0 S female Ben 168.0 M male George 172.5 L male > d[2:4] # list method height size sex Rick 174.0 L male Amanda 162.0 S female Peter 188.0 XL male Alex 192.0 XXL male Kathryn 165.0 S female Ben 168.0 M male George 172.5 L male > subset(d, select=2:4) height size sex Rick 174.0 L male Amanda 162.0 S female Peter 188.0 XL male Alex 192.0 XXL male Kathryn 165.0 S female Ben 168.0 M male George 172.5 L male > d[, -1] # negative selection height size sex Rick 174.0 L male Amanda 162.0 S female Peter 188.0 XL male Alex 192.0 XXL male Kathryn 165.0 S female Ben 168.0 M male George 172.5 L male > Str(d) # shipunov 'data.frame': 7 obs. of 4 variables: 1 weight: int 69 68 93 87 59 82 72 2 height: num 174 162 188 192 165 ... 3 size : Ord.factor w/ 5 levels "S"<"M"<"L"<"XL"<..: 3 1 4 5 1 2 3 4 sex : Factor w/ 2 levels "female","male": 2 1 2 2 1 2 2 row.names [1:7] "Rick" "Amanda" "Peter" "Alex" "Kathryn" ... 
> d[d$sex=="female", ] weight height size sex Amanda 68 162 S female Kathryn 59 165 S female > d$sex=="female" [1] FALSE TRUE FALSE FALSE TRUE FALSE FALSE > d[c(2, 5), ] weight height size sex Amanda 68 162 S female Kathryn 59 165 S female > d[, names(d) != "weight"] height size sex Rick 174.0 L male Amanda 162.0 S female Peter 188.0 XL male Alex 192.0 XXL male Kathryn 165.0 S female Ben 168.0 M male George 172.5 L male > d[d$size== "M" | d$size== "S", ] weight height size sex Amanda 68 162 S female Kathryn 59 165 S female Ben 82 168 M male > d[d$size %in% c("M", "L") & d$sex=="male", ] weight height size sex Rick 69 174.0 L male Ben 82 168.0 M male George 72 172.5 L male > subset(d, sex=="female") weight height size sex Amanda 68 162 S female Kathryn 59 165 S female > d.new <- d > d.new[, 1] <- round(d.new[, 1] * 2.20462) > d.new weight height size sex Rick 152 174.0 L male Amanda 150 162.0 S female Peter 205 188.0 XL male Alex 192 192.0 XXL male Kathryn 130 165.0 S female Ben 181 168.0 M male George 159 172.5 L male > d.new$he <- round(d.new$he * 0.0328084) > d.new weight height size sex he Rick 152 174.0 L male 6 Amanda 150 162.0 S female 5 Peter 205 188.0 XL male 6 Alex 192 192.0 XXL male 6 Kathryn 130 165.0 S female 5 Ben 181 168.0 M male 6 George 159 172.5 L male 6 > data.frame(a=1:4, b=1:2) a b 1 1 1 2 2 2 3 3 1 4 4 2 > d[order(d$sex, d$height), ] weight height size sex Amanda 68 162.0 S female Kathryn 59 165.0 S female Ben 82 168.0 M male George 72 172.5 L male Rick 69 174.0 L male Peter 93 188.0 XL male Alex 87 192.0 XXL male > order(d$sex, d$height) [1] 2 5 6 7 1 3 4 > # \subsection{Overview of data types and modes} > > # \section{Answers to exercises} > > plot(sex.f, col=1:2) > plot(sex.f, col=1:nlevels(sex.f)) > rev(sort(com12)) c plot names read.table length dev.off paste 4128 2896 2031 918 772 722 699 par summary function text as.numeric 677 672 653 628 573 > sort(com12, decreasing=TRUE) c plot names read.table length dev.off paste 4128 2896 2031 918 
772 722 699 par summary function text as.numeric 677 672 653 628 573 > comp <- read.table( + "http://ashipunov.me/shipunov/open/compositae.txt", + h=TRUE, sep="\t") > str(comp) 'data.frame': 1396 obs. of 8 variables: $ SPECIES : chr "Achillea cartilaginea" "Achillea cartilaginea" "Achillea cartilaginea" "Achillea cartilaginea" ... $ PLANT : int 118 118 118 118 118 118 118 118 118 118 ... $ HEIGHT : int 460 460 460 460 460 460 460 460 460 460 ... $ N.CIRCLES: chr "1" "1" "1" "1" ... $ N.LEAVES : int 2 2 2 2 2 2 2 2 2 2 ... $ HEAD.D : num 7 8 8 8 10 7 8 9 9 10 ... $ DISC.D : int 2 2 3 3 4 3 3 3 3 3 ... $ RAYS : int 7 7 7 7 7 8 8 8 8 8 ... > c3 <- comp[comp$SPECIES %in% c("Anthemis tinctoria", + "Cosmos bipinnatus", "Tripleurospermum inodorum"), ] > c3$SPECIES <- factor(c3$SPECIES) > with(c3, plot(HEAD.D, RAYS, col=as.numeric(SPECIES), + pch=as.numeric(SPECIES), + xlab="Diameter of head, mm", ylab="Number of rays")) > legend("topright", pch=1:3, col=1:3, legend=levels(c3$SPECIES)) > with(c3, plot(jitter(HEAD.D), jitter(RAYS), + col=as.numeric(SPECIES), pch=as.numeric(SPECIES), + xlab="Diameter of head, mm", ylab="Number of rays")) > with(c3, sunflowerplot(HEAD.D, RAYS)) > with(c3, smoothScatter(HEAD.D, RAYS)) > library(hexbin) > with(c3, plot(hexbin(HEAD.D, RAYS))) > pdf(file="pics/07055.pdf"); oldpar <- par(mar=c(4, 4, 0, 1)) > with(c3, plot(HEAD.D, RAYS, type="n", + xlab="Diameter of head, mm", ylab="Number of rays")) > with(c3, PPoints(SPECIES, HEAD.D, RAYS, scale=.9, + pchs=1, centers=TRUE)) > legend("topright", pch=1, col=1:3, + legend=levels(c3$SPECIES), text.font=3) > par(oldpar); dev.off() pdf 2 > ma <- matrix(m, ncol=2, byrow=FALSE) > ma [,1] [,2] [1,] 1 3 [2,] 2 4 > jj <- paste0(rep("", 100), c("", "", "Fizz"), c(rep("", 4), "Buzz")) > ifelse(jj == "", 1:100, jj) [1] "1" "2" "Fizz" "4" "Buzz" "Fizz" [7] "7" "8" "Fizz" "Buzz" "11" "Fizz" [13] "13" "14" "FizzBuzz" "16" "17" "Fizz" [19] "19" "Buzz" "Fizz" "22" "23" "Fizz" [25] "Buzz" "26" "Fizz" "28" "29" 
"FizzBuzz" [31] "31" "32" "Fizz" "34" "Buzz" "Fizz" [37] "37" "38" "Fizz" "Buzz" "41" "Fizz" [43] "43" "44" "FizzBuzz" "46" "47" "Fizz" [49] "49" "Buzz" "Fizz" "52" "53" "Fizz" [55] "Buzz" "56" "Fizz" "58" "59" "FizzBuzz" [61] "61" "62" "Fizz" "64" "Buzz" "Fizz" [67] "67" "68" "Fizz" "Buzz" "71" "Fizz" [73] "73" "74" "FizzBuzz" "76" "77" "Fizz" [79] "79" "Buzz" "Fizz" "82" "83" "Fizz" [85] "Buzz" "86" "Fizz" "88" "89" "FizzBuzz" [91] "91" "92" "Fizz" "94" "Buzz" "Fizz" [97] "97" "98" "Fizz" "Buzz" > d.sorted <- d[order(d$sex, d$height), ] > d.sorted[, order(names(d.sorted))] height sex size weight Amanda 162.0 female S 68 Kathryn 165.0 female S 59 Ben 168.0 male M 82 George 172.5 male L 72 Rick 174.0 male L 69 Peter 188.0 male XL 93 Alex 192.0 male XXL 87 > # \chapter{One-dimensional data} > > # \section{How to estimate general tendencies} > > # \subsection{Median is the best} > > salary <- c(21, 19, 27, 11, 102, 25, 21) > mean(salary) [1] 32.28571 > median(salary) [1] 21 > sort(salary1 <- c(salary, 22)) [1] 11 19 21 21 22 25 27 102 > median(salary1) [1] 21.5 > mean(salary, trim=0.2) [1] 22.6 > sex <- c("male", "female", "male", "male", "female", "male", + "male") > t.sex <- table(sex) > mode <- names(t.sex[which.max(t.sex)]) > mode [1] "male" > attach(trees) > mean(Girth) [1] 13.24839 > mean(Height) [1] 76 > detach(trees) > with(trees, mean(Volume)) # Second way [1] 30.17097 > sapply(trees, mean) Girth Height Volume 13.24839 76.00000 30.17097 > trees.n <- trees > trees.n[2, 1] <- NA > sapply(trees.n, mean) Girth Height Volume NA 76.00000 30.17097 > sapply(trees.n, mean, na.rm=TRUE) Girth Height Volume 13.40333 76.00000 30.17097 > # \subsection{Quartiles and quantiles} > > quantile(salary, c(0, 0.25, 0.5, 0.75, 1)) 0% 25% 50% 75% 100% 11 20 21 26 102 > fivenum(salary) [1] 11 20 21 26 102 > summary(salary) Min. 1st Qu. Median Mean 3rd Qu. Max. 11.00 20.00 21.00 32.29 26.00 102.00 > summary(PlantGrowth) weight group Min. 
:3.590 ctrl:10 1st Qu.:4.550 trt1:10 Median :5.155 trt2:10 Mean :5.073 3rd Qu.:5.530 Max. :6.310 > summary(hh) Min. 1st Qu. Median Mean 3rd Qu. Max. 8.0 8.0 8.5 8.5 8.5 10.0 > err <- read.table("data/errors.txt", sep="\t", + h=TRUE, stringsAsFactors=TRUE) > str(err) 'data.frame': 7 obs. of 3 variables: $ AGE : Factor w/ 6 levels "12","22","23",..: 3 4 3 5 1 6 2 $ NAME : Factor w/ 6 levels "","John","Kate",..: 2 3 1 4 5 6 2 $ HEIGHT: num 172 163 161 16.1 132 155 183 > summary(err) AGE NAME HEIGHT 12:1 :1 Min. : 16.1 22:1 John :2 1st Qu.:143.5 23:2 Kate :1 Median :161.0 24:1 Lucy :1 Mean :140.3 56:1 Penny:1 3rd Qu.:167.5 a :1 Sasha:1 Max. :183.0 > # \subsection{Variation} > > var(salary); sqrt(var(salary)); sd(salary) [1] 970.9048 [1] 31.15934 [1] 31.15934 > IQR(salary); mad(salary) [1] 6 [1] 5.9304 > with(trees, paste(median(Height), IQR(Height)/2, sep="±")) [1] "76±4" > paste(median(trees$Height), mad(trees$Height), sep="±") [1] "76±5.9304" > paste(quantile(trees$Height, c(0.025, 0.975)), collapse="-") [1] "63.75-86.25" > paste(quantile(trees$Girth, c(0.025, 0.5, 0.975)), collapse="-") [1] "8.525-12.9-18.65" > paste(boxplot.stats(trees$Height)$stats[c(1, 5)], collapse="-") [1] "63-87" > HS <- fivenum(trees$Height) > paste("(", HS[1], ")", HS[2], "-", HS[4], "(", HS[5], ")", sep="") [1] "(63)72-80(87)" > pdf(file="pics/reports_to_crop.pdf") > library(plotrix) > plot(c(-1, 1), c(-1, 1), type="n", xlab="", ylab="", axes=FALSE) > draw.circle(-.2, 0, .4) > draw.circle(.1, 0, .9) > text(-.2, 0, "mean()±sd()", cex=1.3) > text(.1, 0.65, "median()±IQR()\nmedian()±mad()", cex=1.3) > dev.off() pdf 2 > 100*sapply(trees, sd)/colMeans(trees) Girth Height Volume 23.686948 8.383964 54.482331 > # \section{1-dimensional plots} > > pdf(file="pics/14390.pdf"); oldpar <- par(mar=c(2, 2, 0, 1)) > new.1000 <- sample((median(salary) - IQR(salary)) : + (median(salary) + IQR(salary)), 1000, replace=TRUE) > salary2 <- c(salary, new.1000) > boxplot(salary2, log="y") > par(oldpar); dev.off() 
pdf 2 > pdf(file="pics/boxplot.pdf"); oldpar <- par(mar=c(1, 1, 0, 1)) > Ex.boxplot() > par(oldpar); dev.off() pdf 2 > pdf(file="pics/14560.pdf"); oldpar <- par(mar=c(2, 2, 0, 1)) > boxplot(scale(trees)) > par(oldpar); dev.off() pdf 2 > pdf(file="pics/14730.pdf"); oldpar <- par(mar=c(2, 4, 1, 1)) > hist(salary2, breaks=20, main="", xlab=0) > par(oldpar); dev.off() pdf 2 > table(cut(salary2, 20)) (10.9,15.6] (15.6,20.1] (20.1,24.6] (24.6,29.2] (29.2,33.8] (33.8,38.3] 71 380 335 220 0 0 (38.3,42.8] (42.8,47.4] (47.4,51.9] (51.9,56.5] (56.5,61] (61,65.6] 0 0 0 0 0 0 (65.6,70.2] (70.2,74.7] (74.7,79.2] (79.2,83.8] (83.8,88.3] (88.3,92.9] 0 0 0 0 0 0 (92.9,97.5] (97.5,102] 0 1 > stem(salary, scale=2) The decimal point is 1 digit(s) to the right of the | 1 | 19 2 | 1157 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 2 > pdf(file="pics/15260.pdf"); oldpar <- par(mar=c(2, 2, 0, 1)) > plot(density(salary2, adjust=2), main="", xlab="", ylab="") > rug(salary2) > par(oldpar); dev.off() pdf 2 > pdf(file="pics/15580.pdf"); oldpar <- par(mar=c(2, 2, 0, 1)) > library(beeswarm) > trees.s <- data.frame(scale(trees), class=cut(trees$Girth, 3, + labels=c("thin", "medium", "thick"))) > beeswarm(trees.s[, 1:3], cex=2, col=1:3, + pwpch=rep(as.numeric(trees.s$class), 3)) > bxplot(trees.s[, 1:3], add=TRUE) > legend("top", pch=1:3, legend=levels(trees.s$class)) > par(oldpar); dev.off() pdf 2 > pdf(file="pics/32570.pdf"); oldpar <- par(mar=c(2, 2, 0, 1)) > library(beanplot) > beanplot(trees.s[, 1:3], col=list(1, 2, 3), + border=1:3, beanlines="median") > par(oldpar); dev.off() pdf 2 > # \section{Confidence intervals} > > t.test(trees$Height) One Sample t-test data: trees$Height t = 66.41, df = 30, p-value < 2.2e-16 alternative hypothesis: true mean is not equal to 0 95 percent confidence interval: 73.6628 78.3372 sample estimates: mean of x 76 > S.value(t.test(trees$Height)) # shipunov [1] 110.9296 > wilcox.test(salary, conf.int=TRUE) Warning in wilcox.test.default(salary, conf.int = TRUE) : cannot 
compute exact p-value with ties Warning in wilcox.test.default(salary, conf.int = TRUE) : cannot compute exact confidence interval with ties Wilcoxon signed rank test with continuity correction data: salary V = 28, p-value = 0.02225 alternative hypothesis: true location is not equal to 0 95 percent confidence interval: 15.00002 64.49995 sample estimates: (pseudo)median 23 > # \section{Normality} > > pdf(file="pics/17030.pdf"); oldpar <- par(mar=c(4, 4, 0, 1)) > qqnorm(salary2, main=""); qqline(salary2, col=2) > par(oldpar); dev.off() pdf 2 > shapiro.test(salary) Shapiro-Wilk normality test data: salary W = 0.61164, p-value = 0.0003726 > shapiro.test(salary2) Shapiro-Wilk normality test data: salary2 W = 0.73849, p-value < 2.2e-16 > set.seed(1638) # freeze random number generator > shapiro.test(rnorm(100)) Shapiro-Wilk normality test data: rnorm(100) W = 0.99337, p-value = 0.9094 > ks.test(scale(salary2), "pnorm") Warning in ks.test(scale(salary2), "pnorm") : ties should not be present for the Kolmogorov-Smirnov test One-sample Kolmogorov-Smirnov test data: scale(salary2) D = 0.094068, p-value = 3.641e-08 alternative hypothesis: two-sided > old.options <- options(scipen=100) > ks.test(scale(salary2), "pnorm") Warning in ks.test(scale(salary2), "pnorm") : ties should not be present for the Kolmogorov-Smirnov test One-sample Kolmogorov-Smirnov test data: scale(salary2) D = 0.094068, p-value = 0.00000003641 alternative hypothesis: two-sided > options(old.options) > # \section{How to create your own functions} > > Normality <- function(a) + { + ifelse(shapiro.test(a)$p.value < 0.05, "NOT NORMAL", "NORMAL") + } > Normality(salary) # shipunov [1] "NOT NORMAL" > sapply(trees, Normality) Girth Height Volume "NORMAL" "NORMAL" "NOT NORMAL" > sapply(log(trees+0.01), Normality) Girth Height Volume "NORMAL" "NORMAL" "NORMAL" > sapply(chickwts, + function(.x) names(table(.x))[which.max(table(.x))]) weight feed "248" "soybean" > old.options <- options(warn=-1) > sapply(trees, + 
function(.x) wilcox.test(.x, conf.int=TRUE)$conf.int) Girth Height Volume [1,] 11.84995 73.50001 22.00000 [2,] 14.44990 78.50003 36.05001 > options(old.options) > # \section{How good is the proportion?} > > table(sex) sex female male 2 5 > table(sex)/length(sex) sex female male 0.2857143 0.7142857 > prop.test(table(sex)) Warning in prop.test(table(sex)) : Chi-squared approximation may be incorrect 1-sample proportions test with continuity correction data: table(sex), null probability 0.5 X-squared = 0.57143, df = 1, p-value = 0.4497 alternative hypothesis: true p is not equal to 0.5 95 percent confidence interval: 0.05112431 0.69743997 sample estimates: p 0.2857143 > prop.test(x=356, n=476, p=0.7, alternative="two.sided") 1-sample proportions test with continuity correction data: 356 out of 476, null probability 0.7 X-squared = 4.9749, df = 1, p-value = 0.02572 alternative hypothesis: true p is not equal to 0.7 95 percent confidence interval: 0.7059174 0.7858054 sample estimates: p 0.7478992 > prop.test(x=0.52*262, n=262, p=0.5, alternative="greater") 1-sample proportions test with continuity correction data: 0.52 * 262 out of 262, null probability 0.5 X-squared = 0.34302, df = 1, p-value = 0.279 alternative hypothesis: true p is greater than 0.5 95 percent confidence interval: 0.4673901 1.0000000 sample estimates: p 0.52 > sapply(1:10, Phyllotaxis) # shipunov [1] "1/2" "1/3" "2/5" "3/8" "5/13" "8/21" "13/34" "21/55" [9] "34/89" "55/144" > # \section{Answers to exercises} > > str(shapiro.test(rnorm(100))) List of 4 $ statistic: Named num 0.992 ..- attr(*, "names")= chr "W" $ p.value : num 0.854 $ method : chr "Shapiro-Wilk normality test" $ data.name: chr "rnorm(100)" - attr(*, "class")= chr "htest" > set.seed(1683) > shapiro.test(rnorm(100))$p.value [1] 0.8424037 > betula <- read.table( + "http://ashipunov.me/shipunov/open/betula.txt", h=TRUE) > Str(betula) # shipunov 'data.frame': 229 obs. of 10 variables: 1 LOC : int 1 1 1 1 1 1 1 1 1 1 ... 
2 LEAF.L : int 50 44 45 35 41 53 50 47 52 42 ... 3 LEAF.W : int 37 29 37 26 32 37 40 36 39 40 ... 4 LEAF.MAXW: int 23 20 19 15 18 25 21 21 23 19 ... 5 PUB : int 0 1 0 0 0 0 0 0 1 0 ... 6 PAPILLAE : int 1 1 1 1 1 1 1 1 1 1 ... 7 CATKIN : int 31 25 21 20 24 22 40 25 14 27 ... 8 SCALE.L : num 4 3 4 5.5 5 5 6 5 5 5 ... 9 LOBES * int 0 0 1 0 0 0 0 0 1 0 ... 10 WINGS * int 0 0 0 0 0 0 1 0 0 1 ... > sapply(betula[, c(2:4, 7:8)], Normality) # shipunov LEAF.L LEAF.W LEAF.MAXW CATKIN SCALE.L "NOT NORMAL" "NOT NORMAL" "NOT NORMAL" "NORMAL" "NOT NORMAL" > pdf(file="pics/26980_to_crop.pdf") > betula.s <- stack(betula[, c(2:4, 7:8)]) > qqmath(~ values | ind, data=betula.s, + panel=function(x) {panel.qqmathline(x); panel.qqmath(x)}) > dev.off() pdf 2 > bwtheme <- standard.theme("pdf", color=FALSE) > histogram(~ values | ind, data=betula.s, par.settings=bwtheme) > CV <- function(x) {} > CV <- function(x) + { + 100*sd(x, na.rm=TRUE)/mean(x, na.rm=TRUE) + } > sapply(betula[, c(2:4, 7:8)], CV) LEAF.L LEAF.W LEAF.MAXW CATKIN SCALE.L 17.93473 20.38630 26.08806 24.17354 24.72061 > dact <- scan("data/dact.txt") Read 48 items > str(dact) num [1:48] 88 22 52 31 51 63 32 57 68 27 ... > Normality(dact) # shipunov [1] "NOT NORMAL" > summary(dact)[-4] # no mean Min. 1st Qu. Median 3rd Qu. Max. 
0.00 22.00 33.50 65.25 108.00 > IQR(dact) [1] 43.25 > mad(dact) [1] 27.4281 > wilcox.test(dact, conf.int=TRUE)$conf.int Warning in wilcox.test.default(dact, conf.int = TRUE) : cannot compute exact p-value with ties Warning in wilcox.test.default(dact, conf.int = TRUE) : cannot compute exact confidence interval with ties Warning in wilcox.test.default(dact, conf.int = TRUE) : cannot compute exact p-value with zeroes Warning in wilcox.test.default(dact, conf.int = TRUE) : cannot compute exact confidence interval with zeroes [1] 34.49995 53.99997 attr(,"conf.level") [1] 0.95 > pdf(file="pics/37240.pdf"); oldpar <- par(mar=c(2, 4, 0, 0)) > Histr(dact, xlab="", main="") # shipunov Warning in hist.default(x, plot = FALSE, ...) : arguments ‘main’, ‘xlab’ are not made use of > par(oldpar); dev.off() pdf 2 > stem(dact) The decimal point is 1 digit(s) to the right of the | 0 | 0378925789 2 | 0224678901122345 4 | 471127 6 | 035568257 8 | 2785 10 | 458 > pdf(file="pics/moments_to_crop.pdf") > oldpar <- par(mfrow=c(2, 2), mar=c(0, 0, 0, 0)) > library(plotrix) > ## > > plot(c(-1, 1), c(-1, 1), type="n", xlab="", ylab="", axes=FALSE) > for(n in seq(0.1, 0.9, 0.1)) draw.circle(0, 0, n) > set.seed(11); x <- rnorm(100, sd=.28); y <- rnorm(100, sd=.28) > points(x, y, pch=19) > ## > > plot(c(-1, 1), c(-1, 1), type="n", xlab="", ylab="", axes=FALSE) > for(n in seq(0.1, 0.9, 0.1)) draw.circle(0, 0, n) > set.seed(11); x <- rnorm(100, sd=.38); y <- rnorm(100, sd=.38) > points(x, y, pch=19) > ## > > plot(c(-1, 1), c(-1, 1), type="n", xlab="", ylab="", axes=FALSE) > for(n in seq(0.1, 0.9, 0.1)) draw.circle(0, 0, n) > set.seed(11); x <- rnorm(100, sd=.28) - .2; y <- rnorm(100, sd=.28) > points(x, y, pch=19) > ## > > plot(c(-1, 1), c(-1, 1), type="n", xlab="", ylab="", axes=FALSE) > for(n in seq(0.1, 0.9, 0.1)) draw.circle(0, 0, n) > set.seed(11); x <- c(rnorm(50, sd=.28), rnorm(50, sd=.1)); y <- c(rnorm(50, sd=.28), rnorm(50, sd=.1)) > points(x, y, pch=19) > ## > > par(oldpar) > dev.off() 
pdf 2 > library(e1071) > skewness(dact) [1] 0.5242118 > kurtosis(dact) [1] -0.8197875 > ny <- read.table( + "http://ashipunov.me/shipunov/open/nymphaeaceae.txt", + h=TRUE, sep="\t") > Str(ny) # shipunov 'data.frame': 267 obs. of 5 variables: 1 SPECIES: chr "Nuphar lutea" "Nuphar lutea" "Nuphar lutea" "Nuphar lutea" ... 2 SEPALS : int 4 5 5 5 5 5 5 5 5 5 ... 3 PETALS : int 14 10 10 11 11 11 12 12 12 12 ... 4 STAMENS* int 131 104 113 106 111 119 73 91 102 109 ... 5 STIGMAS* int 13 12 12 11 13 15 10 12 12 11 ... > oldpar <- par(mfrow=c(2, 2)) > for (i in 2:5) boxplot(ny[, i] ~ ny[, 1], main=names(ny)[i]) > par(oldpar) > library(lattice) > ny.s <- stack(as.data.frame(scale(ny[ ,2:5]))) > ny$SPECIES <- as.factor(ny$SPECIES) > ny.s$SPECIES <- ny$SPECIES > bwplot(SPECIES ~ values | ind, ny.s, xlab="") > pdf(file="pics/44230.pdf"); oldpar <- par(mar=c(2, 2, 0, 1)) > Boxplots(ny[, 2:5], ny$SPECIES, srt=0, adj=c(.5, 1)) # shipunov > par(oldpar); dev.off() pdf 2 > pdf(file="pics/44400.pdf"); oldpar <- par(mar=c(2, 2, 0, 1)) > Linechart(ny[, 2:5], ny[, 1], se.lwd=2) # shipunov > par(oldpar); dev.off() pdf 2 > aggregate(ny[, 3:4], by=list(SPECIES=ny[, 1]), Normality) # shipunov SPECIES PETALS STAMENS 1 Nuphar lutea NOT NORMAL NOT NORMAL 2 Nymphaea candida NOT NORMAL NOT NORMAL > aggregate(ny[, 3:4], + by=list(SPECIES=ny[, 1]), + function(.x) wilcox.test(.x, conf.int=TRUE)$conf.int) SPECIES PETALS.1 PETALS.2 STAMENS.1 STAMENS.2 1 Nuphar lutea 14.49997 14.99996 119.00003 125.50005 2 Nymphaea candida 25.49997 27.00001 73.99995 78.49997 > aggregate(ny[, 3:4], by=list(SPECIES=ny[, 1]), function(.x) + paste(median(.x, na.rm=TRUE), mad(.x, na.rm=TRUE), sep="±")) SPECIES PETALS STAMENS 1 Nuphar lutea 14±1.4826 119±19.2738 2 Nymphaea candida 26±2.9652 77±10.3782 > phx <- read.table( + "http://ashipunov.me/shipunov/open/phyllotaxis.txt", + h=TRUE, sep="\t") > str(phx) 'data.frame': 6032 obs. 
of 4 variables: $ FAMILY : chr "Anacardiaceae" "Anacardiaceae" "Anacardiaceae" "Anacardiaceae" ... $ SPECIES : chr "Cotinus coggygria" "Cotinus coggygria" "Cotinus coggygria" "Cotinus coggygria" ... $ N.CIRCLES: int 2 2 2 2 2 2 2 2 2 2 ... $ N.LEAVES : int 4 4 5 5 5 5 5 5 5 5 ... > nlevels(factor((phx$FAMILY))) [1] 11 > pdf(file="pics/26710.pdf"); oldpar <- par(mar=c(2, 0, 1, 1)) > phx10 <- sapply(1:10, Phyllotaxis) > phx.all <- paste(phx$N.CIRCLES, phx$N.LEAVES, sep="/") > phx.tbl <- table(phx$FAMILY, phx.all %in% phx10) > Dotchart(sort(phx.tbl[,"FALSE"]/(rowSums(phx.tbl)))) # shipunov > par(oldpar); dev.off() pdf 2 > mean.phx.prop <- sum(phx.tbl[, 1])/sum(phx.tbl) > prop.test(phx.tbl["Onagraceae", 1], sum(phx.tbl["Onagraceae", ]), + mean.phx.prop) 1-sample proportions test with continuity correction data: phx.tbl["Onagraceae", 1] out of sum(phx.tbl["Onagraceae", ]), null probability mean.phx.prop X-squared = 227.9, df = 1, p-value < 2.2e-16 alternative hypothesis: true p is not equal to 0.2712202 95 percent confidence interval: 0.6961111 0.8221820 sample estimates: p 0.7647059 > power.prop.test(p1=0.48, p2=0.52, power=0.8) Two-sample comparison of proportions power calculation n = 2451.596 p1 = 0.48 p2 = 0.52 sig.level = 0.05 power = 0.8 alternative = two.sided NOTE: n is number in *each* group > # \chapter{Two-dimensional data: differences} > > # \section{What is a statistical test?} > > bpress <- read.table("data/bpress.txt", h=TRUE) > head(bpress) PLACEBO.1 PLACEBO.10 DRUG.1 DRUG.10 1 180 170 140 120 2 140 150 120 100 3 160 155 180 140 4 170 180 160 140 5 150 150 160 160 > drug.d <- bpress$DRUG.10 - bpress$DRUG.1 > placebo.d <- bpress$PLACEBO.10 - bpress$PLACEBO.1 > mean(drug.d - placebo.d) [1] -21 > boxplot(bpress) > # \subsection{Statistical hypotheses} > > # \subsection{Statistical errors} > > # \section{Is there a difference? 
Comparing two samples} > > # \subsection{Two sample tests} > > sapply(data.frame(placebo.d, drug.d), Normality) # shipunov placebo.d drug.d "NORMAL" "NORMAL" > t.test(placebo.d, drug.d) Welch Two Sample t-test data: placebo.d and drug.d t = 2.8062, df = 6.7586, p-value = 0.02726 alternative hypothesis: true difference in means is not equal to 0 95 percent confidence interval: 3.175851 38.824149 sample estimates: mean of x mean of y 1 -20 > S.value(t.test(placebo.d, drug.d)) # shipunov [1] 5.197132 > df <- 6.7586 > v1 <- var(placebo.d) > v2 <- var(drug.d) > (t.stat <- (mean(placebo.d) - mean(drug.d))/sqrt(v1/5 + v2/5)) [1] 2.806243 > (p.value <- 2*pt(-abs(t.stat), df)) [1] 0.02725892 > S.value(p.value) # shipunov [1] 5.197128 > long <- stack(data.frame(placebo.d, drug.d)) > head(long) values ind 1 -10 placebo.d 2 10 placebo.d 3 -5 placebo.d 4 10 placebo.d 5 0 placebo.d 6 -20 drug.d > t.test(values ~ ind, data=long) Welch Two Sample t-test data: values by ind t = 2.8062, df = 6.7586, p-value = 0.02726 alternative hypothesis: true difference in means is not equal to 0 95 percent confidence interval: 3.175851 38.824149 sample estimates: mean in group placebo.d mean in group drug.d 1 -20 > boxplot(values ~ ind, data=long) > tapply(long$values, long$ind, sd) placebo.d drug.d 8.944272 14.142136 > aggregate(values ~ ind, range, data=long) ind values.1 values.2 1 placebo.d -10 10 2 drug.d -40 0 > tapply(beaver2$temp, beaver2$activ, Normality) # shipunov 0 1 "NORMAL" "NORMAL" > boxplot(temp ~ activ, data=beaver2) > t.test(temp ~ activ, data=beaver2) Welch Two Sample t-test data: temp by activ t = -18.548, df = 80.852, p-value < 2.2e-16 alternative hypothesis: true difference in means is not equal to 0 95 percent confidence interval: -0.8927106 -0.7197342 sample estimates: mean in group 0 mean in group 1 37.09684 37.90306 > uu <- unstack(beaver2, temp ~ activ) > str(uu) List of 2 $ 0: num [1:38] 36.6 36.7 36.9 37.1 37.2 ... $ 1: num [1:62] 38 38 38 38.2 38.1 ... 
> sapply(bpress, Normality) # shipunov PLACEBO.1 PLACEBO.10 DRUG.1 DRUG.10 "NORMAL" "NORMAL" "NORMAL" "NORMAL" > attach(bpress) > t.test(DRUG.1, DRUG.10, paired=TRUE) Paired t-test data: DRUG.1 and DRUG.10 t = 3.1623, df = 4, p-value = 0.03411 alternative hypothesis: true difference in means is not equal to 0 95 percent confidence interval: 2.440219 37.559781 sample estimates: mean of the differences 20 > t.test(DRUG.10 - DRUG.1, mu=0) # same results One Sample t-test data: DRUG.10 - DRUG.1 t = -3.1623, df = 4, p-value = 0.03411 alternative hypothesis: true mean is not equal to 0 95 percent confidence interval: -37.559781 -2.440219 sample estimates: mean of x -20 > t.test(DRUG.1, DRUG.10) # non-paired Welch Two Sample t-test data: DRUG.1 and DRUG.10 t = 1.3868, df = 8, p-value = 0.2029 alternative hypothesis: true difference in means is not equal to 0 95 percent confidence interval: -13.25766 53.25766 sample estimates: mean of x mean of y 152 132 > detach(bpress) > attach(bpress) > t.test(PLACEBO.10, DRUG.10, alt="greater") # one-tailed test Welch Two Sample t-test data: PLACEBO.10 and DRUG.10 t = 2.4509, df = 6.4729, p-value = 0.0234 alternative hypothesis: true difference in means is greater than 0 95 percent confidence interval: 6.305348 Inf sample estimates: mean of x mean of y 161 132 > t.test(PLACEBO.10, DRUG.10) # "common" test Welch Two Sample t-test data: PLACEBO.10 and DRUG.10 t = 2.4509, df = 6.4729, p-value = 0.04681 alternative hypothesis: true difference in means is not equal to 0 95 percent confidence interval: 0.5528512 57.4471488 sample estimates: mean of x mean of y 161 132 > detach(bpress) > tapply(ToothGrowth$len, ToothGrowth$supp, Normality) # shipunov OJ VC "NOT NORMAL" "NORMAL" > boxplot(len ~ supp, data=ToothGrowth, notch=TRUE) > wilcox.test(len ~ supp, data=ToothGrowth) Warning in wilcox.test.default(x = c(15.2, 21.5, 17.6, 9.7, 14.5, 10, 8.2, : cannot compute exact p-value with ties Wilcoxon rank sum test with continuity correction data: 
len by supp W = 575.5, p-value = 0.06449 alternative hypothesis: true location shift is not equal to 0 > w0 <- ChickWeight$weight[ChickWeight$Time == 0] > w2 <- ChickWeight$weight[ChickWeight$Time == 2] > sapply(data.frame(w0, w2), Normality) w0 w2 "NOT NORMAL" "NOT NORMAL" > boxplot(w0, w2) > wilcox.test(w0, w2, paired=TRUE) Wilcoxon signed rank test with continuity correction data: w0 and w2 V = 8, p-value = 1.132e-09 alternative hypothesis: true location shift is not equal to 0 > pdf(file="pics/20320.pdf"); oldpar <- par(mar=c(2, 2, 0, 1)) > plot(extra ~ group, data=sleep) > par(oldpar); dev.off() pdf 2 > tapply(sleep$extra, sleep$group, Normality) # shipunov 1 2 "NORMAL" "NORMAL" > t.test(extra ~ group, data=sleep, paired=TRUE) Paired t-test data: extra by group t = -4.0621, df = 9, p-value = 0.002833 alternative hypothesis: true difference in means is not equal to 0 95 percent confidence interval: -2.4598858 -0.7001142 sample estimates: mean of the differences -1.58 > power.t.test(n=10, sig.level=0.05, d=1.58) Two-sample t test power calculation n = 10 delta = 1.58 sd = 1 sig.level = 0.05 power = 0.9160669 alternative = two.sided NOTE: n is number in *each* group > (aa <- 1:9) [1] 1 2 3 4 5 6 7 8 9 > (bb <- rep(5, 9)) [1] 5 5 5 5 5 5 5 5 5 > (xx <- 51:59) [1] 51 52 53 54 55 56 57 58 59 > wilcox.test(Ozone ~ Month, data=airquality, + subset = Month %in% c(5, 8), conf.int=TRUE) Warning in wilcox.test.default(x = c(41L, 36L, 12L, 18L, 28L, 23L, 19L, : cannot compute exact p-value with ties Warning in wilcox.test.default(x = c(41L, 36L, 12L, 18L, 28L, 23L, 19L, : cannot compute exact confidence intervals with ties Wilcoxon rank sum test with continuity correction data: Ozone by Month W = 127.5, p-value = 0.0001208 alternative hypothesis: true location shift is not equal to 0 95 percent confidence interval: -52.99999 -14.99998 sample estimates: difference in location -31.99996 > pdf(file="pics/20890.pdf"); oldpar <- par(mar=c(2, 2, 0, 1)) > boxplot(Ozone ~ Month, 
data=airquality, + subset=Month %in% c(5, 8), notch=TRUE) > par(oldpar); dev.off() pdf 2 > ar <- read.table("data/argon.txt") > unstack(ar, form=V2 ~ V1) air chemical 1 2.31017 2.30143 2 2.30986 2.29890 3 2.31010 2.29816 4 2.31001 2.30182 5 2.31024 2.29869 6 2.31010 2.29940 7 2.31028 2.29849 8 NA 2.29869 > pdf(file="pics/28580.pdf"); oldpar <- par(mar=c(2, 3, 0, 1)) > means <- tapply(ar$V2, ar$V1, mean, na.rm=TRUE) > oldpar <- par(mfrow=1:2) > boxplot(V2 ~ V1, data=ar) > barplot(means) > par(oldpar) > par(oldpar); dev.off() pdf 2 > # \subsection{Effect sizes} > > wilcox.test(1:10, 1:10 + 0.001, paired=TRUE) Warning in wilcox.test.default(1:10, 1:10 + 0.001, paired = TRUE) : cannot compute exact p-value with ties Wilcoxon signed rank test with continuity correction data: 1:10 and 1:10 + 0.001 V = 0, p-value = 0.005355 alternative hypothesis: true location shift is not equal to 0 > wilcox.test(1:10, 1:10 + 0.0001, paired=TRUE) Warning in wilcox.test.default(1:10, 1:10 + 1e-04, paired = TRUE) : cannot compute exact p-value with ties Wilcoxon signed rank test with continuity correction data: 1:10 and 1:10 + 1e-04 V = 0, p-value = 0.004237 alternative hypothesis: true location shift is not equal to 0 > library(effsize) > cohen.d(extra ~ group, data=sleep) Cohen's d d estimate: -0.8321811 (large) 95 percent confidence interval: lower upper -1.8115649 0.1472027 > cliff.delta(1:10, 1:10+0.001) Cliff's Delta delta estimate: -0.1 (negligible) 95 percent confidence interval: lower upper -0.5344426 0.3762404 > cliff.delta(Ozone ~ Month, data=airquality, + subset = Month %in% c(5, 8)) Cliff's Delta delta estimate: -0.2991453 (small) 95 percent confidence interval: lower upper -0.6255598 0.1163964 > K(extra ~ group, data=sleep) # shipunov 0.3462627 > summary(K(extra ~ group, data=sleep)) Lubischew's K Effect P 0.35 Weak 0.34 > summary(K(aa*3, aa*10)) # shipunov Lubischew's K Effect P 1.5 Fairly moderate 0.19 > cliff.delta(aa*3, aa*10) Cliff's Delta delta estimate: -0.7777778 
(large) 95 percent confidence interval: lower upper -0.9493811 -0.2486473 > summary(K(1:10, 1:10+0.001, mad=TRUE)) # shipunov Lubischew's K Effect P 0 Extremely weak 0.5 > (dd <- K(Ozone ~ Month, + data=airquality[airquality$Month %in% c(5, 8), ], + mad=TRUE)) # shipunov 0.6141992 > summary(dd) Lubischew's K Effect P 0.61 Fairly weak 0.29 > # \section{If there are more than two samples: ANOVA} > > # \subsection{One way} > > pdf(file="pics/27420.pdf"); oldpar <- par(mar=c(2, 4, 0, 1)) > str(hwc) # shipunov 'data.frame': 90 obs. of 3 variables: $ COLOR : Factor w/ 3 levels "black","blond",..: 1 1 1 1 1 1 1 1 1 1 ... $ WEIGHT: int 80 82 79 80 81 79 82 83 78 80 ... $ HEIGHT: int 166 170 170 171 169 171 169 170 167 166 ... > boxplot(WEIGHT ~ COLOR, data=hwc, ylab="Weight, kg") > par(oldpar); dev.off() pdf 2 > sapply(hwc[sapply(hwc, is.numeric)], Normality) # shipunov WEIGHT HEIGHT "NORMAL" "NORMAL" > tapply(hwc$WEIGHT, hwc$COLOR, var) black blond brown 8.805747 9.219540 8.896552 > wc.aov <- aov(WEIGHT ~ COLOR, data=hwc) > summary(wc.aov) Df Sum Sq Mean Sq F value Pr(>F) COLOR 2 435.1 217.54 24.24 4.29e-09 *** Residuals 87 780.7 8.97 --- Signif. 
codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 > df1 <- 2 > df2 <- 87 > group.size <- 30 > (sq.between <- sum(tapply(hwc$WEIGHT, hwc$COLOR, + function(.x) (mean(.x) - mean(hwc$WEIGHT))^2))*group.size) [1] 435.0889 > (mean.sq.between <- sq.between/df1) [1] 217.5444 > (sq.within <- sum(tapply(hwc$WEIGHT, hwc$COLOR, + function(.x) sum((.x - mean(.x))^2)))) [1] 780.7333 > (mean.sq.within <- sq.within/df2) [1] 8.973946 > (f.value <- mean.sq.between/mean.sq.within) [1] 24.24178 > (p.value <- (1 - pf(f.value, df1, df2))) [1] 4.285683e-09 > aggregate(hwc[, -1], by=list(COLOR=hwc[, 1]), var) COLOR WEIGHT HEIGHT 1 black 8.805747 9.154023 2 blond 9.219540 8.837931 3 brown 8.896552 9.288506 > bartlett.test(WEIGHT ~ COLOR, data=hwc) Bartlett test of homogeneity of variances data: WEIGHT by COLOR Bartlett's K-squared = 0.016654, df = 2, p-value = 0.9917 > fligner.test(WEIGHT ~ COLOR, data=hwc) Fligner-Killeen test of homogeneity of variances data: WEIGHT by COLOR Fligner-Killeen:med chi-squared = 1.1288, df = 2, p-value = 0.5687 > Normality(wc.aov$residuals) [1] "NORMAL" > Eta2 <- function(aov) + { + summary.lm(aov)$r.squared + } > (ewc <- Eta2(wc.aov)) [1] 0.3578557 > Mag(ewc) # shipunov [1] "high" > Normality(chickwts$weight) [1] "NORMAL" > bartlett.test(weight ~ feed, data=chickwts) Bartlett test of homogeneity of variances data: weight by feed Bartlett's K-squared = 3.2597, df = 5, p-value = 0.66 > boxplot(weight ~ feed, data=chickwts) > chicks.aov <- aov(weight ~ feed, data=chickwts) > summary(chicks.aov) Df Sum Sq Mean Sq F value Pr(>F) feed 5 231129 46226 15.37 5.94e-10 *** Residuals 65 195556 3009 --- Signif. 
codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 > Eta2(chicks.aov) [1] 0.5416855 > Mag(Eta2(chicks.aov)) # shipunov [1] "very high" > pairwise.t.test(hwc$WEIGHT, hwc$COLOR) Pairwise comparisons using t tests with pooled SD data: hwc$WEIGHT and hwc$COLOR black blond blond 1.7e-08 - brown 8.4e-07 0.32 P value adjustment method: holm > TukeyHSD(wc.aov) Tukey multiple comparisons of means 95% family-wise confidence level Fit: aov(formula = WEIGHT ~ COLOR, data = hwc) $COLOR diff lwr upr p adj blond-black -5.0000000 -6.844335 -3.155665 0.0000000 brown-black -4.2333333 -6.077668 -2.388999 0.0000013 brown-blond 0.7666667 -1.077668 2.611001 0.5843745 > pairwise.Eff(hwc$WEIGHT, hwc$COLOR, eff="cohen.d") # shipunov black blond brown black blond 1.67 (large) brown 1.42 (large) -0.25 (small) > Normality(PlantGrowth$weight) [1] "NORMAL" > bartlett.test(weight ~ group, data=PlantGrowth) Bartlett test of homogeneity of variances data: weight by group Bartlett's K-squared = 2.8786, df = 2, p-value = 0.2371 > plants.aov <- aov(weight ~ group, data=PlantGrowth) > summary(plants.aov) Df Sum Sq Mean Sq F value Pr(>F) group 2 3.766 1.8832 4.846 0.0159 * Residuals 27 10.492 0.3886 --- Signif. 
codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 > Eta2(plants.aov) [1] 0.2641483 > Mag(Eta2(plants.aov)) # shipunov [1] "high" > boxplot(weight ~ group, data=PlantGrowth) > with(PlantGrowth, pairwise.t.test(weight, group)) Pairwise comparisons using t tests with pooled SD data: weight and group ctrl trt1 trt1 0.194 - trt2 0.175 0.013 P value adjustment method: holm > boxplot(WEIGHT ~ COLOR, data=hwc2) # shipunov > sapply(hwc2[, 2:3], Normality) # shipunov WEIGHT HEIGHT "NORMAL" "NORMAL" > tapply(hwc2$WEIGHT, hwc2$COLOR, var) black blond brown 62.27126 23.45862 31.11379 > bartlett.test(WEIGHT ~ COLOR, data=hwc2) Bartlett test of homogeneity of variances data: WEIGHT by COLOR Bartlett's K-squared = 7.4914, df = 2, p-value = 0.02362 > oneway.test(WEIGHT ~ COLOR, data=hwc2) One-way analysis of means (not assuming equal variances) data: WEIGHT and COLOR F = 7.0153, num df = 2.000, denom df = 56.171, p-value = 0.001907 > (e2 <- Eta2(aov(WEIGHT ~ COLOR, data=hwc2))) [1] 0.1626432 > Mag(e2) [1] "medium" > pairwise.t.test(hwc2$WEIGHT, hwc2$COLOR) # most applicable post hoc Pairwise comparisons using t tests with pooled SD data: hwc2$WEIGHT and hwc2$COLOR black blond blond 0.00047 - brown 0.00797 0.32349 P value adjustment method: holm > Normality(InsectSprays$count) # shipunov [1] "NOT NORMAL" > Normality(sqrt(InsectSprays$count)) [1] "NORMAL" > bartlett.test(sqrt(count) ~ spray, data=InsectSprays)$p.value [1] 0.5855673 > boxplot(WEIGHT ~ COLOR, data=hwc3) > sapply(hwc3[, 2:3], Normality) # shipunov WEIGHT HEIGHT "NOT NORMAL" "NOT NORMAL" > kruskal.test(WEIGHT ~ COLOR, data=hwc3) Kruskal-Wallis rank sum test data: WEIGHT by COLOR Kruskal-Wallis chi-squared = 32.859, df = 2, p-value = 7.325e-08 > Epsilon2 <- function(kw, n) # n is the number of cases + { + unname(kw$statistic/((n^2 - 1)/(n+1))) + } > kw <- kruskal.test(WEIGHT ~ COLOR, data=hwc3) > Epsilon2(kw, nrow(hwc3)) [1] 0.3691985 > Mag(Epsilon2(kw, nrow(hwc3))) # shipunov [1] "high" > boxplot(WEIGHT ~ COLOR, 
data=hwc3) > pairwise.wilcox.test(hwc3$WEIGHT, hwc3$COLOR) Warning in wilcox.test.default(xi, xj, paired = paired, ...) : cannot compute exact p-value with ties Warning in wilcox.test.default(xi, xj, paired = paired, ...) : cannot compute exact p-value with ties Warning in wilcox.test.default(xi, xj, paired = paired, ...) : cannot compute exact p-value with ties Pairwise comparisons using Wilcoxon rank sum test with continuity correction data: hwc3$WEIGHT and hwc3$COLOR black blond blond 1.1e-06 - brown 1.6e-05 0.056 P value adjustment method: holm > library(dunn.test) > dunn.test(hwc3$WEIGHT, hwc3$COLOR, method="holm", altp=TRUE) Kruskal-Wallis rank sum test data: x and group Kruskal-Wallis chi-squared = 32.8587, df = 2, p-value = 0 Comparison of x by group (Holm) Col Mean-| Row Mean | black blond ---------+---------------------- blond | 5.537736 | 0.0000* | brown | 4.051095 -1.486640 | 0.0001* 0.1371 alpha = 0.05 Reject Ho if p <= alpha > pdf(file="pics/anovas_to_crop.pdf") > library(plotrix) > plot(c(-1, 1), c(-1, 1), type="n", xlab="", ylab="", axes=FALSE) > draw.circle(0, .58, .3) > draw.circle(0, .29, .6) > draw.circle(0, 0, .9) > text(0, .75, "normal, \nvariances\nsimilar:", font=3) > text(0, .48, "aov()\nTukeyHSD()", font=2) > text(0, .12, "normal, \nvariances different:", font=3) > text(0, -.12, "oneway.test()\npairwise.t.test()", font=2) > text(0, -.5, "not normal:", font=3) > text(0, -.72, "kruskal.test()\npairwise.wilcox.test()\n or dunn.test()", font=2) > dev.off() pdf 2 > # \subsection{More than one way} > > pdf(file="pics/49740.pdf"); oldpar <- par(mar=c(2, 4, 0, 1)) > with(ToothGrowth, interaction.plot(supp, dose, len)) > par(oldpar); dev.off() pdf 2 > # \section{Is there an association? 
Analysis of tables} > > # \subsection{Contingency tables} > > tea <- read.table("data/tea.txt", h=TRUE) > head(tea) GUESS REALITY 1 Milk Milk 2 Milk Milk 3 Milk Milk 4 Milk Tea 5 Tea Tea 6 Tea Tea > (tea.t <- table(tea)) REALITY GUESS Milk Tea Milk 3 1 Tea 1 3 > xtabs(~ GUESS + REALITY, data=tea) REALITY GUESS Milk Tea Milk 3 1 Tea 1 3 > ftable(Titanic) Survived No Yes Class Sex Age 1st Male Child 0 5 Adult 118 57 Female Child 0 1 Adult 4 140 2nd Male Child 0 11 Adult 154 14 Female Child 0 13 Adult 13 80 3rd Male Child 35 13 Adult 387 75 Female Child 17 14 Adult 89 76 Crew Male Child 0 0 Adult 670 192 Female Child 0 0 Adult 3 20 > d <- rep(LETTERS[1:3], 10) > is.na(d) <- 3:4 > d [1] "A" "B" NA NA "B" "C" "A" "B" "C" "A" "B" "C" "A" "B" "C" "A" "B" "C" "A" [20] "B" "C" "A" "B" "C" "A" "B" "C" "A" "B" "C" > table(d, useNA="ifany") d A B C 9 10 9 2 > pdf(file="pics/21660.pdf"); oldpar <- par(mar=c(2, 2, 0, 1)) > titanic <- apply(Titanic, c(1, 4), sum) > titanic Survived Class No Yes 1st 122 203 2nd 167 118 3rd 528 178 Crew 673 212 > mosaicplot(titanic, col=c("#485392", "#204F15"), + main="", cex.axis=1) > par(oldpar); dev.off() pdf 2 > comfort <- ifelse(airquality$Temp < 64 | airquality$Temp > 86, + "uncomfortable", "comfortable") > comf.month <- table(comfort, airquality$Month) > comf.month comfort 5 6 7 8 9 comfortable 18 25 24 21 24 uncomfortable 13 5 7 10 6 > pdf(file="pics/31120.pdf"); oldpar <- par(mar=c(2, 4, 1, 2)) > spineplot(t(comf.month)) > par(oldpar); dev.off() pdf 2 > # \subsection{Table tests} > > chisq.test(comf.month) Pearson's Chi-squared test data: comf.month X-squared = 6.6499, df = 4, p-value = 0.1556 > df <- 4 > (expected <- outer(rowSums(comf.month), + colSums(comf.month), "*")/sum(comf.month)) 5 6 7 8 9 comfortable 22.69281 21.960784 22.69281 22.69281 21.960784 uncomfortable 8.30719 8.039216 8.30719 8.30719 8.039216 > (chi.squared <- sum((comf.month - expected)^2/expected)) [1] 6.649898 > (p.value <- 1 - pchisq(chi.squared, df)) [1] 0.1555872 > 
pdf(file="pics/21960.pdf"); oldpar <- par(mar=c(2, 2, 0, 1)) > (HE <- margin.table(HairEyeColor, 1:2)) Eye Hair Brown Blue Hazel Green Black 68 20 15 5 Brown 119 84 54 29 Red 26 17 14 14 Blond 7 94 10 16 > assocplot(HE) > par(oldpar); dev.off() pdf 2 > chisq.test(HE) Pearson's Chi-squared test data: HE X-squared = 138.29, df = 9, p-value < 2.2e-16 > chisq.test(titanic) Pearson's Chi-squared test data: titanic X-squared = 190.4, df = 3, p-value < 2.2e-16 > pairwise.Table2.test(titanic) # shipunov Pairwise comparisons using Pearson's Chi-squared test data: titanic 1st 2nd 3rd 2nd 4.7e-07 - - 3rd < 2e-16 8.3e-07 - Crew < 2e-16 3.9e-08 0.6 P value adjustment method: BH > tox <- read.table("data/poisoning.txt", h=TRUE) > head(tox) ILL CHEESE CRABDIP CRISPS BREAD CHICKEN RICE CAESAR TOMATO ICECREAM CAKE 1 1 1 1 1 2 1 1 1 1 1 1 2 2 1 1 1 2 1 2 2 2 1 1 3 1 2 2 1 2 1 2 1 2 1 1 4 1 1 2 1 1 1 2 1 2 1 1 5 1 1 1 1 2 1 1 1 1 2 1 6 1 1 1 1 1 1 2 1 1 2 1 JUICE WINE COFFEE 1 1 1 1 2 1 1 2 3 2 1 2 4 2 1 2 5 1 1 1 6 1 2 2 > tox.1 <- lapply(tox[, -1], function(.x) table(tox[, 1], .x)) > tox.2 <- array(unlist(tox.1), + dim=c(dim(tox.1[[1]]), length(tox.1))) # or simply c(2, 2, 13) > dimnames(tox.2) <- list(c("ill","not ill"), + c("took","didn't take"), names(tox.1)) > tox.2[,,"TOMATO"] took didn't take ill 24 5 not ill 6 10 > pdf(file="pics/33940_to_crop.pdf") > fourfoldplot(tox.2, conf.level=0, col=c("yellow","black")) > dev.off() pdf 2 > cbind(apply(tox.2, 3, function(.x) chisq.test(.x)$p.value)) Warning in chisq.test(.x) : Chi-squared approximation may be incorrect Warning in chisq.test(.x) : Chi-squared approximation may be incorrect Warning in chisq.test(.x) : Chi-squared approximation may be incorrect Warning in chisq.test(.x) : Chi-squared approximation may be incorrect Warning in chisq.test(.x) : Chi-squared approximation may be incorrect Warning in chisq.test(.x) : Chi-squared approximation may be incorrect Warning in chisq.test(.x) : Chi-squared approximation may be incorrect 
[,1] CHEESE 0.8408996794 CRABDIP 0.9493138514 CRISPS 1.0000000000 BREAD 0.3498177243 CHICKEN 0.3114822175 RICE 0.5464344359 CAESAR 0.0002034102 TOMATO 0.0059125029 ICECREAM 0.5977125948 CAKE 0.8694796709 JUICE 1.0000000000 WINE 1.0000000000 COFFEE 0.7265552461 > p.adjust(c(0.005, 0.05, 0.1), method="BH") [1] 0.015 0.075 0.100 > chisq.test(c(5474, 1850), p=c(3/4, 1/4)) Chi-squared test for given probabilities data: c(5474, 1850) X-squared = 0.26288, df = 1, p-value = 0.6081 > pdf(file="pics/55440.pdf"); oldpar <- par(mar=c(4, 6, 3, 1)) > sp <- read.table("data/species.txt", sep="\t") > species <- sp[, 2] > names(species) <- sp[, 1] > Dotchart(rev(sort(log10(species))), + xlab="Decimal logarithm of species number") > chisq.test(species) Chi-squared test for given probabilities data: species X-squared = 4771684, df = 7, p-value < 2.2e-16 > par(oldpar); dev.off() pdf 2 > fisher.test(tea.t) Fisher's Exact Test for Count Data data: tea.t p-value = 0.4857 alternative hypothesis: true odds ratio is not equal to 1 95 percent confidence interval: 0.2117329 621.9337505 sample estimates: odds ratio 6.408309 > fourfoldplot(tea.t) > ee <- read.table("data/teeth.txt", h=TRUE) > chisq.test(table(ee)) Warning in chisq.test(table(ee)) : Chi-squared approximation may be incorrect Pearson's Chi-squared test with Yates' continuity correction data: table(ee) X-squared = 1.1398, df = 1, p-value = 0.2857 > summary(table(ee)) # No correction in summary.table() Number of cases in table: 42 Number of factors: 2 Test for independence of all factors: Chisq = 2.3858, df = 1, p-value = 0.1224 Chi-squared approximation may be incorrect > chisq.test(table(ee), simulate.p.value=TRUE) Pearson's Chi-squared test with simulated p-value (based on 2000 replicates) data: table(ee) X-squared = 2.3858, df = NA, p-value = 0.1754 > sqrt(chisq.test(tea.t, correct=FALSE)$statistic/sum(tea.t)) Warning in chisq.test(tea.t, correct = FALSE) : Chi-squared approximation may be incorrect X-squared 0.5 > (x <- 
margin.table(Titanic, 1:2)) Sex Class Male Female 1st 180 145 2nd 179 106 3rd 510 196 Crew 862 23 > VTcoeffs(x) # shipunov coefficients values comments 1 Cramer's V 0.3987227 medium 2 Cramer's V (corrected) 0.3970098 medium 3 Tschuprow's T 0.3029637 4 Tschuprow's T (corrected) 0.3016622 > # \section{Answers to exercises} > > # \subsection{Exercises on two samples} > > aa <- c(1, 2, 3, 4, 5, 6, 7, 8, 9) > bb <- c(5, 5, 5, 5, 5, 5, 5, 5, 5) > dif <- aa - bb > pos.dif <- dif[dif > 0] > prop.test(length(pos.dif), length(dif)) Warning in prop.test(length(pos.dif), length(dif)) : Chi-squared approximation may be incorrect 1-sample proportions test with continuity correction data: length(pos.dif) out of length(dif), null probability 0.5 X-squared = 0, df = 1, p-value = 1 alternative hypothesis: true p is not equal to 0.5 95 percent confidence interval: 0.1534306 0.7734708 sample estimates: p 0.4444444 > ozone.month <- airquality[, c("Ozone", "Month")] > ozone.month.list <- unstack(ozone.month) > sapply(ozone.month.list, Normality) # shipunov 5 6 7 8 9 "NOT NORMAL" "NORMAL" "NORMAL" "NORMAL" "NOT NORMAL" > sapply(unstack(ar, form=V2 ~ V1), Normality) air chemical "NORMAL" "NOT NORMAL" > wilcox.test(jitter(V2) ~ V1, data=ar) Wilcoxon rank sum exact test data: jitter(V2) by V1 W = 56, p-value = 0.0003108 alternative hypothesis: true location shift is not equal to 0 > cashiers <- read.table("data/cashiers.txt", h=TRUE) > head(cashiers) CASHIER.1 CASHIER.2 1 3 12 2 12 12 3 13 9 4 5 6 5 4 2 6 11 9 > (cashiers.m <- sapply(cashiers, median)) CASHIER.1 CASHIER.2 8 9 > with(cashiers, wilcox.test(CASHIER.1, CASHIER.2, alt="greater")) Warning in wilcox.test.default(CASHIER.1, CASHIER.2, alt = "greater") : cannot compute exact p-value with ties Wilcoxon rank sum test with continuity correction data: CASHIER.1 and CASHIER.2 W = 236, p-value = 0.3523 alternative hypothesis: true location shift is greater than 0 > grades <- read.table("data/grades.txt") > classes <- split(grades$V1, 
grades$V2) > sapply(classes, Normality) # shipunov A1 A2 B1 "NOT NORMAL" "NOT NORMAL" "NOT NORMAL" > sapply(classes, median, na.rm=TRUE) A1 A2 B1 4 4 5 > wilcox.test(classes$A1, classes$A2, paired=TRUE, conf.int=TRUE) Warning in wilcox.test.default(classes$A1, classes$A2, paired = TRUE, conf.int = TRUE) : cannot compute exact p-value with ties Warning in wilcox.test.default(classes$A1, classes$A2, paired = TRUE, conf.int = TRUE) : cannot compute exact confidence interval with ties Warning in wilcox.test.default(classes$A1, classes$A2, paired = TRUE, conf.int = TRUE) : cannot compute exact p-value with zeroes Warning in wilcox.test.default(classes$A1, classes$A2, paired = TRUE, conf.int = TRUE) : cannot compute exact confidence interval with zeroes Wilcoxon signed rank test with continuity correction data: classes$A1 and classes$A2 V = 15.5, p-value = 0.8605 alternative hypothesis: true location shift is not equal to 0 95 percent confidence interval: -1.499923 1.500018 sample estimates: (pseudo)median 0 > wilcox.test(classes$B1, classes$A1, alt="greater", conf.int=TRUE) Warning in wilcox.test.default(classes$B1, classes$A1, alt = "greater", : cannot compute exact p-value with ties Warning in wilcox.test.default(classes$B1, classes$A1, alt = "greater", : cannot compute exact confidence intervals with ties Wilcoxon rank sum test with continuity correction data: classes$B1 and classes$A1 W = 306, p-value = 0.02382 alternative hypothesis: true location shift is greater than 0 95 percent confidence interval: 6.957242e-05 Inf sample estimates: difference in location 6.160018e-05 > cliff.delta(classes$A1, classes$A2) Cliff's Delta delta estimate: 0.03557312 (negligible) 95 percent confidence interval: lower upper -0.2744203 0.3388690 > cliff.delta(classes$B1, classes$A1) Cliff's Delta delta estimate: 0.3246753 (small) 95 percent confidence interval: lower upper 0.0003599976 0.5871917025 > aa <- read.table( + "http://ashipunov.me/shipunov/open/aegopodium.txt", h=TRUE) > 
aa$SUN <- factor(aa$SUN, labels=c("shade","sun")) > Str(aa) 'data.frame': 100 obs. of 5 variables: 1 PET.L : num 25.5 24.1 27.4 24.9 26.2 37.4 18 30.3 26.1 17.8 ... 2 TERM.L: num 8.9 5.8 8.2 6.8 8.2 13.1 7.2 7.8 7.1 5.8 ... 3 LEAF.L: num 34.4 29.9 35.6 31.7 34.4 50.5 25.2 38.1 33.2 23.6 ... 4 BLADES: int 5 7 7 7 7 4 7 6 7 7 ... 5 SUN : Factor w/ 2 levels "shade","sun": 1 1 1 1 1 1 1 1 1 1 ... > pdf(file="pics/47860.pdf"); oldpar <- par(mar=c(1, 2, 0, 1)) > aggregate(aa[, -5], list(light=aa[, 5]), Normality) # shipunov light PET.L TERM.L LEAF.L BLADES 1 shade NORMAL NORMAL NORMAL NOT NORMAL 2 sun NORMAL NOT NORMAL NORMAL NOT NORMAL > Linechart(aa[, 1:4], aa[, 5], xmarks=FALSE, lcolor=1, + se.lwd=2, mad=TRUE) # shipunov > par(oldpar); dev.off() pdf 2 > t.test(LEAF.L ~ SUN, data=aa) Welch Two Sample t-test data: LEAF.L by SUN t = 14.846, df = 63.691, p-value < 2.2e-16 alternative hypothesis: true difference in means is not equal to 0 95 percent confidence interval: 14.20854 18.62746 sample estimates: mean in group shade mean in group sun 35.534 19.116 > library(effsize) > cohen.d(LEAF.L ~ SUN, data=aa) Cohen's d d estimate: 2.969218 (large) 95 percent confidence interval: lower upper 2.393787 3.544650 > summary(K(LEAF.L ~ SUN, data=aa)) Lubischew's K Effect P 4.41 Strong 0.07 > # \subsection{Exercises on ANOVA} > > summary(aov(HEIGHT ~ COLOR, data=hwc)) Df Sum Sq Mean Sq F value Pr(>F) COLOR 2 2787.3 1393.6 153.3 <2e-16 *** Residuals 87 791.1 9.1 --- Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 > pairwise.t.test(hwc$HEIGHT, hwc$COLOR) Pairwise comparisons using t tests with pooled SD data: hwc$HEIGHT and hwc$COLOR black blond blond < 2e-16 - brown 1.7e-10 3.3e-16 P value adjustment method: holm > mm <- read.table( + "http://ashipunov.me/shipunov/open/melampyrum.txt", h=TRUE) > Str(mm) 'data.frame': 126 obs. of 9 variables: 1 LOC : int 1 1 1 1 1 1 1 1 1 1 ... 2 P.HEIGHT : int 246 235 270 260 300 250 205 190 275 215 ... 
3 NODES * int NA NA NA NA NA NA NA NA NA NA ... 4 V.NODES * int NA NA NA NA NA NA NA NA NA NA ... 5 LEAF.L * int 23 27 35 20 38 46 17 22 42 26 ... 6 LEAF.W * int 5 3 4 4 6 5 3 3 4 3 ... 7 LEAF.MAXW* int 3 5 5 4 6 11 5 4 5 3 ... 8 TEETH * int 3 2 2 6 2 4 4 5 4 3 ... 9 TOOTH.L * int 4 2 2 5 6 2 11 3 5 4 ... > pdf(file="pics/61660.pdf") > old.par <- par(mfrow=c(2, 1), mai=c(0.5, 0.5, 0.1, 0.1)) > boxplot(P.HEIGHT ~ LOC, data=mm, col=grey(0.8)) > boxplot(LEAF.L ~ LOC, data=mm, col=rgb(173, 204, 90, max=255)) > par(old.par) > dev.off() pdf 2 > sapply(mm[, c(2, 5)], Normality) P.HEIGHT LEAF.L "NORMAL" "NOT NORMAL" > bartlett.test(P.HEIGHT ~ LOC, data=mm) Bartlett test of homogeneity of variances data: P.HEIGHT by LOC Bartlett's K-squared = 17.014, df = 6, p-value = 0.00923 > oneway.test(P.HEIGHT ~ LOC, data=mm) One-way analysis of means (not assuming equal variances) data: P.HEIGHT and LOC F = 18.376, num df = 6.000, denom df = 49.765, p-value = 4.087e-11 > pairwise.t.test(mm$P.HEIGHT, mm$LOC) Pairwise comparisons using t tests with pooled SD data: mm$P.HEIGHT and mm$LOC 1 2 3 4 5 6 2 0.05381 - - - - - 3 0.00511 1.1e-08 - - - - 4 1.00000 0.00779 0.00511 - - - 5 1.00000 0.04736 0.00041 1.00000 - - 6 4.2e-05 1.8e-11 0.62599 2.3e-05 1.1e-06 - 7 0.28824 1.9e-05 0.39986 0.39986 0.09520 0.01735 P value adjustment method: holm > kruskal.test(LEAF.L ~ LOC, data=mm) Kruskal-Wallis rank sum test data: LEAF.L by LOC Kruskal-Wallis chi-squared = 22.6, df = 6, p-value = 0.0009422 > pairwise.wilcox.test(mm$LEAF.L, mm$LOC) Warning in wilcox.test.default(xi, xj, paired = paired, ...) : cannot compute exact p-value with ties Warning in wilcox.test.default(xi, xj, paired = paired, ...) : cannot compute exact p-value with ties Warning in wilcox.test.default(xi, xj, paired = paired, ...) : cannot compute exact p-value with ties Warning in wilcox.test.default(xi, xj, paired = paired, ...) : cannot compute exact p-value with ties Warning in wilcox.test.default(xi, xj, paired = paired, ...) 
: cannot compute exact p-value with ties Warning in wilcox.test.default(xi, xj, paired = paired, ...) : cannot compute exact p-value with ties Warning in wilcox.test.default(xi, xj, paired = paired, ...) : cannot compute exact p-value with ties Warning in wilcox.test.default(xi, xj, paired = paired, ...) : cannot compute exact p-value with ties Warning in wilcox.test.default(xi, xj, paired = paired, ...) : cannot compute exact p-value with ties Warning in wilcox.test.default(xi, xj, paired = paired, ...) : cannot compute exact p-value with ties Warning in wilcox.test.default(xi, xj, paired = paired, ...) : cannot compute exact p-value with ties Warning in wilcox.test.default(xi, xj, paired = paired, ...) : cannot compute exact p-value with ties Warning in wilcox.test.default(xi, xj, paired = paired, ...) : cannot compute exact p-value with ties Warning in wilcox.test.default(xi, xj, paired = paired, ...) : cannot compute exact p-value with ties Warning in wilcox.test.default(xi, xj, paired = paired, ...) : cannot compute exact p-value with ties Warning in wilcox.test.default(xi, xj, paired = paired, ...) : cannot compute exact p-value with ties Warning in wilcox.test.default(xi, xj, paired = paired, ...) : cannot compute exact p-value with ties Warning in wilcox.test.default(xi, xj, paired = paired, ...) : cannot compute exact p-value with ties Warning in wilcox.test.default(xi, xj, paired = paired, ...) : cannot compute exact p-value with ties Warning in wilcox.test.default(xi, xj, paired = paired, ...) : cannot compute exact p-value with ties Warning in wilcox.test.default(xi, xj, paired = paired, ...) 
: cannot compute exact p-value with ties Pairwise comparisons using Wilcoxon rank sum test with continuity correction data: mm$LEAF.L and mm$LOC 1 2 3 4 5 6 2 0.6249 - - - - - 3 1.0000 0.1538 - - - - 4 0.4999 0.0064 1.0000 - - - 5 1.0000 0.5451 1.0000 1.0000 - - 6 0.6599 1.0000 0.1434 0.0028 0.5451 - 7 1.0000 1.0000 1.0000 0.5904 1.0000 1.0000 P value adjustment method: holm > # \subsection{Exercises on tables} > > pr <- read.table("data/seedlings.txt", h=TRUE) > str(pr) 'data.frame': 80 obs. of 2 variables: $ CID : int 63 63 63 63 63 63 63 63 63 63 ... $ GERM.14: int 1 1 1 1 1 1 1 1 1 1 ... > pdf(file="pics/59370.pdf") > (pr.t <- table(pr)) GERM.14 CID 0 1 0 1 19 63 3 17 80 17 3 105 10 10 > Dotchart(t(pr.t)) > par(oldpar) > pdf(file="pics/47350_to_crop.pdf") > library(vcd) Loading required package: grid > assoc(pr.t, shade=TRUE, gp=shading_Friendly2, + gp_args=list(interpolate=c(1, 1.8))) > par(oldpar) > chisq.test(pr.t, simulate=TRUE) Pearson's Chi-squared test with simulated p-value (based on 2000 replicates) data: pr.t X-squared = 33.443, df = NA, p-value = 0.0004998 > pairwise.Table2.test(table(pr), exact=TRUE) Pairwise comparisons using Fisher's Exact Test data: table(pr) 0 63 80 63 0.6050 - - 80 2.4e-06 5.8e-05 - 105 0.0067 0.0489 0.0489 P value adjustment method: BH > tox <- read.table("data/poisoning.txt", h=TRUE) > tox.p.values <- apply(tox[, -1], 2, + function(.x) fisher.test(table(tox[, 1], .x))$p.value) > sort(round(p.adjust(tox.p.values, method="BH"), 3)) CAESAR TOMATO CHEESE CRABDIP BREAD CHICKEN RICE ICECREAM 0.001 0.021 0.985 0.985 0.985 0.985 0.985 0.985 CAKE COFFEE CRISPS JUICE WINE 0.985 0.985 1.000 1.000 1.000 > cc <-read.table( + "http://ashipunov.me/shipunov/open/cochlearia.txt", h=TRUE) > cc$LOC <- factor(cc$LOC, labels=paste0("loc", levels(cc$LOC))) > cc$IS.CREEPING <- factor(cc$IS.CREEPING, + labels=c("upright", "creeping")) > str(cc) 'data.frame': 174 obs. 
of 8 variables: $ LOC : Factor w/ 8 levels "loc1","loc2",..: 1 1 1 1 1 1 1 1 1 1 ... $ STEM : int 162 147 170 432 146 207 91 166 204 172 ... $ IS.CREEPING: Factor w/ 2 levels "upright","creeping": 1 1 1 1 1 1 1 2 2 2 ... $ LATERAL : int 0 0 0 0 0 0 0 0 0 0 ... $ PETAL.L : num NA NA NA NA NA NA NA 3 NA NA ... $ FRUIT.L : int 6 7 9 7 7 6 7 6 7 8 ... $ FRUIT.W : int 4 3 7 5 4 5 4 4 5 5 ... $ SEED.S : int 2 2 1 1 1 1 1 2 1 1 ... > pdf(file="pics/51170.pdf"); oldpar <- par(mar=c(4, 4, 1, 2)) > s.cols <- colorRampPalette(c("white", "forestgreen"))(5)[3:4] > spineplot(IS.CREEPING ~ LOC, data=cc, col=s.cols) > par(oldpar); dev.off() pdf 2 > (cc.lc <- xtabs(~ LOC + IS.CREEPING, data=cc)) IS.CREEPING LOC upright creeping loc1 15 24 loc2 39 0 loc3 0 1 loc4 5 0 loc5 15 0 loc6 46 0 loc7 15 0 loc8 7 7 > chisq.test(cc.lc, simulate.p.value=TRUE) Pearson's Chi-squared test with simulated p-value (based on 2000 replicates) data: cc.lc X-squared = 89.177, df = NA, p-value = 0.0004998 > VTcoeffs(cc.lc)[2, ] # shipunov Warning in chisq.test(table, correct = correct, ...) 
: Chi-squared approximation may be incorrect coefficients values comments 2 Cramer's V (corrected) 0.6872265 large > (betula.ll <- table(betula[betula$LOC < 3, c("LOC","LOBES")])) LOBES LOC 0 1 1 17 4 2 14 16 > pdf(file="pics/48360.pdf"); oldpar <- par(mar=c(4, 4, 0, 2)) > birch.cols <- colorRampPalette(c("black", "forestgreen"))(5)[3:4] > spineplot(betula.ll, col=birch.cols) > par(oldpar); dev.off() pdf 2 > prop.test(betula.ll) 2-sample test for equality of proportions with continuity correction data: betula.ll X-squared = 4.7384, df = 1, p-value = 0.0295 alternative hypothesis: two.sided 95 percent confidence interval: 0.05727638 0.62843791 sample estimates: prop 1 prop 2 0.8095238 0.4666667 > fisher.test(betula.ll) Fisher's Exact Test for Count Data data: betula.ll p-value = 0.01987 alternative hypothesis: true odds ratio is not equal to 1 95 percent confidence interval: 1.156525 23.904424 sample estimates: odds ratio 4.704463 > chisq.test(betula.ll) Pearson's Chi-squared test with Yates' continuity correction data: betula.ll X-squared = 4.7384, df = 1, p-value = 0.0295 > VTcoeffs(betula.ll)[2, ] # shipunov coefficients values comments 2 Cramer's V (corrected) 0.3159734 medium > (betula.lw <- table(betula[, c("LOBES", "WINGS")])) WINGS LOBES 0 1 0 61 69 1 50 45 > pdf(file="pics/50140.pdf") > fourfoldplot(betula.lw, col=birch.cols) > dev.off() pdf 2 > mcnemar.test(betula.lw) McNemar's Chi-squared test with continuity correction data: betula.lw McNemar's chi-squared = 2.7227, df = 1, p-value = 0.09893 > # \chapter{Two-dimensional data: models} > > # \section{Analysis of correlation} > > # \subsection{Plot it first} > > classic.desc <- function(.x) {c(mean=mean(.x, na.rm=TRUE), + var=var(.x, na.rm=TRUE))} > sapply(anscombe, classic.desc) x1 x2 x3 x4 y1 y2 y3 y4 mean 9 9 9 9 7.500909 7.500909 7.50000 7.500909 var 11 11 11 11 4.127269 4.127629 4.12262 4.123249 > pdf(file="pics/anscombe.pdf"); oldpar <- par(mar=c(4, 4, 2, 1)) > a.vars <- data.frame(i=c(1, 5), ii=c(2, 
6), + iii=c(3, 7), iv=c(4, 8)) > oldpar <- par(mfrow=c(2, 2), mar=c(4, 4, 1, 1)) > for (i in 1:4) { plot(anscombe[a.vars[, i]], pch=19, cex=1.2); + abline(lm(anscombe[rev(a.vars[, i])]), lty=2) } > par(oldpar); dev.off() pdf 2 > robust.desc <- function(.x) {c(median=median(.x, na.rm=TRUE), + IQR=IQR(.x, na.rm=TRUE), mad=mad(.x, na.rm=TRUE))} > sapply(anscombe, robust.desc) x1 x2 x3 x4 y1 y2 y3 y4 median 9.0000 9.0000 9.0000 8 7.580000 8.140000 7.110000 7.040000 IQR 5.0000 5.0000 5.0000 0 2.255000 2.255000 1.730000 2.020000 mad 4.4478 4.4478 4.4478 0 1.823598 1.467774 1.527078 1.897728 > # \subsection{Correlation} > > cor(5:15, 7:17) [1] 1 > cor(5:15, c(7:16, 23)) [1] 0.9375093 > cor(trees) Girth Height Volume Girth 1.0000000 0.5192801 0.9671194 Height 0.5192801 1.0000000 0.5982497 Volume 0.9671194 0.5982497 1.0000000 > with(trees, cor(Girth, Height)) [1] 0.5192801 > (v1 <- var(trees$Girth)) [1] 9.847914 > (v2 <- var(trees$Height)) [1] 40.6 > (v12 <- var(trees$Girth - trees$Height)) [1] 29.68125 > (pearson.r <- (v1 + v2 - v12)/(2*sqrt(v1)*sqrt(v2))) [1] 0.5192801 > with(trees, cov(Girth, Height)/(sd(Girth)*sd(Height))) [1] 0.5192801 > noquote(apply(cor(trees), 1:2, + function(.x) Mag(.x, squared=FALSE))) # shipunov Girth Height Volume Girth very high high very high Height high very high high Volume very high high very high > diag(cor(anscombe[, 1:4], anscombe[, 5:8])) [1] 0.8164205 0.8162365 0.8162867 0.8165214 > with(trees, cor(Girth, Height, method="spearman")) [1] 0.4408387 > diag(cor(anscombe[, 1:4], anscombe[, 5:8], method="s")) [1] 0.8181818 0.6909091 0.9909091 0.5000000 > with(trees, cor(Girth, Height, method="k")) [1] 0.3168641 > diag(cor(anscombe[, 1:4], anscombe[, 5:8], method="k")) [1] 0.6363636 0.5636364 0.9636364 0.4264014 > with(trees, cor.test(Girth, Height)) Pearson's product-moment correlation data: Girth and Height t = 3.2722, df = 29, p-value = 0.002758 alternative hypothesis: true correlation is not equal to 0 95 percent confidence interval: 
0.2021327 0.7378538 sample estimates: cor 0.5192801 > symnum(cor(longley)) GNP. GNP U A P Y E GNP.deflator 1 GNP B 1 Unemployed , , 1 Armed.Forces . . 1 Population B B , . 1 Year B B , . B 1 Employed B B . . B B 1 attr(,"legend") [1] 0 ‘ ’ 0.3 ‘.’ 0.6 ‘,’ 0.8 ‘+’ 0.9 ‘*’ 0.95 ‘B’ 1 > pdf(file="pics/22880_to_crop.pdf") > library(lattice) > mycorr <- abs(cor(iris[, -5])) > levelplot(mycorr, xlab="", ylab="") # axes names a redundant > par(oldpar); dev.off() pdf 2 > pdf(file="pics/23120_to_crop.pdf") > library(ellipse) Attaching package: ‘ellipse’ The following object is masked from ‘package:graphics’: pairs > colors <- cm.colors(7) > cor.l <- cor(longley) > plotcorr(cor.l, type="lower", col=colors[5*cor.l + 2]) > dev.off() pdf 2 > detach(package:ellipse) # because it masks base::pairs() > pdf(file="pics/35870.pdf") > tox.cor <- cor(tox, method="k") > Pleiad(tox.cor, corr=TRUE, lcol="black") # shipunov > dev.off() pdf 2 > Cor(tox, method="kendall", dec=2) # shipunov Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) 
: Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) 
: Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) 
: Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) 
: Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) 
: Cannot compute exact p-value with ties ILL CHEESE CRABDIP CRISPS BREAD CHICKEN RICE CAESAR TOMATO ILL - 0.08 0.06 -0.03 -0.19 0.21 0.14 0.6* 0.46* CHEESE 0.08 - -0.38* -0.11 0.34* 0.04 0.03 0.08 0.22 CRABDIP 0.06 -0.38* - 0.64* -0.3* 0.27 0.18 0.04 0.19 CRISPS -0.03 -0.11 0.64* - -0.05 0.33* 0.17 0.25 0.21 BREAD -0.19 0.34* -0.3* -0.05 - 0.05 0.09 0.03 -0.03 CHICKEN 0.21 0.04 0.27 0.33* 0.05 - 0.17 0.02 0.12 RICE 0.14 0.03 0.18 0.17 0.09 0.17 - 0.1 0.28 CAESAR 0.6* 0.08 0.04 0.25 0.03 0.02 0.1 - 0.64* TOMATO 0.46* 0.22 0.19 0.21 -0.03 0.12 0.28 0.64* - ICECREAM 0.13 -0.02 -0.13 -0.13 -0.09 0.21 0.04 -0.2 -0.13 CAKE 0.08 0.09 0.02 0.09 -0.26 0.08 -0.05 -0.08 0.11 JUICE -0.04 0.06 0.32* 0.16 0.11 0.07 0.08 -0.21 -0.17 WINE 0.01 0.14 -0.04 0.12 0.02 -0.03 -0.02 0.06 0.04 COFFEE -0.1 0.12 0.21 0.05 0.04 0.18 0.41* -0.13 0.03 ICECREAM CAKE JUICE WINE COFFEE ILL 0.13 0.08 -0.04 0.01 -0.1 CHEESE -0.02 0.09 0.06 0.14 0.12 CRABDIP -0.13 0.02 0.32* -0.04 0.21 CRISPS -0.13 0.09 0.16 0.12 0.05 BREAD -0.09 -0.26 0.11 0.02 0.04 CHICKEN 0.21 0.08 0.07 -0.03 0.18 RICE 0.04 -0.05 0.08 -0.02 0.41* CAESAR -0.2 -0.08 -0.21 0.06 -0.13 TOMATO -0.13 0.11 -0.17 0.04 0.03 ICECREAM - -0.13 -0.14 0.01 -0.1 CAKE -0.13 - 0.16 0.01 -0.16 JUICE -0.14 0.16 - -0.32* -0.01 WINE 0.01 0.01 -0.32* - -0.12 COFFEE -0.1 -0.16 -0.01 -0.12 - > Topm(tox.cor, level=0.4) # shipunov Var1 Var2 Value Magnitude 1 TOMATO CAESAR 0.6449020 high 2 CRISPS CRABDIP 0.6359727 high 3 CAESAR ILL 0.6039006 high 4 TOMATO ILL 0.4595725 medium 5 COFFEE RICE 0.4134925 medium > # \section{Analysis of regression} > > # \subsection{Single line} > > pdf(file="pics/23870.pdf"); oldpar <- par(mar=c(4, 4, 1, 1)) > women.lm <- lm(weight ~ height, data=women) > plot(weight ~ height, data=women, + xlab="Height, in", ylab="Weight, lb") > grid() > abline(women.lm, col="red") > Cladd(women.lm, data=women) # shipunov > legend("bottomright", col=2:1, lty=1:2, + legend=c("linear relationship", "95% confidence bands")) > par(oldpar); 
dev.off() pdf 2 > pdf(file="pics/56910.pdf"); oldpar <- par(mar=c(4, 4, 1, 1)) > plot(weight ~ height, data=women, pch=19, col="red") > abline(women.lm) > with(women, segments(height, fitted(women.lm), height, weight, + col="red")) > par(oldpar); dev.off() pdf 2 > summary(women.lm) Call: lm(formula = weight ~ height, data = women) Residuals: Min 1Q Median 3Q Max -1.7333 -1.1333 -0.3833 0.7417 3.1167 Coefficients: Estimate Std. Error t value Pr(>|t|) (Intercept) -87.51667 5.93694 -14.74 1.71e-09 *** height 3.45000 0.09114 37.85 1.09e-14 *** --- Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 Residual standard error: 1.525 on 13 degrees of freedom Multiple R-squared: 0.991, Adjusted R-squared: 0.9903 F-statistic: 1433 on 1 and 13 DF, p-value: 1.091e-14 > Mag(0.9903) # shipunov [1] "very high" > (atan(women.lm$coefficients[[2]]) * 180)/pi [1] 73.8355 > oldpar <- par(mfrow=c(3, 3)) > ## Uniform variation and no trend: > > for (i in 1:9) plot(1:50, rnorm(50), xlab="Fitted", + ylab="Residuals") > title("'Good' Residuals vs. Fitted", outer=TRUE, line=-2) > ## Non-uniform variation plus trend: > > for (i in 1:9) plot(1:50, ((1:50)*rnorm(50) + 50:1), + xlab="Fitted",ylab="Residuals") > title("'Bad' Residuals vs. Fitted", outer=TRUE, line=-2) > par(oldpar) > oldpar <- par(mfrow=c(3, 3)) > for (i in 1:9) { aa <- rnorm(50); + qqnorm(aa, main=""); qqline(aa) } > title("'Good' normality QQ plots", outer=TRUE, line=-2) > for (i in 1:9) { aa <- rnorm(50)^2; + qqnorm(aa, main=""); qqline(aa) } > title("'Bad' normality QQ plots", outer=TRUE, line=-2) > par(oldpar) > Normality(women.lm$residuals) # shipunov [1] "NORMAL" > pdf(file="pics/38550.pdf"); oldpar <- par(mar=c(4, 4, 2, 1)) > women.lm3 <- lm(weight ~ height + I(height^3), data=women) > summary(women.lm3) Call: lm(formula = weight ~ height + I(height^3), data = women) Residuals: Min 1Q Median 3Q Max -0.49318 -0.26391 -0.02908 0.28097 0.54213 Coefficients: Estimate Std. 
Error t value Pr(>|t|) (Intercept) 1.466e+02 1.589e+01 9.223 8.52e-07 *** height -1.981e+00 3.679e-01 -5.385 0.000164 *** I(height^3) 4.274e-04 2.890e-05 14.788 4.57e-09 *** --- Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 Residual standard error: 0.362 on 12 degrees of freedom Multiple R-squared: 0.9995, Adjusted R-squared: 0.9995 F-statistic: 1.282e+04 on 2 and 12 DF, p-value: < 2.2e-16 > plot(women.lm3, which=1) # just residuals vs. fitted > par(oldpar); dev.off() pdf 2 > eggs <- read.table("data/eggs.txt") > eggs.lm <- lm(V2 ~ V1, data=eggs) > plot(eggs.lm) > summary(eggs.lm) Call: lm(formula = V2 ~ V1, data = eggs) Residuals: Min 1Q Median 3Q Max -7.4934 -1.7508 -0.0615 1.8350 11.2049 Coefficients: Estimate Std. Error t value Pr(>|t|) (Intercept) 1.84827 0.49581 3.728 0.000213 *** V1 0.69823 0.01212 57.597 < 2e-16 *** --- Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 Residual standard error: 2.617 on 553 degrees of freedom Multiple R-squared: 0.8571, Adjusted R-squared: 0.8569 F-statistic: 3317 on 1 and 553 DF, p-value: < 2.2e-16 > confint(eggs.lm) 2.5 % 97.5 % (Intercept) 0.8743690 2.8221677 V1 0.6744162 0.7220408 > Mag(summary(eggs.lm)$adj.r.squared) [1] "very high" > ee <- read.table("http://ashipunov.me/data/exams.txt", h=TRUE) > str(ee) 'data.frame': 201 obs. of 3 variables: $ EXAM.N : int 3 3 3 3 3 3 3 3 3 3 ... $ ORDER : int 1 2 3 4 5 6 7 8 9 10 ... $ POINTS.50: int 42 23 30 32 27 19 37 30 36 28 ... > ee3 <- ee[ee$EXAM.N == 3,] > plot(POINTS.50 ~ ORDER, data=ee3) > ee3.lm <- lm(POINTS.50 ~ ORDER, data=ee3) > summary(ee3.lm) Call: lm(formula = POINTS.50 ~ ORDER, data = ee3) Residuals: Min 1Q Median 3Q Max -16.0118 -4.7561 0.4708 4.4344 13.4695 Coefficients: Estimate Std. Error t value Pr(>|t|) (Intercept) 33.35273 1.24634 26.761 <2e-16 *** ORDER -0.02005 0.02143 -0.936 0.352 --- Signif. 
codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 Residual standard error: 6.185 on 98 degrees of freedom Multiple R-squared: 0.008859, Adjusted R-squared: -0.001254 F-statistic: 0.876 on 1 and 98 DF, p-value: 0.3516 > plot(ee3.lm) > abline(ee3.lm) > Cladd(ee3.lm, data=ee3) > newcolor <- relevel(hwc$COLOR, "brown") > summary(lm(cbind(WEIGHT, HEIGHT) ~ newcolor, data=hwc)) Response WEIGHT : Call: lm(formula = WEIGHT ~ newcolor, data = hwc) Residuals: Min 1Q Median 3Q Max -9.2333 -2.0000 -0.2333 1.9417 7.0000 Coefficients: Estimate Std. Error t value Pr(>|t|) (Intercept) 75.0000 0.5469 137.129 < 2e-16 *** newcolorblack 4.2333 0.7735 5.473 4.2e-07 *** newcolorblond -0.7667 0.7735 -0.991 0.324 --- Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 Residual standard error: 2.996 on 87 degrees of freedom Multiple R-squared: 0.3579, Adjusted R-squared: 0.3431 F-statistic: 24.24 on 2 and 87 DF, p-value: 4.286e-09 Response HEIGHT : Call: lm(formula = HEIGHT ~ newcolor, data = hwc) Residuals: Min 1Q Median 3Q Max -6.300 -2.233 -0.050 1.750 9.133 Coefficients: Estimate Std. Error t value Pr(>|t|) (Intercept) 164.2333 0.5506 298.302 < 2e-16 *** newcolorblack 5.6333 0.7786 7.235 1.72e-10 *** newcolorblond -7.9333 0.7786 -10.189 < 2e-16 *** --- Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 Residual standard error: 3.016 on 87 degrees of freedom Multiple R-squared: 0.7789, Adjusted R-squared: 0.7738 F-statistic: 153.3 on 2 and 87 DF, p-value: < 2.2e-16 > # \subsection{Many lines} > > pdf(file="pics/32650.pdf"); oldpar <- par(mar=c(4, 4, 0, 1)) > ipo <- read.table("data/ipomopsis.txt", h=TRUE) > with(ipo, plot(Root, Fruit, pch=as.numeric(Grazing))) Warning in plot.xy(xy, type, ...) 
: NAs introduced by coercion > abline(lm(Fruit ~ Root, data=subset(ipo, Grazing=="Grazed"))) > abline(lm(Fruit ~ Root, data=subset(ipo, Grazing=="Ungrazed")), + lty=2) > legend("topleft", lty=1:2, legend=c("Grazed","Ungrazed")) > par(oldpar); dev.off() pdf 2 > ipo.lm <- lm(Fruit ~ Root * Grazing, data=ipo) > summary(ipo.lm) Call: lm(formula = Fruit ~ Root * Grazing, data = ipo) Residuals: Min 1Q Median 3Q Max -17.3177 -2.8320 0.1247 3.8511 17.1313 Coefficients: Estimate Std. Error t value Pr(>|t|) (Intercept) -125.173 12.811 -9.771 1.15e-11 *** Root 23.240 1.531 15.182 < 2e-16 *** GrazingUngrazed 30.806 16.842 1.829 0.0757 . Root:GrazingUngrazed 0.756 2.354 0.321 0.7500 --- Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 Residual standard error: 6.831 on 36 degrees of freedom Multiple R-squared: 0.9293, Adjusted R-squared: 0.9234 F-statistic: 157.6 on 3 and 36 DF, p-value: < 2.2e-16 > ipo.lm2 <- update(ipo.lm, . ~ . - Root:Grazing) > summary(ipo.lm2) Call: lm(formula = Fruit ~ Root + Grazing, data = ipo) Residuals: Min 1Q Median 3Q Max -17.1920 -2.8224 0.3223 3.9144 17.3290 Coefficients: Estimate Std. Error t value Pr(>|t|) (Intercept) -127.829 9.664 -13.23 1.35e-15 *** Root 23.560 1.149 20.51 < 2e-16 *** GrazingUngrazed 36.103 3.357 10.75 6.11e-13 *** --- Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 Residual standard error: 6.747 on 37 degrees of freedom Multiple R-squared: 0.9291, Adjusted R-squared: 0.9252 F-statistic: 242.3 on 2 and 37 DF, p-value: < 2.2e-16 > AIC(ipo.lm) [1] 273.0135 > AIC(ipo.lm2) [1] 271.1279 > AIC(women.lm) [1] 59.08158 > AIC(women.lm3) [1] 16.73861 > elections <- read.table("data/elections.txt", h=TRUE) > str(elections) 'data.frame': 153 obs. of 7 variables: $ DISTRICT: int 1 2 3 4 5 6 7 8 9 10 ... $ VOTER : int 329786 144483 709903 696114 717095 787593 696087 691688 730050 164275 ... $ INVALID : int 2623 859 5656 4392 3837 4715 10456 1819 2403 89 ... 
$ VALID : int 198354 97863 664195 619629 653133 655486 397797 632045 667994 161470 ... $ CAND.1 : int 24565 7884 30491 54999 36880 72166 43559 53046 60355 234 ... $ CAND.2 : int 11786 6364 11152 11519 10002 25204 28189 5504 2680 451 ... $ CAND.3 : int 142627 68573 599105 525814 559653 485669 267776 563502 599798 159496 ... > attach(elections) > PROP <- cbind(CAND.1, CAND.2, CAND.3) / VOTER > ATTEN <- (VALID + INVALID) / VOTER > pdf(file="pics/27760.pdf"); oldpar <- par(mar=c(4, 4, 0, 1)) > lm.1 <- lm(CAND.1/VOTER ~ ATTEN) > lm.2 <- lm(CAND.2/VOTER ~ ATTEN) > lm.3 <- lm(CAND.3/VOTER ~ ATTEN) > plot(CAND.3/VOTER ~ ATTEN, xlim=c(0, 1), ylim=c(0, 1), + xlab="Attendance", ylab="Percent voted for the candidate") > points(CAND.1/VOTER ~ ATTEN, pch=2) > points(CAND.2/VOTER ~ ATTEN, pch=3) > abline(lm.3) > abline(lm.2, lty=2) > abline(lm.1, lty=3) > legend("topleft", lty=c(3, 2, 1), + legend=c("Party 1", "Party 2", "Party 3")) > detach(elections) > par(oldpar); dev.off() pdf 2 > elections2 <- cbind(ATTEN, stack(data.frame(PROP))) > names(elections2) <- c("atten", "percn", "cand") > str(elections2) 'data.frame': 459 obs. of 3 variables: $ atten: num 0.609 0.683 0.944 0.896 0.916 ... $ percn: num 0.0745 0.0546 0.043 0.079 0.0514 ... $ cand : Factor w/ 3 levels "CAND.1","CAND.2",..: 1 1 1 1 1 1 1 1 1 1 ... > ancova.v <- lm(percn ~ atten * cand, data=elections2) > summary(ancova.v) Call: lm(formula = percn ~ atten * cand, data = elections2) Residuals: Min 1Q Median 3Q Max -0.116483 -0.015470 -0.000699 0.014825 0.102810 Coefficients: Estimate Std. Error t value Pr(>|t|) (Intercept) 0.117115 0.011973 9.781 < 2e-16 *** atten -0.070726 0.018266 -3.872 0.000124 *** candCAND.2 -0.023627 0.016933 -1.395 0.163591 candCAND.3 -0.547179 0.016933 -32.315 < 2e-16 *** atten:candCAND.2 0.004129 0.025832 0.160 0.873074 atten:candCAND.3 1.393336 0.025832 53.938 < 2e-16 *** --- Signif. 
codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 Residual standard error: 0.02591 on 453 degrees of freedom Multiple R-squared: 0.9824, Adjusted R-squared: 0.9822 F-statistic: 5057 on 5 and 453 DF, p-value: < 2.2e-16 > # \subsection{More than one way, again} > > ToothGrowth.1 <- with(ToothGrowth, + data.frame(len, supp, fdose=factor(dose))) > str(ToothGrowth.1) 'data.frame': 60 obs. of 3 variables: $ len : num 4.2 11.5 7.3 5.8 6.4 10 11.2 11.2 5.2 7 ... $ supp : Factor w/ 2 levels "OJ","VC": 2 2 2 2 2 2 2 2 2 2 ... $ fdose: Factor w/ 3 levels "0.5","1","2": 1 1 1 1 1 1 1 1 1 1 ... > Normality(ToothGrowth$len) [1] "NORMAL" > with(ToothGrowth, fligner.test(split(len, list(dose, supp)))) Fligner-Killeen test of homogeneity of variances data: split(len, list(dose, supp)) Fligner-Killeen:med chi-squared = 7.7488, df = 5, p-value = 0.1706 > summary(aov(len ~ supp * fdose, data=ToothGrowth.1)) Df Sum Sq Mean Sq F value Pr(>F) supp 1 205.4 205.4 15.572 0.000231 *** fdose 2 2426.4 1213.2 92.000 < 2e-16 *** supp:fdose 2 108.3 54.2 4.107 0.021860 * Residuals 54 712.1 13.2 --- Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 > AIC(aov(len ~ supp * fdose, data=ToothGrowth.1)) [1] 332.7056 > summary(aov(len ~ supp + fdose, data=ToothGrowth.1)) Df Sum Sq Mean Sq F value Pr(>F) supp 1 205.4 205.4 14.02 0.000429 *** fdose 2 2426.4 1213.2 82.81 < 2e-16 *** Residuals 56 820.4 14.7 --- Signif.
codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 > AIC(aov(len ~ supp + fdose, data=ToothGrowth.1)) [1] 337.2013 > Eta2(aov(len ~ supp * fdose, data=ToothGrowth.1)) [1] 0.7937246 > TukeyHSD(aov(len ~ supp * fdose, data=ToothGrowth.1)) Tukey multiple comparisons of means 95% family-wise confidence level Fit: aov(formula = len ~ supp * fdose, data = ToothGrowth.1) $supp diff lwr upr p adj VC-OJ -3.7 -5.579828 -1.820172 0.0002312 $fdose diff lwr upr p adj 1-0.5 9.130 6.362488 11.897512 0.0e+00 2-0.5 15.495 12.727488 18.262512 0.0e+00 2-1 6.365 3.597488 9.132512 2.7e-06 $`supp:fdose` diff lwr upr p adj VC:0.5-OJ:0.5 -5.25 -10.048124 -0.4518762 0.0242521 OJ:1-OJ:0.5 9.47 4.671876 14.2681238 0.0000046 VC:1-OJ:0.5 3.54 -1.258124 8.3381238 0.2640208 OJ:2-OJ:0.5 12.83 8.031876 17.6281238 0.0000000 VC:2-OJ:0.5 12.91 8.111876 17.7081238 0.0000000 OJ:1-VC:0.5 14.72 9.921876 19.5181238 0.0000000 VC:1-VC:0.5 8.79 3.991876 13.5881238 0.0000210 OJ:2-VC:0.5 18.08 13.281876 22.8781238 0.0000000 VC:2-VC:0.5 18.16 13.361876 22.9581238 0.0000000 VC:1-OJ:1 -5.93 -10.728124 -1.1318762 0.0073930 OJ:2-OJ:1 3.36 -1.438124 8.1581238 0.3187361 VC:2-OJ:1 3.44 -1.358124 8.2381238 0.2936430 OJ:2-VC:1 9.29 4.491876 14.0881238 0.0000069 VC:2-VC:1 9.37 4.571876 14.1681238 0.0000058 VC:2-OJ:2 0.08 -4.718124 4.8781238 1.0000000 > pdf(file="pics/64650.pdf"); oldpar <- par(mar=c(4, 7, 3, 1)) > plot(TukeyHSD(aov(len ~ supp * fdose, data=ToothGrowth.1)), las=1) > par(oldpar); dev.off() pdf 2 > # \section{Probability of the success: logistic regression} > > l <- read.table("data/logit.txt") > l$V2 <- as.factor(l$V2) > head(l) V1 V2 1 14 F 2 29 F 3 6 F 4 25 S 5 18 S 6 4 F > pdf(file="pics/40380.pdf"); oldpar <- par(mar=c(4, 4, 1, 2)) > cdplot(V2 ~ V1, data=l, + xlab="Experience, months", ylab="Success") > par(oldpar); dev.off() pdf 2 > l.logit <- glm(V2 ~ V1, family=binomial, data=l) > summary(l.logit) Call: glm(formula = V2 ~ V1, family = binomial, data = l) Deviance Residuals: Min 1Q Median 3Q Max 
-1.9987 -0.4584 -0.2245 0.4837 1.5005 Coefficients: Estimate Std. Error z value Pr(>|z|) (Intercept) -4.9638 2.4597 -2.018 0.0436 * V1 0.2350 0.1163 2.021 0.0432 * --- Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 (Dispersion parameter for binomial family taken to be 1) Null deviance: 18.249 on 13 degrees of freedom Residual deviance: 10.301 on 12 degrees of freedom AIC: 14.301 Number of Fisher Scoring iterations: 5 > tox.logit <- glm(formula=I(2-ILL) ~ CAESAR + TOMATO, + family=binomial, data=tox) > tox.logit2 <- update(tox.logit, . ~ . - TOMATO) > tox.logit3 <- update(tox.logit, . ~ . - CAESAR) > tox.logit$aic [1] 47.40782 > tox.logit2$aic [1] 45.94004 > tox.logit3$aic [1] 53.11957 > # \section{Answers to exercises} > > # \subsection{Correlation and linear models} > > traits <- read.table("data/traits.txt", h=TRUE, row.names=1) > Str(traits) 'data.frame': 21 obs. of 9 variables: 1 TONGUE : int 0 0 0 0 0 0 1 0 1 1 ... 2 EARLOBE: int 0 0 0 1 0 0 0 1 1 1 ... 3 PINKY : int 0 1 1 1 1 0 1 0 1 0 ... 4 ARM : int 0 1 0 1 1 0 0 1 1 0 ... 5 CHEEK : int 1 0 0 0 1 0 0 1 0 0 ... 6 CHIN : int 1 1 1 1 1 1 1 1 0 0 ... 7 THUMB : int 0 0 0 1 0 0 1 1 0 1 ... 8 TOE : int 0 1 0 0 1 1 1 0 0 0 ... 9 PEAK : int 0 1 1 1 0 0 1 1 0 0 ... row.names [1:21] "A" "B" "C" "D" "E" ... > Cor(traits, method="kendall", dec=1) # shipunov Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) 
: Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) 
: Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties Warning in cor.test.default(X[, i], X[, j], ...) : Cannot compute exact p-value with ties TONGUE EARLOBE PINKY ARM CHEEK CHIN THUMB TOE PEAK TONGUE - 0.1 -0.2 -0.1 -0.1 -0.6* 0.5* 0 -0.2 EARLOBE 0.1 - -0.2 0.1 -0.1 -0.4 0.3 -0.3 0 PINKY -0.2 -0.2 - 0.2 0.1 0 -0.1 0 0.2 ARM -0.1 0.1 0.2 - 0.2 0 -0.1 0.1 0 CHEEK -0.1 -0.1 0.1 0.2 - -0.1 0 -0.1 -0.1 CHIN -0.6* -0.4 0 0 -0.1 - -0.5* 0 0.4 THUMB 0.5* 0.3 -0.1 -0.1 0 -0.5* - 0.1 0.1 TOE 0 -0.3 0 0.1 -0.1 0 0.1 - 0 PEAK -0.2 0 0.2 0 -0.1 0.4 0.1 0 - > traits.c <- cor(traits, method="kendall") > Topm(traits.c) # shipunov Var1 Var2 Value Magnitude 1 CHIN TONGUE -0.6264145 high 2 THUMB TONGUE 0.5393599 high 3 THUMB CHIN -0.4853627 medium > pdf(file="pics/cover_to_crop.pdf") > Pleiad(traits.c, corr=TRUE, lcol=1, legend=FALSE, off=1.12, + pch=21, bg="white", cex=1.1) # shipunov > dev.off() pdf 2 > pdf(file="pics/26450.pdf"); oldpar <- par(mar=c(4, 4, 0, 1)) > cor.test(hwc$WEIGHT, hwc$HEIGHT) Pearson's product-moment correlation data: hwc$WEIGHT and hwc$HEIGHT t = 5.0682, df = 88, p-value = 2.199e-06 alternative hypothesis: true correlation is not equal to 0 95 percent confidence interval: 0.2975308 0.6212688 sample estimates: cor 0.4753337 > w.h <- lm(WEIGHT ~ HEIGHT, data=hwc) > summary(w.h) Call: lm(formula = WEIGHT ~ HEIGHT, data = hwc) Residuals: Min 1Q Median 3Q Max -7.966 -2.430 0.305 2.344 5.480 Coefficients: Estimate Std. 
Error t value Pr(>|t|) (Intercept) 30.86387 8.94310 3.451 0.00086 *** HEIGHT 0.27707 0.05467 5.068 2.2e-06 *** --- Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 Residual standard error: 3.27 on 88 degrees of freedom Multiple R-squared: 0.2259, Adjusted R-squared: 0.2171 F-statistic: 25.69 on 1 and 88 DF, p-value: 2.199e-06 > plot(WEIGHT ~ HEIGHT, data=hwc, xlab="Height, cm", + ylab="Weight, kg") > abline(w.h) > Cladd(w.h, data=hwc) # shipunov > par(oldpar); dev.off() pdf 2 > ee <- read.table( + "http://ashipunov.me/shipunov/open/erophila.txt", h=TRUE) > Str(ee) # shipunov 'data.frame': 111 obs. of 5 variables: 1 LOC : int 1 1 1 1 1 1 1 1 1 1 ... 2 FRUIT.L : num 4.8 5.1 4.9 4.7 4.7 5 4.7 4.8 5.5 4.5 ... 3 FRUIT.W : num 1.8 2 2.3 2.1 2.4 2.8 2.8 2.8 2.8 1.8 ... 4 FRUIT.MAXW: num 3.5 3 3 2.5 2.8 3.3 1.5 2 3 2.5 ... 5 FRUIT.A : int 2 2 2 2 2 1 0 0 2 2 ... > sapply(ee[, 2:4], Normality) # shipunov FRUIT.L FRUIT.W FRUIT.MAXW "NORMAL" "NORMAL" "NOT NORMAL" > Topm(cor(ee[, 2:4], method="spearman")) # shipunov Var1 Var2 Value Magnitude 1 FRUIT.MAXW FRUIT.L 0.7109781 very high 2 FRUIT.W FRUIT.L 0.4642429 medium > with(ee, cor.test(FRUIT.L, FRUIT.MAXW, method="spearman")) Warning in cor.test.default(FRUIT.L, FRUIT.MAXW, method = "spearman") : Cannot compute exact p-value with ties Spearman's rank correlation rho data: FRUIT.L and FRUIT.MAXW S = 65874, p-value < 2.2e-16 alternative hypothesis: true rho is not equal to 0 sample estimates: rho 0.7109781 > pdf(file="pics/52690.pdf"); oldpar <- par(mar=c(4, 4, 0, 1)) > ee.lm <- lm(FRUIT.MAXW ~ FRUIT.L, data=ee) > plot(FRUIT.MAXW ~ FRUIT.L, data=ee, type="n") > Points(ee$FRUIT.L, ee$FRUIT.MAXW, scale=.5) # shipunov > Cladd(ee.lm, ee, ab.lty=1) # shipunov > par(oldpar); dev.off() pdf 2 > summary(ee.lm) Call: lm(formula = FRUIT.MAXW ~ FRUIT.L, data = ee) Residuals: Min 1Q Median 3Q Max -1.17383 -0.23371 -0.03309 0.27678 1.08666 Coefficients: Estimate Std. 
Error t value Pr(>|t|) (Intercept) -0.14037 0.30475 -0.461 0.646 FRUIT.L 0.59877 0.06091 9.830 <2e-16 *** --- Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 Residual standard error: 0.4196 on 109 degrees of freedom Multiple R-squared: 0.4699, Adjusted R-squared: 0.4651 F-statistic: 96.64 on 1 and 109 DF, p-value: < 2.2e-16 > plot(ee.lm, which=1) > Normality(ee.lm$residuals) [1] "NORMAL" > he <- read.table( + "http://ashipunov.me/shipunov/open/heterostyly.txt", h=TRUE) > he$SPECIES <- as.factor(he$SPECIES) > str(he) 'data.frame': 993 obs. of 6 variables: $ LOC : int 1 1 1 1 1 1 1 1 1 1 ... $ STYLE.L : num 7.5 14 8 6 10.5 16 7 7 8 13 ... $ STAMEN.L: num 9 10 17 13 10.5 10 17 12 16 10.5 ... $ TUBE.L : num 15 20 19.5 19 16 17 19 21 20 17.5 ... $ COLOR : chr "yl" "ww" "ww" "yl" ... $ SPECIES : Factor w/ 2 levels "veris","vulgaris": 2 2 2 2 2 2 2 2 2 2 ... > boxplot((STYLE.L-STAMEN.L) ~ (STYLE.L-STAMEN.L)>0, + names=c("short","long"), data=he) > pdf(file="pics/48090.pdf"); oldpar <- par(mar=c(4, 4, 0, 1)) > plot(STYLE.L ~ STAMEN.L, data=he, type="n", + xlab="Length of stamens, mm", ylab="Length of style, mm") > PPoints(he$SPECIES, he$STAMEN.L, he$STYLE.L, scale=.9, cols=1) > abline(lm(STYLE.L ~ STAMEN.L, data=he[he$SPECIES=="veris", ])) > abline(lm(STYLE.L ~ STAMEN.L, data=he[he$SPECIES=="vulgaris", ]), + lty=2) > legend("topright", pch=1:2, lty=1:2, legend=c("Primula veris", + "P. vulgaris"), text.font=3) > par(oldpar); dev.off() pdf 2 > summary(he.lm1 <- lm(STYLE.L ~ STAMEN.L * SPECIES, data=he)) Call: lm(formula = STYLE.L ~ STAMEN.L * SPECIES, data = he) Residuals: Min 1Q Median 3Q Max -10.2707 -1.7731 -0.2508 1.7541 10.2368 Coefficients: Estimate Std. Error t value Pr(>|t|) (Intercept) 19.13735 1.17900 16.232 < 2e-16 *** STAMEN.L -0.74519 0.09137 -8.156 1.05e-15 *** SPECIESvulgaris -1.84688 1.23060 -1.501 0.1337 STAMEN.L:SPECIESvulgaris 0.24272 0.09509 2.552 0.0108 * --- Signif. 
codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 Residual standard error: 2.921 on 989 degrees of freedom Multiple R-squared: 0.3077, Adjusted R-squared: 0.3056 F-statistic: 146.5 on 3 and 989 DF, p-value: < 2.2e-16 > plot(he.lm1, which=1:2) > AIC(he.lm1) [1] 4953.172 > summary(he.lm2 <- update(he.lm1, . ~ . - STAMEN.L:SPECIES)) Call: lm(formula = STYLE.L ~ STAMEN.L + SPECIES, data = he) Residuals: Min 1Q Median 3Q Max -10.3613 -2.0086 -0.1926 1.8496 10.2020 Coefficients: Estimate Std. Error t value Pr(>|t|) (Intercept) 16.34607 0.44188 36.992 < 2e-16 *** STAMEN.L -0.52109 0.02538 -20.536 < 2e-16 *** SPECIESvulgaris 1.18400 0.32400 3.654 0.000271 *** --- Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 Residual standard error: 2.93 on 990 degrees of freedom Multiple R-squared: 0.3031, Adjusted R-squared: 0.3017 F-statistic: 215.3 on 2 and 990 DF, p-value: < 2.2e-16 > AIC(he.lm2) [1] 4957.692 > with(he, interaction.plot(cut(STAMEN.L, quantile(STAMEN.L)), + SPECIES, STYLE.L)) > Str(drosera) # shipunov 'data.frame': 1165 obs. of 11 variables: 1 POP : Factor w/ 19 levels "A","B","C","D",..: 1 1 1 1 1 1 1 1 1 1 ... 2 YOUNG.L * int 0 0 0 1 0 1 0 1 0 1 ... 3 MATURE.L* int 3 2 5 5 3 5 4 4 3 3 ... 4 OLD.L * int 3 2 5 2 1 2 1 1 3 2 ... 5 INSECTS * int 3 2 10 5 0 5 7 3 1 1 ... 6 INFL.L * int 0 0 128 0 0 0 0 0 0 0 ... 7 STALK.L * int 0 0 100 0 0 0 0 0 0 0 ... 8 N.FLOW * int 0 0 3 0 0 0 0 0 0 0 ... 9 LEAF.L * num 4 4 6 5 3 5 6 4 3 4 ... 10 LEAF.W * num 4 4 6 6 4 6 7 5 4 5 ... 11 PET.L * int 10 7 16 15 4 5 13 14 11 10 ... 
> sapply(drosera[, -1], Normality) YOUNG.L MATURE.L OLD.L INSECTS INFL.L STALK.L "NOT NORMAL" "NOT NORMAL" "NOT NORMAL" "NOT NORMAL" "NOT NORMAL" "NOT NORMAL" N.FLOW LEAF.L LEAF.W PET.L "NOT NORMAL" "NOT NORMAL" "NOT NORMAL" "NOT NORMAL" > pdf(file="pics/51270.pdf") > dr.cor <- cor(drosera[, -1], method="spearman", use="pairwise") > Topm(dr.cor) # shipunov Var1 Var2 Value Magnitude 1 STALK.L INFL.L 0.9901613 very high 2 N.FLOW STALK.L 0.9774198 very high 3 N.FLOW INFL.L 0.9593589 very high 4 LEAF.W LEAF.L 0.8251841 very high 5 PET.L LEAF.W 0.7129433 very high 6 PET.L LEAF.L 0.6972402 high 7 PET.L INFL.L 0.4795218 medium 8 PET.L STALK.L 0.4661593 medium 9 INSECTS MATURE.L 0.4644699 medium > Pleiad(dr.cor, corr=TRUE, legtext=2, legpos="bottom", + leghoriz=TRUE, pch=19, cex=1.2) # shipunov > dev.off() pdf 2 > summary(dr.lm <- lm(INFL.L ~ STALK.L, data=drosera)) Call: lm(formula = INFL.L ~ STALK.L, data = drosera) Residuals: Min 1Q Median 3Q Max -100.015 0.046 0.046 0.046 74.026 Coefficients: Estimate Std. Error t value Pr(>|t|) (Intercept) -0.046294 0.212065 -0.218 0.827 STALK.L 1.139452 0.004637 245.719 <2e-16 *** --- Signif. 
codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 Residual standard error: 6.285 on 1158 degrees of freedom (5 observations deleted due to missingness) Multiple R-squared: 0.9812, Adjusted R-squared: 0.9812 F-statistic: 6.038e+04 on 1 and 1158 DF, p-value: < 2.2e-16 > plot(dr.lm, which=1:2) > (largest3 <- rev(sort(table(drosera[, 1])))[1:3]) Q1 L N1 211 201 144 > dr3 <- drosera[drosera$POP %in% names(largest3), ] > dr3$POP <- droplevels(dr3$POP) > boxplot(LEAF.L ~ POP, data=dr3) > tapply(dr3$LEAF.L, dr3$POP, mad, na.rm=TRUE) L N1 Q1 1.4826 1.4826 1.4826 > fligner.test(LEAF.L ~ POP, data=dr3) Fligner-Killeen test of homogeneity of variances data: LEAF.L by POP Fligner-Killeen:med chi-squared = 8.1408, df = 2, p-value = 0.01707 > kruskal.test(LEAF.L ~ POP, data=dr3) Kruskal-Wallis rank sum test data: LEAF.L by POP Kruskal-Wallis chi-squared = 97.356, df = 2, p-value < 2.2e-16 > pairwise.wilcox.test(dr3$LEAF.L, dr3$POP) Pairwise comparisons using Wilcoxon rank sum test with continuity correction data: dr3$LEAF.L and dr3$POP L N1 N1 5.2e-16 - Q1 0.74 < 2e-16 P value adjustment method: holm > # \subsection{Logistic regression} > > seeing <- read.table("data/seeing.txt") > attach(seeing) > seeing.logit <- glm(V3 ~ V2, family=binomial, data=seeing) > summary(seeing.logit) Call: glm(formula = V3 ~ V2, family = binomial, data = seeing) Deviance Residuals: Min 1Q Median 3Q Max -2.4029 -0.8701 0.4299 0.7825 1.5197 Coefficients: Estimate Std. Error z value Pr(>|z|) (Intercept) -1.6776 0.7923 -2.117 0.03423 * V2 0.9015 0.2922 3.085 0.00203 ** --- Signif. 
codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 (Dispersion parameter for binomial family taken to be 1) Null deviance: 62.687 on 49 degrees of freedom Residual deviance: 49.738 on 48 degrees of freedom AIC: 53.738 Number of Fisher Scoring iterations: 4 > pdf(file="pics/32450.pdf"); oldpar <- par(mar=c(2, 2, 0, 1)) > tries <- seq(1, 5, length=50) # exactly 50 numbers from 1 to 5 > seeing.p <- predict(seeing.logit, list(V2=tries), + type="response") > plot(V3 ~ jitter(V2, amount=.1), data=seeing, xlab="", ylab="") > lines(tries, seeing.p) > par(oldpar); dev.off() pdf 2 > jj <- read.table( + "http://ashipunov.me/shipunov/open/juniperus.txt", h=TRUE) > jj$LOC <- factor(jj$LOC, labels=paste0("loc", levels(jj$LOC))) > Str(jj) # shipunov 'data.frame': 61 obs. of 7 variables: 1 LOC : Factor w/ 3 levels "loc1","loc2",..: 1 1 1 1 1 1 1 1 1 1 ... 2 HEIGHT : num 90 55 20 80 80 65 25 40 55 40 ... 3 WIDTH : num 40 25 45 100 135 35 55 25 45 55 ... 4 NEEDLE.L: int 8 8 5 6 10 6 6 9 5 5 ... 5 PROTR : num 1 1 1.5 1 0 1 0 0.5 1 1 ... 6 STEM.D * num 2.4 2.3 3.5 6 2.6 4.5 3.2 0.5 NA 1.7 ... 7 PINE.N : int 1 0 2 2 0 2 3 2 0 3 ... 
> pdf(file="pics/57390.pdf"); oldpar <- par(mar=c(4, 0, 0, 1)) > j.cols <- colorRampPalette(c("steelblue", "white"))(5)[2:4] > Boxplots(jj[, 2:7], jj$LOC, legpos="top", + boxcols=j.cols) # shipunov > par(oldpar); dev.off() pdf 2 > pdf(file="pics/57560.pdf"); oldpar <- par(mar=c(4, 4, 1, 2)) > spineplot(LOC ~ NEEDLE.L, data=jj, col=j.cols) > par(oldpar); dev.off() pdf 2 > Normality(jj$NEEDLE.L) [1] "NORMAL" > tapply(jj$NEEDLE.L, jj$LOC, var) loc1 loc2 loc3 2.461905 3.607895 2.407895 > bartlett.test(NEEDLE.L ~ LOC, data=jj) Bartlett test of homogeneity of variances data: NEEDLE.L by LOC Bartlett's K-squared = 1.0055, df = 2, p-value = 0.6049 > oneway.test(NEEDLE.L ~ LOC, data=jj) One-way analysis of means (not assuming equal variances) data: NEEDLE.L and LOC F = 14.129, num df = 2.000, denom df = 38.232, p-value = 2.546e-05 > (eta.squared <- + summary(lm(NEEDLE.L ~ LOC, data=jj))$adj.r.squared) [1] 0.3337755 > pairwise.t.test(jj$NEEDLE.L, jj$LOC) Pairwise comparisons using t tests with pooled SD data: jj$NEEDLE.L and jj$LOC loc1 loc2 loc2 0.00031 - loc3 0.14564 3.1e-06 P value adjustment method: holm > is.sibirica <- with(jj, (NEEDLE.L < 8 & HEIGHT < 100)) > sibirica <- factor(is.sibirica, labels=c("communis", "sibirica")) > summary(sibirica) communis sibirica 24 37 > pdf(file="pics/58290.pdf"); oldpar <- par(mar=c(4, 4, 1, 2)) > cdplot(sibirica ~ PINE.N, data=jj, col=j.cols[c(1, 3)]) > summary(glm(sibirica ~ PINE.N, data=jj, family=binomial)) Call: glm(formula = sibirica ~ PINE.N, family = binomial, data = jj) Deviance Residuals: Min 1Q Median 3Q Max -1.8549 -1.0482 0.7397 1.0042 1.3123 Coefficients: Estimate Std. Error z value Pr(>|z|) (Intercept) -0.3117 0.4352 -0.716 0.4738 PINE.N 0.3670 0.1776 2.066 0.0388 * --- Signif. 
codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 (Dispersion parameter for binomial family taken to be 1) Null deviance: 81.772 on 60 degrees of freedom Residual deviance: 77.008 on 59 degrees of freedom AIC: 81.008 Number of Fisher Scoring iterations: 4 > par(oldpar); dev.off() pdf 2 > # \section{How to choose the right method} > > # \chapter{Draw} > > # \section{Pictographs} > > pdf(file="pics/27400.pdf"); oldpar <- par(mar=c(0, 0, 0, 0)) > eq8 <- read.table("data/eq8.txt", h=TRUE) > Str(eq8) # shipunov 'data.frame': 832 obs. of 9 variables: 1 SPECIES * chr "arvense" "arvense" "arvense" "arvense" ... 2 DL.R : num 424 339 321 509 462 350 405 615 507 330 ... 3 DIA.ST * num 2.3 2 2.5 3 2.5 1.8 2 2.2 2 1.8 ... 4 N.REB : int 13 11 15 14 12 9 14 11 10 8 ... 5 N.ZUB : int 12 12 14 14 13 9 15 11 10 8 ... 6 DL.OSN.Z* num 2 1 2 1.5 1.1 1.1 1 1 1 1 ... 7 DL.TR.V * num 5 4 5 5 4 4 4 4 5 5 ... 8 DL.BAZ : num 3 2.5 2.3 2.2 2.1 2 2 2 1.9 1 ... 9 DL.PER : num 25 13 13 23 12 15 13 14 10 9 ... > eq8m <- Aggregate1(eq8[, 2:9], eq8[, 1], + median, na.rm=TRUE) # shipunov > stars(eq8m, cex=1.2, lwd=1.2, col.stars=rep("darkseagreen", 8)) > par(oldpar); dev.off() pdf 2 > pdf(file="pics/27580_to_crop.pdf") > library(TeachingDemos) > faces(eq8m) > dev.off() pdf 2 > pdf(file="pics/06645.pdf"); oldpar <- par(mar=c(0, 4, 0, 0)) > image(scale(iris[, -5]), axes=FALSE) > axis(2, at=seq(0, 1, length.out=4), + labels=abbreviate(colnames(iris[, -5])), las=2) > par(oldpar); dev.off() pdf 2 > pdf(file="pics/34390.pdf"); oldpar <- par(mar=c(2, 2, 0, 1)) > library(MASS) > parcoord(iris[, -5], col=as.numeric(iris[, 5]), lwd=2) > legend("top", bty="n", lty=1, lwd=2, col=1:3, + legend=names(table(iris[, 5]))) > par(oldpar); dev.off() pdf 2 > # \section{Grouped plots} > > Boxplots(iris[, 1:4], iris[, 5], srt=0, adj=c(.5, 1), + legpos="topright") # shipunov > Linechart(iris[, 1:4], iris[, 5], mad=TRUE) # shipunov > matplot(iris[, 1:2], iris[, 3:4], pch=1, + xlab="Sepals", ylab="Petals") > 
legend("topleft", pch=1, col=1:2, legend=c("Length", "Width")) > pdf(file="pics/27180.pdf") > pairs(iris[, 1:4], pch=19, col=as.numeric(iris[, 5]), + oma=c(2, 2, 4, 2)) > oldpar <- par(xpd=TRUE) > legend(0, 1.06, horiz=TRUE, legend=levels(iris[, 5]), + pch=19, col=1:3, bty="n") > par(oldpar) > dev.off() pdf 2 > pdf(file="pics/27000_to_crop.pdf") > betula <- read.table( + "http://ashipunov.me/shipunov/open/betula.txt", h=TRUE) > library(lattice) > d.tmp <- do.call(make.groups, betula[, c(2:4, 7:8)]) > d.tmp$LOC <- betula$LOC > bwplot(data ~ factor(LOC) | which, data=d.tmp, ylab="") > dev.off() pdf 2 > pdf(file="pics/61060_to_crop.pdf"); oldpar <- par(mar=c(0, 0, 0, 0)) > eq.s <- stack(as.data.frame(scale(eq8m))) > eq.s$SPECIES <- row.names(eq8m) > dotplot(SPECIES ~ values | ind, eq.s, xlab="") > par(oldpar); dev.off() pdf 2 > pdf(file="pics/35270.pdf"); oldpar <- par(mar=c(0, 0, 0, 0)) > coplot(percn ~ atten | cand, data=elections2, col="red", + bg="pink", pch=21, bar.bg=c(fac="lightblue")) > par(oldpar); dev.off() pdf 2 > # \section{3D plots} > > pdf(file="pics/55170_to_crop.pdf") > library(vcd) > ternaryplot(scale(iris[, 2:4], center=FALSE), grid_color="black", + cex=0.3, col=iris[, 5], main="") > grid_legend(0.8, 0.7, pch=19, size=.5, col=1:3, levels(iris[, 5])) > dev.off() pdf 2 > pdf(file="pics/54020_to_crop.pdf") > library(scatterplot3d) > i3d <- scatterplot3d(iris[, 2:4], color=as.numeric(iris[, 5]), + type="h", pch=14 + as.numeric(iris[, 5]), xlab="Sepal width", + ylab="", zlab="Petal width") > dims <- par("usr") > x <- dims[1]+ 0.82*diff(dims[1:2]) > y <- dims[3]+ 0.1*diff(dims[3:4]) > text(x, y, "Petal length", srt=40) > legend(i3d$xyz.convert(3.8, 6.5, 1.5), col=1:3, pch=(14 + 1:3), + legend=levels(iris[, 5]), bg="white") > dev.off() pdf 2 > pdf(file="pics/54440_to_crop.pdf") > library(lattice) > p <- cloud(Sepal.Length ~ Petal.Length * Petal.Width, data=iris, + groups=Species, par.settings=list(clip=list(panel="off")), + auto.key=list(space="top", 
columns=3, points=TRUE)) > update(p[rep(1, 4)], layout=c(2, 2), function(..., screen) + panel.cloud(..., screen=list(z=c(-70, 110)[current.column()], + x=-70, y=c(140, 0)[current.row()]))) > dev.off() pdf 2 > # \chapter{Discover} > > # \section{Discovery with primary data} > > # \subsection{Shadows of hyper clouds: PCA} > > pdf(file="pics/90530.pdf"); oldpar <- par(mar=c(2, 2, 0, 1)) > library(png) # package to read PNG images as data > aa <- readPNG("data/pear2d_bw.png") > bb <- which(aa == 0, arr.ind=TRUE) # pixels to coordinates > ## plot together original (green) and PCA-rotated (gray): > > bbs <- scale(bb) > pps <- scale(prcomp(bb)$x[, 1:2]) # only two PCs > xx <- range(c(bbs[, 1], pps[, 1])) > yy <- range(c(bbs[, 2], pps[, 2])) > plot(pps, pch=".", col=adjustcolor("black", alpha=0.5), + xlim=xx, ylim=yy) > points(bbs, pch=".", col=adjustcolor("green", alpha=0.5)) > legend("bottomright", fill=adjustcolor(c("green", "black"), + alpha=0.5), legend=c("Original", "PCA-rotated"), + bty="n", border=0) > par(oldpar); dev.off() pdf 2 > ca <- read.table( + "http://ashipunov.me/shipunov/open/carex.txt", h=TRUE) > Str(ca) 'data.frame': 62 obs. of 5 variables: 1 LOC : int 1 1 1 1 1 1 1 1 1 1 ... 2 HEIGHT : int 157 103 64 77 21 27 19 35 43 92 ... 3 LEAF.W : num 2.5 2.5 2 2 1.5 1.5 2 1.5 2 2 ... 4 SPIKE.L: num 9.5 9 7.5 7 4 5 3.5 6 6 6.5 ... 5 SPIKE.W: num 6.5 6.5 6 5 4 4 3.5 5 5 5.5 ... 
> ca.pca <- princomp(scale(ca[, -1])) > pdf(file="pics/27890.pdf"); oldpar <- par(mar=c(2, 2, 0, 1)) > plot(ca.pca, main="") > par(oldpar); dev.off() pdf 2 > summary(ca.pca) Importance of components: Comp.1 Comp.2 Comp.3 Comp.4 Standard deviation 1.6448264 0.7759300 0.59318563 0.52544578 Proportion of Variance 0.6874514 0.1529843 0.08940939 0.07015485 Cumulative Proportion 0.6874514 0.8404358 0.92984515 1.00000000 > pdf(file="pics/28180.pdf"); oldpar <- par(mar=c(4, 4, 0, 1)) > ca.p <- ca.pca$scores[, 1:2] > plot(ca.p, type="n", xlab="PC1", ylab="PC2") > text(ca.p, labels=ca[, 1], col=ca[, 1]) > Hulls(ca.p, ca[, 1]) # shipunov > par(oldpar); dev.off() pdf 2 > ca.h <- Hulls(ca.p, ca[, 1], plot=FALSE) # shipunov > ca.o <- Overlap(ca.h) # shipunov > summary(ca.o) Overlaps for each hull, %: mean.overlap total.overlap 1 30.43 60.86 2 24.89 49.79 3 38.08 76.17 4 0.00 0.00 Mean of all overlaps 31.14 % > oldpar <- par() > pdf(file="pics/28360.pdf"); oldpar <- par(mar=c(4, 4, 2, 2)) > biplot(ca.pca, xlabs=ca[, 1]) > par(oldpar); dev.off() pdf 2 > par(oldpar) > loadings(ca.pca) Loadings: Comp.1 Comp.2 Comp.3 Comp.4 HEIGHT 0.518 0.845 0.125 LEAF.W 0.468 0.721 -0.263 -0.437 SPIKE.L 0.534 -0.432 0.724 SPIKE.W 0.476 -0.688 -0.178 -0.518 Comp.1 Comp.2 Comp.3 Comp.4 SS loadings 1.00 1.00 1.00 1.00 Proportion Var 0.25 0.25 0.25 0.25 Cumulative Var 0.25 0.50 0.75 1.00 > iris.pca <- prcomp(iris[, -5]) > iris.pca$rotation PC1 PC2 PC3 PC4 Sepal.Length 0.36138659 -0.65658877 0.58202985 0.3154872 Sepal.Width -0.08452251 -0.73016143 -0.59791083 -0.3197231 Petal.Length 0.85667061 0.17337266 -0.07623608 -0.4798390 Petal.Width 0.35828920 0.07548102 -0.54583143 0.7536574 > iris.p <- iris.pca$x[, 1:2] > plot(iris.p, type="n", xlab="PC1", ylab="PC2") > text(iris.p, labels=abbreviate(iris[, 5], 1, method="both.sides"), + col=as.numeric(iris[, 5])) > Ellipses(iris.p[, 1:2], as.numeric(iris[, 5])) # shipunov > pdf(file="pics/28750.pdf") > library(ade4) > iris.dudi <- dudi.pca(iris[, 1:4], 
scannf=FALSE) > s.class(iris.dudi$li, iris[, 5]) > dev.off() pdf 2 > iris.between <- bca(iris.dudi, iris[, 5], scannf=FALSE) > randtest(iris.between) Monte-Carlo test Call: randtest.between(xtest = iris.between) Observation: 0.7224358 Based on 999 replicates Simulated p-value: 0.001 Alternative hypothesis: greater Std.Obs Expectation Variance 6.912789e+01 1.377969e-02 1.050907e-04 > detach("package:ade4", unload=TRUE) # cca() clashes with vegan > library(vegan) > anosim(iris.p, iris$Species, permutations=99, distance="euclidean") Call: anosim(x = iris.p, grouping = iris$Species, permutations = 99, distance = "euclidean") Dissimilarity: euclidean ANOSIM statistic R: 0.8732 Significance: 0.01 Permutation: free Number of permutations: 99 > # \subsection{Correspondence} > > pdf(file="pics/37300.pdf"); oldpar <- par(mar=c(2, 2, 2, 2)) > library(MASS) > HE <- margin.table(HairEyeColor, 1:2) > HE.df <- Table2df(HE) # shipunov > biplot(corresp(HE.df, nf = 2), xpd=TRUE) > legend("topleft", fill=1:2, legend=c("hair","eyes")) > par(oldpar); dev.off() pdf 2 > pdf(file="pics/44460.pdf"); oldpar <- par(mar=c(2, 2, 1, 1)) > library(vegan) > alla <- read.table("data/lakesea_abio.txt", sep="\t", h=TRUE) > allc <- read.table("data/lakesea_bio.txt", sep="\t", h=TRUE) > oldnames <- names(alla) # names are too long for the plot > names(alla) <- LETTERS[1:ncol(alla)] > all.cca <- cca(allc, alla[, -14]) > plot(all.cca, display=c("sp", "cn")) > points(all.cca, display="sites", pch=ifelse(alla[, 14], 15, 0)) > legend("topleft", pch=c(15, 0, 3), col=c(1, 1, 2), + legend=c("lake sites", "sea sites", "plant species")) > legend("bottomright", pch=names(alla)[-14], col="blue", + legend=oldnames[-14]) > text(-1.6, -4.2, "Carex lasiocarpa", pos=4) > par(oldpar); dev.off() pdf 2 > # \subsection{Projections, unfolds, t-SNE and UMAP} > > if(0){ # to activate plot, comment out here and below + + pdf(file="pics/tapkee.pdf") + + library(tapkee) + + SR <- Gen.dr.data("swissroll") + + COL <- 
rainbow(1100)[1:1000] # separates colors better + + TM <- c("lle", "npe", "ltsa", "hlle", "la", "isomap", "spe", "pca") + + names(TM) <- c("Locally Linear Embedding", + "Neighborhood Preserving1 Embd.", + "Local Tangent Space Alignment", + "Hessian Locally Linear Embd.", + "Laplacian Eigenmaps", "Isomap", + "Stochastic Proximity Embd.", "PCA") + + oldpar <- par(mfrow=c(3, 3), xaxt="n", yaxt="n") + + scatterplot3d(SR, color=COL, pch=20, main="Swiss Roll", + cex.symbols=1.2, xlab="", ylab="", zlab="", + axis=FALSE, tick.marks=FALSE, + label.tick.marks=FALSE, mar=c(1, 1, 2, 1)) + + for (n in 1:length(TM)) plot(Tapkee(SR, method=TM[n]), + col=COL, pch=20, main=names(TM)[n]) + + par(oldpar) + + par(oldpar) + + } > if(0){ # to activate plot, comment out here and below + + pdf(file="pics/tapkee_umap.pdf") + + library(uwot) + + library(tapkee) + + ## library(Rtsne) + + SR <- Gen.dr.data("swissroll") + + COL <- rainbow(1100)[1:1000] # separates colors better + + oldpar <- par(mfrow=c(1, 2), mar=c(1, 1, 3, 1), xaxt="n", yaxt="n") + + ## plot(Rtsne(SR)$Y, col=COL, pch=20, main="t-SNE (Rtsne)") + + plot(Tapkee(SR, method="t-sne"), + col=COL, pch=20, main="t-SNE (tapkee)") + + plot(umap(SR, n_neighbors=15, approx_pow=TRUE, pca=50), + col=COL, pch=20, main="UMAP (uwot)") + + par(oldpar) + + dev.off() + + } > # \subsection{Non-negative matrix factorization} > > library(NMFN) > set.seed(1) > iris.nnmf <- nnmf(iris[, -5], k=2) # we want two factors Multiplicative Update Algorithm Iter = 50 relative error = 0.06931816 diff = 1311.549 eucl dist = 62.21723 Iter = 100 relative error = 0.03606683 diff = 114.538 eucl dist = 16.09118 Iter = 150 relative error = 0.03541534 diff = 4.631678 eucl dist = 15.7323 Iter = 200 relative error = 0.03527807 diff = 1.202054 eucl dist = 15.66371 Iter = 250 relative error = 0.03520953 diff = 0.6464263 eucl dist = 15.6331 Iter = 300 relative error = 0.03517994 diff = 0.4066489 eucl dist = 15.61535 Iter = 350 relative error = 0.03516875 diff = 0.2881102 
eucl dist = 15.60357 Iter = 400 relative error = 0.03516604 diff = 0.2140224 eucl dist = 15.59505 Iter = 450 relative error = 0.03516523 diff = 0.1651553 eucl dist = 15.58848 Iter = 500 relative error = 0.03516451 diff = 0.1248346 eucl dist = 15.58309 Iter = 550 relative error = 0.03516344 diff = 0.09692363 eucl dist = 15.57842 Iter = 600 relative error = 0.03516205 diff = 0.08006654 eucl dist = 15.57419 Iter = 650 relative error = 0.03516066 diff = 0.06978153 eucl dist = 15.57031 Iter = 700 relative error = 0.03515925 diff = 0.06437713 eucl dist = 15.56672 Iter = 750 relative error = 0.03515794 diff = 0.06170888 eucl dist = 15.56344 Iter = 800 relative error = 0.03515679 diff = 0.05935957 eucl dist = 15.56049 Iter = 850 relative error = 0.03515578 diff = 0.05688894 eucl dist = 15.55789 Iter = 900 relative error = 0.03515488 diff = 0.05400323 eucl dist = 15.55565 Iter = 950 relative error = 0.03515407 diff = 0.05063829 eucl dist = 15.55375 Iter = 1000 relative error = 0.03515329 diff = 0.04684556 eucl dist = 15.55217 > plot(iris.nnmf$W, col=iris$Species, xlab="F1", ylab="F2") > iris.nh <- Hulls(iris.nnmf$W, iris$Species) # shipunov > summary(Overlap(iris.nh)) # shipunov Overlaps for each hull, %: mean.overlap total.overlap setosa 0.00 0.00 versicolor 4.98 4.98 virginica 6.73 6.73 Mean of all overlaps 5.85 % > # \section{Discovery with distances} > > # \subsection{Distances} > > nd <- read.table("data/nd.txt", h=TRUE, sep="\t", row.names=1) > nd.d <- as.dist(nd) > str(nd.d) 'dist' int [1:28] 110 122 170 208 268 210 173 219 101 135 ... - attr(*, "Labels")= chr [1:8] "Minot" "Bismarck" "Williston" "Jamestown" ... - attr(*, "Size")= int 8 - attr(*, "call")= language as.dist.default(m = nd) - attr(*, "Diag")= logi FALSE - attr(*, "Upper")= logi FALSE > library(cluster) > iris.dist <- daisy(iris[, 1:4], metric="manhattan") > str(iris.dist) 'dissimilarity' num [1:11175] 0.7 0.8 1 0.2 1.2 ... 
- attr(*, "Size")= int 150 - attr(*, "Metric")= chr "manhattan" > iris.gower <- Gower.dist(iris) # shipunov > str(iris.gower) 'dist' num [1:11175] 0.0528 0.0506 0.0645 0.0139 0.0768 ... - attr(*, "Labels")= chr [1:150] "1" "2" "3" "4" ... - attr(*, "Size")= int 150 - attr(*, "call")= language as.dist.default(m = out) - attr(*, "Diag")= logi FALSE - attr(*, "Upper")= logi FALSE > library(MASS) > eqscplot(moldino_l$LON, moldino_l$LAT, + cex=round(log(moldino_l$SQUARE))-5.5, + axes=FALSE, xlab="", ylab="", xpd=TRUE) # shipunov > text(moldino_l$LON, moldino_l$LAT, labels=row.names(moldino_l), + pos=4, offset=1, cex=.9) > m1 <- t((moldino > 0) * 1) # convert to 0/1 and transpose > library(smirnov) > m1.Txy <- smirnov(m1) > m1.s <- (1 - m1.Txy) # convert similarity into dissimilarity > dimnames(m1.s) <- list(row.names(m1)) > symnum(m1.s) Bobrovyj + Ekslibris + + Gnutyj , + * Leda , + , * Malinovyj.Kruglyj + + + + + Slitnyj , , , , + * Tajnik , , , , + , * Verik + + + + , , + + Zakhar , + + , + + , + + attr(,"legend") [1] 0 ‘ ’ 0.3 ‘.’ 0.6 ‘,’ 0.8 ‘+’ 0.9 ‘*’ 0.95 ‘B’ 1 > m1.Txx <- diag(m1.Txy) > names(m1.Txx) <- row.names(m1) > rev(sort(round(m1.Txx, 3))) Verik Malinovyj.Kruglyj Ekslibris Bobrovyj 0.189 0.130 0.124 0.106 Zakhar Leda Slitnyj Gnutyj 0.105 0.092 0.088 0.087 Tajnik 0.078 > # \subsection{Making maps: multidimensional scaling} > > pdf(file="pics/56450_to_crop.pdf") > nd.d <- as.dist(nd) > nd.c <- cmdscale(nd.d) > new.names <- sub("y C", "y\nC", row.names(nd)) > library(MASS) > eqscplot(nd.c, type="n", axes=FALSE, xlab="", ylab="") > points(nd.c, pch=19) > text(nd.c, labels=new.names, xpd=TRUE, pos=3, cex=0.8) > dev.off() pdf 2 > pdf(file="pics/36340.pdf"); oldpar <- par(mar=c(4, 4, 0, 1)) > library(KernSmooth) KernSmooth 2.23 loaded Copyright M. P. Wand 1997-2009 > iris.c <- cmdscale(iris.dist) > est <- bkde2D(iris.c, bandwidth=c(.7, 1.5)) > plot(iris.c, type="n", xlab="Dim. 1", ylab="Dim. 
2") > text(iris.c, + labels=abbreviate(iris[, 5], 1, method="both.sides")) > contour(est$x1, est$x2, est$fhat, add=TRUE, + drawlabels=FALSE, lty=3) > par(oldpar); dev.off() pdf 2 > pdf(file="pics/94140_to_crop.pdf") > persp(est$x1, est$x2, est$fhat, theta=135, phi=45, + col="purple3", shade=0.75, border=NA, + xlab="Dim. 1", ylab="Dim. 2", zlab="Density") > dev.off() pdf 2 > iris.dist2 <- dist(iris[, 1:4], method="manhattan") > ## to remove zero distances: > > iris.dist2[iris.dist2 == 0] <- abs(jitter(0)) > library(MASS) > iris.m <- isoMDS(iris.dist2) initial value 5.444949 iter 5 value 4.117042 final value 4.075094 converged > cor(iris[, 1:4], iris.m$points) # variable importance ("loadings") [,1] [,2] Sepal.Length 0.9041779 -0.30455326 Sepal.Width -0.3996573 -0.87872539 Petal.Length 0.9956071 0.04209809 Petal.Width 0.9661085 -0.01138353 > (vv <- MDSv(iris.m$points)) # shipunov [1] 97.509738 2.490262 > xxlab <- paste0("Dim 1 (", round(vv[1], 2), "%)") > yylab <- paste0("Dim 2 (", round(vv[2], 2), "%)") > aabb <- abbreviate(iris$Species, 1, method="both.sides") > plot(iris.m$points, pch=aabb, xlab=xxlab, ylab=yylab) > Biarrows(iris.m$points, iris[, -5]) # shipunov > # \subsection{Making trees: hierarchical clustering} > > pdf(file="pics/29530.pdf"); oldpar <- par(mar=c(0, 2, 2, 1)) > aa <- t(atmospheres) # shipunov > aa.dist <- dist(aa) # planets are columns > aa.hclust <- hclust(aa.dist, method="ward.D") > plot(aa.hclust, xlab="", ylab="", sub="") > par(oldpar); dev.off() pdf 2 > cutree(aa.hclust, k=2) # we want 2 groups Mercury Venus Earth Mars Jupiter Saturn Uranus Neptune 1 1 1 1 2 2 2 2 > library(cluster) # advanced clustering package > iris.dist <- daisy(iris[, 1:4], metric="manhattan") # distances > iris.hclust <- hclust(iris.dist) > iris.3 <- cutree(iris.hclust, 3) # we want 3 groups > head(iris.3) # show cluster numbers [1] 1 1 1 1 1 1 > Misclass(iris.3, iris[, 5], best=TRUE) # shipunov Best classification table: obs pred setosa versicolor virginica 1 50 0 
0 2 0 50 16 3 0 0 34 Misclassification errors (%): setosa versicolor virginica 0 0 32 Mean misclassification error: 10.7% > Adj.Rand(iris.3, iris[, 5]) # shipunov [1] 0.7322981 > pdf(file="pics/95700.pdf") > library(cetcolor) > heatmap(aa, col=cet_pal(12, "coolwarm"), margins=c(9, 6)) > dev.off() pdf 2 > pdf(file="pics/57440.pdf"); oldpar <- par(mar=c(2, 0, 0, 0)) > Ploth(iris.hclust, col=as.numeric(iris[, 5]), + pch=16, col.edges=TRUE, horiz=TRUE, leaflab="none") # shipunov > legend("topleft", fill=1:nlevels(iris[, 5]), + legend=levels(iris[, 5])) > par(oldpar); dev.off() pdf 2 > oldpar <- par(mar=c(2, 0, 0, 4)) > tox.dist <- as.dist(1 - abs(tox.cor)) > Ploth(hclust(tox.dist, method="ward.D"), horiz=TRUE) # shipunov > par(oldpar) > # \subsection{How to know the best clustering method} > > PlotBest.hclust(as.dist(m1.s)) # shipunov > PlotBest.dist(m1) # shipunov > pdf(file="pics/141940.pdf") > set.seed(21) > palette(c("#377EB8", "#FF7F00", "#4DAF4A", "#F781BF", "#A65628", + "#984EA3", "#999999", "#E41A1C", "#DEDE00", "#000000")) > nsam <- 500 # how many points to use > K <- 3 # how many clusters to target > noisy.moons <- Gen.cl.data(type="moons", N=nsam, noise=0.05) > no.struct <- list(samples=cbind(runif(nsam), runif(nsam)), + labels=rep(1, nsam)) > varied <- Gen.cl.data(type="blobs", N=nsam, bnoise=c(1, 2.5, 0.5)) > oldpar <- par(mfrow=c(3, 3), mar=c(1, 1, 2, 1), xaxt="n", yaxt="n") > for (n in c("no.struct", "noisy.moons", "varied")) { + newlabels <- cutree(hclust(dist(get(n)$samples), + method="ward.D2"), k=K) + plot(get(n)$samples, col=newlabels, pch=19, + main=paste0("Ward: ", n)) + newlabels <- cutree(hclust(dist(get(n)$samples), + method="single"), k=K) + plot(get(n)$samples, col=newlabels, pch=19, + main=paste0("Single linkage: ", n)) + newlabels <- kmeans(get(n)$samples, centers=K)$cluster + plot(get(n)$samples, col=newlabels, pch=19, + main=paste0("k-means: ", n)) + } > par(oldpar) > palette("default") > set.seed(NULL) > dev.off() pdf 2 > # 
\subsection{How to compare clusterings} > > aa.d1 <- hclust(dist(aa)) > aa.d2 <- hclust(as.dist(1 - abs(cor(atmospheres, method="s"))), + method="ward.D") # shipunov > library(ape) > aa.ph1 <- unroot(as.phylo(aa.d1)) # convert > aa.ph2 <- unroot(as.phylo(aa.d2)) > dist.topo(aa.ph1, aa.ph2) tree1 tree2 2 > phangorn::treedist(aa.ph1, aa.ph2) symmetric.difference branch.score.difference path.difference 2.000000 109.201423 3.872983 quadratic.path.difference 598.444274 > pdf(file="pics/96960.pdf"); oldpar <- par(mar=c(0, 0, 0, 0)) > ass <- cbind(aa.ph1$tip.label, aa.ph1$tip.label) > aa.ph2r <- rotate(aa.ph2, c("Earth", "Neptune")) > cophyloplot(aa.ph1, aa.ph2r, assoc=ass, space=30, lty=2) > par(oldpar); dev.off() pdf 2 > plot(consensus(aa.ph1, aa.ph2r)) > aa12.match <- Hclust.match(aa.d1, aa.d2) # shipunov > library(cetcolor) > cols <- cet_pal(max(aa12.match), "blues") > heatmap(aa12.match, scale="none", col=cols) > aa.h <- hclust(dist(t(atmospheres))) > MRH(aa.h, method="branches") # shipunov [,1] [,2] [,3] [,4] [,5] [,6] [,7] Mercury 0 0 0 0 1 1 1 Venus 1 0 0 0 0 1 1 Earth 0 0 0 0 1 1 1 Mars 1 0 0 0 0 1 1 Jupiter 0 0 1 1 0 0 1 Saturn 0 0 1 1 0 0 1 Uranus 0 1 0 1 0 0 1 Neptune 0 1 0 1 0 0 1 > oldpar <- par() > m1.dist <- as.dist(m1.s) > m1.hclust <- hclust(m1.dist) > m1.c <- cmdscale(m1.dist) > library(vegan3d) > orditree3d(m1.c, m1.hclust, text=attr(m1.dist, "Labels"), type="t") > par(oldpar) Warning in par(oldpar) : graphical parameter "cin" cannot be set Warning in par(oldpar) : graphical parameter "cra" cannot be set Warning in par(oldpar) : graphical parameter "csi" cannot be set Warning in par(oldpar) : graphical parameter "cxy" cannot be set Warning in par(oldpar) : graphical parameter "din" cannot be set Warning in par(oldpar) : graphical parameter "page" cannot be set > # \subsection{How good are resulted clusters} > > pdf(file="pics/58350.pdf"); oldpar <- par(mar=c(1, 3, 1, 1)) > (m1.j <- Jclust(m1, 3, iter=1000)) # shipunov 
........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ Bootstrap support for 3 clusters, 1000 iterations: support cluster members 91.65 1 Bobrovyj, Zakhar 81.90 2 Ekslibris, Gnutyj, Leda, Slitnyj, Tajnik 62.40 3 Malinovyj.Kruglyj, Verik > plot(m1.j, rect.lty=2, sub="") > par(oldpar); dev.off() pdf 2 > pdf(file="pics/29730.pdf"); oldpar <- par(mar=c(0, 2, 0, 0)) > library(pvclust) > m1.pvclust <- pvclust(t(m1), method.dist="manhattan", + nboot=100, parallel=TRUE) Creating a temporary cluster...done: socket cluster with 3 nodes on host ‘localhost’ Multiscale bootstrap... Done. > plot(m1.pvclust, col.pv=c(au="red", bp="darkgreen", edge=0), main="") > par(oldpar); dev.off() pdf 2 > m1.ba <- BootA(m1, FUN=function(.x) # shipunov + as.phylo(hclust(dist(.x, method="minkowski"), + method="average")), iter=100, mc.cores=4) Running parallel bootstraps... done. Calculating bootstrap values... done. 
> plot(m1.ba$boot.tree, show.node.label=TRUE) > plot(m1.ba$cons.tree) # majority-rule consensus > aa.bcl <- Bclust(t(atmospheres), iter=100) # shipunov .................................................................................................... > plot(aa.bcl$hclust) > Bclabels(aa.bcl$hclust, aa.bcl$values, pos=3, offset=0.1) # shipunov > # \subsection{Making groups: $k$-means and friends} > > iris.km <- kmeans(iris[, -5], nstart=5, centers=3) # 3 clusters > str(iris.km$cluster) int [1:150] 2 2 2 2 2 2 2 2 2 2 ... > Misclass(iris$Species, iris.km$cluster, best=TRUE) Best classification table: obs pred 2 3 1 setosa 50 0 0 versicolor 0 48 2 virginica 0 14 36 Misclassification errors (%): 2 3 1 0.0 22.6 5.3 Mean misclassification error: 9.3% > pdf(file="pics/92950.pdf"); oldpar <- par(mar=c(0, 0, 0, 0)) > library(kernlab) Attaching package: ‘kernlab’ The following object is masked from ‘package:permute’: how The following object is masked from ‘package:ggplot2’: alpha > data(spirals) > set.seed(1) > sc <- specc(spirals, centers=2) > plot(spirals, col=sc, xlab="", ylab="") > par(oldpar); dev.off() pdf 2 > library(cluster) > iris.f <- fanny(iris[, 1:4], 3) > head(data.frame(sp=iris[, 5], iris.f$membership)) sp X1 X2 X3 1 setosa 0.9142273 0.03603116 0.04974153 2 setosa 0.8594576 0.05854637 0.08199602 3 setosa 0.8700857 0.05463714 0.07527719 4 setosa 0.8426296 0.06555926 0.09181118 5 setosa 0.9044503 0.04025288 0.05529687 6 setosa 0.7680227 0.09717445 0.13480286 > Misclass(iris.f$clustering, iris[, 5], best=TRUE) # shipunov Best classification table: obs pred setosa virginica versicolor 1 50 0 0 2 0 41 4 3 0 9 46 Misclassification errors (%): setosa virginica versicolor 0 18 8 Mean misclassification error: 8.7% > # \subsection{How to know cluster numbers} > > pdf(file="pics/59100.pdf"); oldpar <- par(mar=c(5, 0, 3, 0)) > library(cluster) > eq.a <- agnes(eq[, -1]) > plot(eq.a, which=1, col=c(0, "maroon")) > par(oldpar); dev.off() pdf 2 > library(mclust) Package 
'mclust' version 5.4.6 Type 'citation("mclust")' for citing this R package in publications. > iris.mclust <- Mclust(iris[, -5]) > summary(iris.mclust) # 2 clusters ---------------------------------------------------- Gaussian finite mixture model fitted by EM algorithm ---------------------------------------------------- Mclust VEV (ellipsoidal, equal shape) model with 2 components: log-likelihood n df BIC ICL -215.726 150 26 -561.7285 -561.7289 Clustering table: 1 2 50 100 > iris.mclust$classification [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 [38] 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 [75] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 [112] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 [149] 2 2 > library(dbscan) > kNNdistplot(iris[, -5], k = 5) # look on the knee > abline(h=.5, col = "red", lty=2) > (iris.dbscan <- dbscan(iris[, -5], eps = .5, minPts = 5)) DBSCAN clustering for 150 objects. Parameters: eps = 0.5, minPts = 5 The clustering contains 2 cluster(s) and 17 noise points. 
0 1 2 17 49 84 Available fields: cluster, eps, minPts > Misclass(iris.dbscan$cluster, iris$Species, best=TRUE, ignore="0") Best classification table: setosa versicolor virginica 1 49 0 0 2 0 44 40 0 0 0 Misclassification errors (%): setosa versicolor virginica 0 0 100 Mean misclassification error: 33.3% Note: data contains NAs > plot(iris.p, type="n", xlab="", ylab="") > text(iris.p, labels=abbreviate(iris[, 5], 1, + method="both.sides"), col=iris.dbscan$cluster+1) > iris.hdbscan <- hdbscan(iris[, -5], minPts=10) > plot(iris.hdbscan$hc) > library(MeanShift) Loading required package: parallel Loading required package: wavethresh WaveThresh: R wavelet software, release 4.6.8, installed Copyright Guy Nason and others 1993-2016 Note: nlevels has been renamed to nlevelsWT > bandwidth <- quantile(dist(iris[, -5]), 0.25) > (bmsClustering(t(iris[, -5]), h=bandwidth)) Running blurring mean-shift algorithm... Blurring mean-shift algorithm ran successfully. Finding clusters... The algorithm found 3 clusters. $components mode1 mode2 mode3 Sepal.Length 5.0007162 6.179920 7.104628 Sepal.Width 3.4178891 2.869793 3.070013 Petal.Length 1.4701932 4.800869 5.957860 Petal.Width 0.2442342 1.640072 2.046728 $labels [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 [38] 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 [75] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3 2 2 2 2 2 [112] 2 2 2 2 2 2 3 3 2 2 2 3 2 2 2 2 2 2 2 2 3 2 2 2 3 2 2 2 2 2 2 2 2 2 2 2 2 [149] 2 2 > aa <- prcomp(iris[, -5])$x[, 1] # only PC1 > Histr(aa, overlay="density", breaks=10) > bb <- c(dist(iris[, -5])) # converts "distance" object into vector > Histr(bb, overlay="density", breaks=10) > pdf(file=NULL); oldpar <- par() > library(NbClust) > iris.nb <- NbClust(iris[, -5], method="ward.D") # wait! *** : The Hubert index is a graphical method of determining the number of clusters. 
In the plot of Hubert index, we seek a significant knee that corresponds to a significant increase of the value of the measure i.e the significant peak in Hubert index second differences plot. *** : The D index is a graphical method of determining the number of clusters. In the plot of D index, we seek a significant knee (the significant peak in Dindex second differences plot) that corresponds to a significant increase of the value of the measure. ******************************************************************* * Among all indices: * 9 proposed 2 as the best number of clusters * 10 proposed 3 as the best number of clusters * 3 proposed 6 as the best number of clusters * 1 proposed 10 as the best number of clusters * 1 proposed 15 as the best number of clusters ***** Conclusion ***** * According to the majority rule, the best number of clusters is 3 ******************************************************************* > par(oldpar); dev.off() Warning in par(oldpar) : graphical parameter "cin" cannot be set Warning in par(oldpar) : graphical parameter "cra" cannot be set Warning in par(oldpar) : graphical parameter "csi" cannot be set Warning in par(oldpar) : graphical parameter "cxy" cannot be set Warning in par(oldpar) : graphical parameter "din" cannot be set Warning in par(oldpar) : graphical parameter "page" cannot be set pdf 2 > # \subsection{Use projection pursuit for clustering} > > library(PPCI) Loading required package: rARPACK > iris.mcdr <- mcdr(iris[, -5], p=3) # dimension reduction > plot(iris.mcdr$fitted, col=iris$Species) > iris.mcdc <- mcdc(iris[, -5], K=3) # clustering > plot(iris.mcdc, labels=iris$Species) > Misclass(iris.mcdc$cluster, iris$Species, best=TRUE) # shipunov Best classification table: obs pred virginica setosa versicolor 1 49 0 2 2 0 50 0 3 1 0 48 Misclassification errors (%): virginica setosa versicolor 2 0 4 Mean misclassification error: 2% > # \subsection{How to compare different ordinations} > > pdf(file="pics/93900.pdf"); oldpar 
<- par(mar=c(0, 0, 0, 0)) > set.seed(42) > library(Rtsne) > iris.unique <- unique(iris) > tsne.out <- Rtsne(as.matrix(iris.unique[, -5])) > irisu.pca <- prcomp(iris.unique[, -5]) > irisu.p <- irisu.pca$x[, 1:2] > library(vegan) > irisu.pr <- procrustes(irisu.p, tsne.out$Y) > plot(irisu.pr, ar.col=iris.unique$Species, pch=" ", + xlab="", ylab="", main="") # arrows point to target (PCA) > points(irisu.pr$Yrot, col=iris.unique$Species, pch=21, bg="white") > with(iris.unique, legend("topright", lty=1, col=1:nlevels(Species), + legend=levels(Species), bty="n")) > legend("bottomright", lty=2:1, legend=c("PCA", "t-SNE"), bty="n") > set.seed(NULL) > par(oldpar); dev.off() pdf 2 > # \section{Answers to exercises} > > s50 <- read.table("data/stars.txt", h=TRUE, sep="\t", + as.is=TRUE, quote="") > str(s50) 'data.frame': 50 obs. of 13 variables: $ STAR.N : int 1 2 3 4 5 6 7 8 9 10 ... $ NAME : chr "Sirius" "Canopus" "Arcturus" "Rigil Kentaurus" ... $ IDENT : chr "9Alpha" "Alpha" "16Alpha" "Alpha1" ... $ CONSTEL.A: chr "CMa" "Car" "Boo" "Cen" ... $ CONSTEL : chr "Canis Major" "Carina" "Bootes" "Centaurus" ... $ VMAG : num -1.46 -0.72 -0.04 -0.01 0.03 0.08 0.12 0.38 0.46 0.5 ... $ DIST : num 8.6 312.6 36.7 4.4 25.3 ... $ PREC : num 0.4 5.1 0.8 0.2 0.4 1.2 19.2 0.3 2.5 21.5 ... $ AMAG : num 1.43 -5.63 -0.3 4.34 0.58 -0.48 -6.75 2.66 -2.76 -5.09 ... $ SPEC : chr "A1Vm" "F0II" "K1.5IIIFe-0.5" "G2V" ... $ RA : chr "06 45 08.9" "06 23 57.1" "14 15 39.7" "14 39 35.9" ... $ DEC : chr "-16 42 58" "-52 41 45" "+19 10 57" "-60 50 07" ... $ HIP : int 32349 30438 69673 71683 91262 24608 24436 37279 7588 27989 ... 
> RA10 <- as.numeric(substr(s50$RA, 1, 2)) + # hours + as.numeric(substr(s50$RA, 4, 5))/60 + # minutes + as.numeric(substr(s50$RA, 7, 10))/3600 # seconds > DEC10 <- sign(as.numeric(substr(s50$DEC, 1, 3))) * + (as.numeric(substr(s50$DEC, 2, 3)) + + as.numeric(substr(s50$DEC, 5, 6))/60 + + as.numeric(substr(s50$DEC, 8, 9))/3600) > coo <- cbind(RA10, DEC10) > oldpar <- par(bg="black", fg="white", mar=rep(0, 4)) > plot(coo, pch="*", cex=(3 - s50$VMAG)) # constellation plot > Hulls(coo, as.numeric(factor(s50$CONSTEL)), # shipunov + usecolors=rep("white", nlevels(factor(s50$CONSTEL)))) > points(runif(100, min(RA10), max(RA10)), + runif(100, min(DEC10), max(DEC10)), pch=".") # random "stars" > par(oldpar) > plot(coo, type="n") # constellation names > text(coo, s50$CONSTEL.A) > plot(coo, type="n") # star names > text(coo, s50$NAME) > library(dbscan) > for (eps in 1:20) cat(c(eps, ":", + names(table(dbscan(coo, eps=eps)$cluster))), "\n") 1 : 0 2 : 0 3 : 0 4 : 0 1 5 : 0 1 6 : 0 1 7 : 0 1 8 : 0 1 2 9 : 0 1 2 3 10 : 0 1 2 3 11 : 0 1 2 12 : 0 1 2 13 : 0 1 2 14 : 0 1 15 : 0 1 16 : 0 1 17 : 0 1 18 : 0 1 19 : 0 1 20 : 0 1 > pdf(file="pics/108660.pdf") > s50.db <- dbscan(coo, eps=9) > oldpar <- par(bg="black", fg="white", mar=rep(0, 4)) > plot(coo, pch=8, cex=(3 - s50$VMAG)) > Hulls(coo, s50.db$cluster, # shipunov + usecolors=c("black", "white", "white", "white")) > points(runif(100, min(RA10), max(RA10)), + runif(100, min(DEC10), max(DEC10)), pch=".") > par(oldpar) > dev.off() pdf 2 > Adj.Rand(as.numeric(factor(s50$CONSTEL)), s50.db$cluster) # shipunov [1] 0.1061416 > beer <- read.table("data/beer.txt", sep="\t", h=TRUE) > head(beer) C01 C02 C03 C04 C05 C06 C07 C08 C09 C10 C11 C12 C13 C14 C15 C16 Afanas.light 0 1 0 1 0 0 0 0 1 0 1 0 1 0 0 0 Baltika.3 0 1 0 0 0 0 1 0 1 1 1 0 0 0 0 1 Baltika.6 0 1 1 1 1 0 1 0 1 1 1 0 1 0 0 1 Baltika.9 0 1 1 1 1 0 1 0 1 1 1 0 1 0 0 1 Bochk.light 0 1 0 0 1 1 0 1 1 1 1 1 0 0 0 1 Budweiser 0 1 1 0 1 0 1 0 1 1 0 1 0 0 0 0 C17 C18 C19 C20 Afanas.light 0 0 
0 0 Baltika.3 0 0 0 0 Baltika.6 0 0 0 1 Baltika.9 0 0 0 1 Bochk.light 0 0 0 0 Budweiser 1 0 0 1 > pdf(file="pics/38270.pdf"); oldpar <- par(mar=c(0, 2, 0, 1)) > library(vegan) > beer.d <- vegdist(beer, "jaccard") > plot(hclust(beer.d, method="ward.D"), main="", xlab="", sub="") > par(oldpar); dev.off() pdf 2 > pdf(file="pics/62550.pdf"); oldpar <- par(mar=c(0, 0, 0, 0)) > kubricks <- read.table("data/kubricks.txt", h=TRUE, row.names=1) > kubricks.d <- dist(kubricks, method="binary") > kubricks.c <- cmdscale(kubricks.d) > plot(kubricks.c, type="n", axes=FALSE) > text(kubricks.c, labels=row.names(kubricks), cex=2) > library(vegan) > lines(spantree(kubricks.d), kubricks.c[, 1:2], lty=2) > par(oldpar); dev.off() pdf 2 > library(phangorn) Attaching package: ‘phangorn’ The following objects are masked from ‘package:vegan’: diversity, treedist > k <- as.phyDat(data.frame(t(kubricks)), type="USER", + levels = c(0, 1)) > kd <- dist.hamming(k) # Hamming distance for morphological data > kdnj <- NJ(kd) # neighbor-joining tree for starter > kp <- optim.parsimony(kdnj, k) # find most parsimonous tree Final p-score 13 after 1 nni operations > ktree <- root(kp, outgroup="H", resolve.root=TRUE) # re-root > plot(ktree, cex=2) > # \chapter{Learn} > > iris.train <- iris[seq(1, nrow(iris), 5), ] > iris.unknown <- iris[-seq(1, nrow(iris), 5), ] > # \section{Learning with regression} > > # \subsection{Linear discriminant analysis} > > library(MASS) > iris.lda <- lda(Species ~ . 
, data=iris.train) > iris.predicted <- predict(iris.lda, iris.unknown[, 1:4]) > Misclass(iris.predicted$class, iris.unknown[, 5]) # shipunov Classification table: obs pred setosa versicolor virginica setosa 40 0 0 versicolor 0 40 7 virginica 0 0 33 Misclassification errors (%): setosa versicolor virginica 0.0 0.0 17.5 Mean misclassification error: 5.8% > ldam <- manova(as.matrix(iris.unknown[, 1:4]) ~ + iris.predicted$class) > summary(ldam, test="Wilks") Df Wilks approx F num Df den Df Pr(>F) iris.predicted$class 2 0.026878 145.34 8 228 < 2.2e-16 *** Residuals 117 --- Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 > summary(aov(as.matrix(iris.unknown[, 1:4]) ~ + iris.predicted$class)) Response Sepal.Length : Df Sum Sq Mean Sq F value Pr(>F) iris.predicted$class 2 51.047 25.5237 108.9 < 2.2e-16 *** Residuals 117 27.423 0.2344 --- Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 Response Sepal.Width : Df Sum Sq Mean Sq F value Pr(>F) iris.predicted$class 2 9.9452 4.9726 46.072 1.749e-15 *** Residuals 117 12.6278 0.1079 --- Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 Response Petal.Length : Df Sum Sq Mean Sq F value Pr(>F) iris.predicted$class 2 341.30 170.650 826.29 < 2.2e-16 *** Residuals 117 24.16 0.207 --- Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 Response Petal.Width : Df Sum Sq Mean Sq F value Pr(>F) iris.predicted$class 2 62.270 31.1350 828.54 < 2.2e-16 *** Residuals 117 4.397 0.0376 --- Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 > library(vegan) > adonis(as.matrix(iris.unknown[, 1:4]) ~ iris.predicted$class) Call: adonis(formula = as.matrix(iris.unknown[, 1:4]) ~ iris.predicted$class) Permutation: free Number of permutations: 999 Terms added sequentially (first to last) Df SumsOfSqs MeanSqs F.Model R2 Pr(>F) iris.predicted$class 2 1.85808 0.92904 437.53 0.88206 0.001 *** Residuals 117 0.24843 0.00212 0.11794 Total 119 2.10651 1.00000 --- Signif. 
codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 > pdf(file="pics/30320.pdf"); oldpar <- par(mar=c(4, 4, 0, 1)) > iris.lda2 <- lda(iris[, 1:4], iris[, 5]) > iris.ldap2 <- predict(iris.lda2, dimen=2)$x > plot(iris.ldap2, type="n", xlab="LD1", ylab="LD2") > text(iris.ldap2, labels=abbreviate(iris[, 5], 1, + method="both.sides")) > Ellipses(iris.ldap2, as.numeric(iris[, 5]), + centers=TRUE) # shipunov > par(oldpar); dev.off() pdf 2 > set.seed(16881) > iris.sample2 <- sample(nrow(iris), 30) > set.seed(NULL) # set random number generator default > iris.train2 <- iris[iris.sample2, ] > iris.unknown2 <- iris[-iris.sample2, ] > iris.lda2 <- lda(Species ~ . , data=iris.train2) > iris.predicted2 <- predict(iris.lda2, iris.unknown2[, 1:4]) > Misclass(iris.predicted2$class, iris.unknown2[, 5]) # shipunov Classification table: obs pred setosa versicolor virginica setosa 44 0 0 versicolor 0 32 12 virginica 0 0 32 Misclassification errors (%): setosa versicolor virginica 0.0 0.0 27.3 Mean misclassification error: 9.1% > table(iris.train2$Species) setosa versicolor virginica 6 18 6 > sam <- Class.sample(iris[, 5], nsam=10) # shipunov > nrow(iris[sam, ]) [1] 30 > RSS <- function() { + sam <- Class.sample(iris[, 5], 5) # 5 samples per class + iris.lda <- lda(Species ~ ., data=iris[sam, ]) + iris.ldp <- predict(iris.lda, iris[!sam, -5]) + iris.ldm <- Misclass(iris.ldp$class, iris[!sam, 5], quiet=TRUE) + mean(((CS <- colSums(iris.ldm)) - diag(iris.ldm))/CS) * 100 + } > res <- replicate(100, RSS()) > quantile(res, c(0.025, 0.5, 0.975)) 2.5% 50% 97.5% 1.833333 4.444444 10.370370 > hist(res) > # \subsection{Recursive partitioning} > > pdf(file="pics/30520.pdf"); oldpar <- par(mar=c(0, 1, 0, 1)) > library(tree) > iris.tree <- tree(Species ~ ., data=iris) > plot(iris.tree) > text(iris.tree) > par(oldpar); dev.off() pdf 2 > iris.tree2 <- tree(Species ~ ., data=iris.train) > iris.tp2 <- predict(iris.tree2, iris.unknown[, -5], type="class") > Misclass(iris.tp2, iris.unknown[, 5]) # 
shipunov Classification table: obs pred setosa versicolor virginica setosa 40 0 0 versicolor 0 38 6 virginica 0 2 34 Misclassification errors (%): setosa versicolor virginica 0 5 15 Mean misclassification error: 6.7% > pdf(file="pics/69320_to_crop.pdf"); oldpar <- par(mar=c(0, 0, 0, 0)) > library(party) Loading required package: mvtnorm Attaching package: ‘mvtnorm’ The following object is masked from ‘package:mclust’: dmvnorm Loading required package: modeltools Loading required package: stats4 Attaching package: ‘modeltools’ The following object is masked from ‘package:kernlab’: prior Loading required package: strucchange Loading required package: zoo Attaching package: ‘zoo’ The following objects are masked from ‘package:base’: as.Date, as.Date.numeric Loading required package: sandwich Attaching package: ‘party’ The following object is masked from ‘package:ape’: where > SA <- abbreviate(iris$Species, 1, method="both.sides") > iris.ct <- ctree(factor(SA) ~ ., data=iris[, 1:4]) > plot(iris.ct) > Misclass(predict(iris.ct), SA) # shipunov Classification table: obs pred a s v a 49 0 5 s 0 50 0 v 1 0 45 Misclassification errors (%): a s v 2 0 10 Mean misclassification error: 4% > par(oldpar); dev.off() pdf 2 > library(OneR) Attaching package: ‘OneR’ The following object is masked from ‘package:PPCI’: breastcancer > iris.one <- OneR(Species ~ ., data=iris.train) Warning in OneR.data.frame(x = data, ties.method = ties.method, verbose = verbose, : data contains unused factor levels > summary(iris.one) Call: OneR.formula(formula = Species ~ ., data = iris.train) Rules: If Petal.Width = (0.198,0.66] then Species = setosa If Petal.Width = (0.66,1.12] then Species = versicolor If Petal.Width = (1.12,1.58] then Species = versicolor If Petal.Width = (1.58,2.04] then Species = virginica If Petal.Width = (2.04,2.5] then Species = virginica Accuracy: 28 of 30 instances classified correctly (93.33%) Contingency table: Petal.Width Species (0.198,0.66] (0.66,1.12] (1.12,1.58] 
(1.58,2.04] (2.04,2.5] Sum setosa * 10 0 0 0 0 10 versicolor 0 * 2 * 6 2 0 10 virginica 0 0 0 * 3 * 7 10 Sum 10 2 6 5 7 30 --- Maximum in each column: '*' Pearson's Chi-squared test: X-squared = 52.8, df = 8, p-value = 1.179e-08 > iris.pre <- predict(iris.one, iris.unknown[, -5]) > Misclass(as.character(iris.pre), iris.unknown$Species, + ignore="UNSEEN", useNA="ifany", best=TRUE) # shipunov Best classification table: obs pred setosa versicolor virginica setosa 35 0 0 versicolor 0 37 3 virginica 0 3 37 5 0 0 Misclassification errors (%): setosa versicolor virginica 12.5 7.5 7.5 Mean misclassification error: 9.2% Note: data contains NAs > # \section{Ensemble learning} > > # \subsection{Random Forest} > > library(randomForest) randomForest 4.6-14 Type rfNews() to see new features/changes/bug fixes. Attaching package: ‘randomForest’ The following object is masked from ‘package:ggplot2’: margin > set.seed(17) # Random Forest is stochastic method > iris.rf <- randomForest(Species ~ ., data=iris.train) > iris.rfp <- predict(iris.rf, iris.unknown[, -5]) > Misclass(iris.rfp, iris.unknown[, 5]) # shipunov Classification table: obs pred setosa versicolor virginica setosa 40 0 0 versicolor 0 39 7 virginica 0 1 33 Misclassification errors (%): setosa versicolor virginica 0.0 2.5 17.5 Mean misclassification error: 6.7% > pdf(file="pics/30890.pdf"); oldpar <- par(mar=c(2, 2, 0, 1)) > set.seed(17) > iris.urf <- randomForest(iris[, -5]) # no response variable > iris.rfc <- cmdscale(1 - iris.urf$proximity) > plot(iris.rfc, xlab="", ylab="", + pch=abbreviate(iris[, 5], 1, method="both.sides")) > Hulls(iris.rfc, as.numeric(iris[, 5]), outliers=FALSE, + coef=0.8, usecolor=rep(1, 3), lty=2) # shipunov > Biarrows(iris.rfc, iris[, -5], shift=c(-0.1, 0)) # shipunov > par(oldpar); dev.off() pdf 2 > # \subsection{Gradient boosting} > > library(gbm) Loaded gbm 2.1.8 > set.seed(4) > iris.gbm <- gbm(Species ~ ., + data=rbind(iris.train, iris.train)) # to make training bigger Distribution not 
specified, assuming multinomial ... Warning: Setting `distribution = "multinomial"` is ill-advised as it is currently broken. It exists only for backwards compatibility. Use at your own risk. > iris.gbm.p1 <- predict(iris.gbm, iris.unknown[, -5], + n.trees=iris.gbm$n.trees) > iris.gbm.p2 <- apply(iris.gbm.p1, 1, # membership trick + function(.x) colnames(iris.gbm.p1)[which.max(.x)]) > Misclass(iris.gbm.p2, iris.unknown[, 5]) # shipunov Classification table: obs pred setosa versicolor virginica setosa 40 0 0 versicolor 0 38 6 virginica 0 2 34 Misclassification errors (%): setosa versicolor virginica 0 5 15 Mean misclassification error: 6.7% > # \section{Learning with proximity} > > library(class) > iris.knn.pred <- knn(train=iris.train[, -5], + test=iris.unknown[, -5], cl=iris.train[, 5], k=5) > Misclass(iris.knn.pred, iris.unknown[, 5]) # shipunov Classification table: obs pred setosa versicolor virginica setosa 40 0 0 versicolor 0 40 11 virginica 0 0 29 Misclassification errors (%): setosa versicolor virginica 0.0 0.0 27.5 Mean misclassification error: 9.2% > pdf(file="pics/97860.pdf"); oldpar <- par(mar=c(0, 0, 0, 0)) > iris.p <- prcomp(iris[, -5])$x[, 1:2] > iris.p1 <- iris.p[seq(1, nrow(iris.p), 5), ] > iris.p2 <- iris.p[-seq(1, nrow(iris.p), 5), ] > library(tripack) > iris.v <- voronoi.mosaic(iris.p1[, 1], iris.p1[, 2], + duplicate="remove") > plot(iris.v, do.points=FALSE, main="", sub="") > points(iris.p1[, 1:2], col=iris.train$Species, pch=16, cex=2) > points(iris.p2[, 1:2], col=iris.unknown$Species) > par(oldpar); dev.off() pdf 2 > library(stringdist) > library(english) > iris.en <- apply(round(iris[, -5]), 1, + function(.x) paste(as.character(as.english(.x)), collapse=" ")) > head(iris.en) [1] "five four one zero" "five three one zero" "five three one zero" [4] "five three two zero" "five four one zero" "five four two zero" > iris.end <- stringdistmatrix(iris.en) > cl1 <- iris$Species > sam <- Class.sample(iris[, 5], nsam=10, uniform=TRUE) > cl1[!sam] <- 
NA # DNN wants unknown classes as NAs > iris.pred <- DNN(dst=iris.end, cl=cl1, k=5) > Misclass(iris.pred, iris$Species[is.na(cl1)]) # shipunov Classification table: obs pred setosa versicolor virginica setosa 40 3 2 versicolor 0 34 5 virginica 0 3 33 Misclassification errors (%): setosa versicolor virginica 0.0 15.0 17.5 Mean misclassification error: 10.8% > library(ddalpha) Loading required package: robustbase Loading required package: sfsmisc Loading required package: geometry > iris.dd <- ddalpha.train(Species ~ ., data=iris.train) Selected columns: Species, Sepal.Length, Sepal.Width, Petal.Length, Petal.Width > iris.p <- predict(iris.dd, iris.unknown[, -5]) # default method > Misclass(unlist(iris.p), iris.unknown[, 5]) # shipunov Classification table: obs pred setosa versicolor virginica setosa 40 0 0 versicolor 0 40 7 virginica 0 0 33 Misclassification errors (%): setosa versicolor virginica 0.0 0.0 17.5 Mean misclassification error: 5.8% > iris.pp <- predict(iris.dd, iris.unknown[, -5], + outsider.method="Ignore") > sapply(iris.pp, as.character) # shows points outside train clouds [1] "Ignored" "Ignored" "Ignored" "Ignored" "Ignored" [6] "Ignored" "Ignored" "Ignored" "Ignored" "Ignored" [11] "Ignored" "Ignored" "Ignored" "setosa" "Ignored" [16] "Ignored" "Ignored" "Ignored" "Ignored" "Ignored" [21] "Ignored" "setosa" "setosa" "Ignored" "Ignored" [26] "Ignored" "Ignored" "setosa" "Ignored" "Ignored" [31] "Ignored" "setosa" "Ignored" "Ignored" "Ignored" [36] "Ignored" "Ignored" "Ignored" "Ignored" "setosa" [41] "versicolor" "Ignored" "Ignored" "Ignored" "Ignored" [46] "Ignored" "Ignored" "Ignored" "Ignored" "Ignored" [51] "Ignored" "Ignored" "Ignored" "Ignored" "Ignored" [56] "Ignored" "Ignored" "Ignored" "Ignored" "versicolor" [61] "Ignored" "Ignored" "versicolor" "Ignored" "Ignored" [66] "Ignored" "Ignored" "Ignored" "Ignored" "Ignored" [71] "Ignored" "versicolor" "versicolor" "versicolor" "Ignored" [76] "versicolor" "versicolor" "versicolor" "Ignored" 
"versicolor" [81] "Ignored" "virginica" "Ignored" "Ignored" "Ignored" [86] "Ignored" "Ignored" "Ignored" "Ignored" "Ignored" [91] "Ignored" "Ignored" "Ignored" "Ignored" "Ignored" [96] "Ignored" "Ignored" "Ignored" "Ignored" "Ignored" [101] "Ignored" "Ignored" "Ignored" "Ignored" "Ignored" [106] "Ignored" "Ignored" "Ignored" "Ignored" "Ignored" [111] "Ignored" "Ignored" "Ignored" "Ignored" "virginica" [116] "Ignored" "Ignored" "Ignored" "Ignored" "Ignored" > # \section{Learning with rules} > > library(e1071) > iris.nb <- naiveBayes(Species ~ ., data=iris.train) > iris.nb Naive Bayes Classifier for Discrete Predictors Call: naiveBayes.default(x = X, y = Y, laplace = laplace) A-priori probabilities: Y setosa versicolor virginica 0.3333333 0.3333333 0.3333333 Conditional probabilities: Sepal.Length Y [,1] [,2] setosa 5.16 0.2988868 versicolor 5.96 0.6257440 virginica 6.94 0.5059644 Sepal.Width Y [,1] [,2] setosa 3.47 0.4423423 versicolor 2.87 0.4270051 virginica 3.10 0.1490712 Petal.Length Y [,1] [,2] setosa 1.49 0.1663330 versicolor 4.32 0.3966527 virginica 5.77 0.4762119 Petal.Width Y [,1] [,2] setosa 0.26 0.0843274 versicolor 1.34 0.2366432 virginica 2.19 0.2282786 > iris.nbp <- predict(iris.nb, iris.unknown[, -5]) > Misclass(iris.nbp, iris.unknown[, 5]) # shipunov Classification table: obs pred setosa versicolor virginica setosa 40 0 0 versicolor 0 39 13 virginica 0 1 27 Misclassification errors (%): setosa versicolor virginica 0.0 2.5 32.5 Mean misclassification error: 11.7% > library(arulesCBA) Loading required package: Matrix Loading required package: arules Attaching package: ‘arules’ The following object is masked from ‘package:modeltools’: info The following object is masked from ‘package:wavethresh’: support The following object is masked from ‘package:kernlab’: size The following objects are masked from ‘package:base’: abbreviate, write Attaching package: ‘arulesCBA’ The following object is masked from ‘package:party’: response > irisd <- 
as.data.frame(lapply(iris[1:4], discretize, breaks=9)) > irisd$Species <- iris$Species > irisd.train <- irisd[seq(1, nrow(irisd), 5), ] > irisd.unknown <- irisd[-seq(1, nrow(irisd), 5), ] > irisd.cba <- CBA(Species ~ ., irisd.train, supp=0.05, conf=0.8) > inspect(irisd.cba$rules) lhs rhs support confidence [1] {Petal.Length=[1.5,2.63)} => {Species=setosa} 0.2000000 1.0000000 [2] {Petal.Length=[5.7,6.9]} => {Species=virginica} 0.2000000 1.0000000 [3] {Petal.Width=[2.14,2.5]} => {Species=virginica} 0.2000000 1.0000000 [4] {Petal.Width=[0.2,0.211)} => {Species=setosa} 0.2000000 1.0000000 [5] {Petal.Width=[0.867,1.3)} => {Species=versicolor} 0.1333333 1.0000000 [6] {Petal.Width=[0.211,0.867)} => {Species=setosa} 0.1333333 1.0000000 [7] {Petal.Length=[4.5,4.9)} => {Species=versicolor} 0.1333333 1.0000000 [8] {Petal.Length=[4.1,4.5)} => {Species=versicolor} 0.1333333 1.0000000 [9] {} => {Species=virginica} 0.3333333 0.3333333 coverage lift count size coveredTransactions totalErrors [1] 0.2000000 3 6 2 6 14 [2] 0.2000000 3 6 2 6 8 [3] 0.2000000 3 6 2 3 5 [4] 0.2000000 3 6 2 2 3 [5] 0.1333333 3 4 2 4 3 [6] 0.1333333 3 4 2 2 1 [7] 0.1333333 3 4 2 4 1 [8] 0.1333333 3 4 2 2 0 [9] NA 1 30 1 1 0 > irisd.cbap <- predict(irisd.cba, irisd.unknown) > Misclass(irisd.cbap, irisd.unknown$Species) # shipunov Classification table: obs pred setosa versicolor virginica setosa 37 0 0 versicolor 0 31 3 virginica 3 9 37 Misclassification errors (%): setosa versicolor virginica 7.5 22.5 7.5 Mean misclassification error: 12.5% > # \section{Learning from the black boxes} > > # \subsection{Support Vector Machines} > > library(e1071) > iris.svm <- svm(Species ~ ., data=iris.train) > iris.svmp <- predict(iris.svm, iris.unknown[, -5]) > Misclass(iris.svmp, iris.unknown[, 5]) # shipunov Classification table: obs pred setosa versicolor virginica setosa 40 0 0 versicolor 0 40 14 virginica 0 0 26 Misclassification errors (%): setosa versicolor virginica 0 0 35 Mean misclassification error: 11.7% > 
pdf(file="pics/70010.pdf"); oldpar <- par(mar=c(2, 2, 0, 1)) > iris.p <- prcomp(iris[, -5])$x[, 1:2] > iris.svm.pca <- svm(Species ~ ., data=cbind(iris[5], iris.p)) > plot(iris.p, type="n", xlab="", ylab="") > Gradd(iris.svm.pca, iris.p, cex=0.4) # shipunov > text(iris.p, col=as.numeric(iris[, 5]), + labels=abbreviate(iris[, 5], 1, method="both.sides")) > par(oldpar); dev.off() pdf 2 > # \subsection{Neural Networks} > > library(nnet) > iris.nn <- nnet(Species ~ . , data=iris.train, size=4) # weights: 35 initial value 38.235938 iter 10 value 0.305649 iter 20 value 0.019828 iter 30 value 0.001140 iter 40 value 0.000331 iter 50 value 0.000155 final value 0.000086 converged > iris.nnp <- predict(iris.nn, iris.unknown[, -5], type="class") > Misclass(iris.nnp, iris.unknown$Species) # shipunov Classification table: obs pred setosa versicolor virginica setosa 40 1 0 versicolor 0 31 10 virginica 0 8 30 Misclassification errors (%): setosa versicolor virginica 0.0 22.5 25.0 Mean misclassification error: 15.8% > pdf(file="pics/112300_to_crop.pdf") > library(NeuralNetTools) > oldpar <- par(mar=c(0, 2, 0, 1), xpd=TRUE) > plotnet(iris.nn) > par(oldpar) > dev.off() pdf 2 > Perceptron <- function(x, y, eta, n) { + ## x dataset, y classes, eta learning speed, n cycles + ## make initial weights randomly + weight <- runif(ncol(x) + 1) + ## make initial errors vector: we need it for visualization + errors <- rep(0, n) + ## loop over number of epochs (learning cycles) n + for (jj in 1:n) { + ## loop through training data set + for (ii in 1:length(y)) { + ## main equation: "w1*x + w2*y + ... 
+ b" (_w_eights and _b_ias) + z <- sum(as.numeric(x[ii, ]) * weight[-1]) + weight[1] + ## decision step: Heaviside activation + ypred <- as.numeric(z > 0) # all > 0 become 1 + ## update weight, it changes if the predicted value is incorrect + weightdiff <- eta * (y[ii] - ypred) * c(1, as.numeric(x[ii, ])) + weight <- weight + weightdiff + ## update error vector + if ((y[ii] - ypred) != 0) errors[jj] <- errors[jj] + 1 + } + } + ## name coefficients + names(weight) <- c("bias", names(x)) + ## output will be a list + list(weight=weight, errors=errors) + } > ## new "species": "setosa" and "non-setosa" (1 or 0) > > y <- as.numeric(iris[, 5] == "setosa") > ## make sample (10% of each class) > > sam <- Class.sample(y, prop=0.1, uniform=TRUE) # shipunov > ## training classes > > y.trn <- y[sam] > ## select two characters (but more characters is usually better) > > x <- iris[, c(1, 3)] > ## check if everything is separable > > plot(x, col=y+1) > ## training data > > x.trn <- x[sam, ] > (P <- Perceptron(x=x.trn, y=y.trn, eta=0.5, n=10)) $weight bias Sepal.Length Petal.Length 0.7122841 1.3388011 -2.2789806 $errors [1] 1 3 1 0 0 0 0 0 0 0 > ## make testing sample > > x.tst <- x[!sam, ] > ## select weights from perceptron object > > W <- P$weight > ## predict each row of x.tst with the main Perceptron() equation > > z.tst <- apply(x.tst, 1, function(.x) sum(.x * W[-1] + W[1])) > ## decision > > z.tst <- as.numeric(z.tst > 0) > ## testing classes to compare with prediction > > y.tst <- y[!sam] > Misclass(z.tst, y.tst) # shipunov Classification table: obs pred 0 1 0 74 0 1 16 45 Misclassification errors (%): 0 1 17.8 0.0 Mean misclassification error: 8.9% > pdf(file="pics/63190_to_crop.pdf") > library(NeuralNetTools) > oldpar <- par(mar=c(0, 2, 0, 1), xpd=TRUE) > plotnet(W, struct=c(2, 1), x_names=names(x), y_names="setosa") > par(oldpar) > dev.off() pdf 2 > # \section{Semi-supervised learning} > > library(SSL) Registered S3 methods overwritten by 'klaR': method from predict.rda 
vegan print.rda vegan plot.rda vegan > iris.10 <- seq(1, nrow(iris), 10) # only 10 labeled points! > iris.sslt1 <- sslSelfTrain(iris[iris.10, -5], iris[iris.10, 5], + iris[-iris.10, -5], nrounds=20, n=5) > iris.sslt2 <- levels(iris$Species)[iris.sslt1] > Misclass(iris.sslt2, iris[-iris.10, 5]) # shipunov Classification table: obs pred setosa versicolor virginica setosa 45 0 0 versicolor 0 43 6 virginica 0 2 39 Misclassification errors (%): setosa versicolor virginica 0.0 4.4 13.3 Mean misclassification error: 5.9% > set.seed(0) > library(kpeaks) > nclusters <- max(findk(iris[, -5])$pcounts) # step 1 > iris.cl <- kmeans(iris[, -5], centers=nclusters)$cluster # step 2 > sel <- Class.sample(iris.cl, 10) # shipunov > iris.svm <- svm(as.factor(iris.cl[sel]) ~ ., data=iris[sel, -5]) > iris.svmp <- predict(iris.svm, iris[!sel, -5]) # step 3 > Misclass(iris.svmp, iris[!sel, 5], best=TRUE) # shipunov Best classification table: obs pred virginica setosa versicolor 1 28 0 2 2 0 40 0 3 10 0 40 Misclassification errors (%): virginica setosa versicolor 26.3 0.0 4.8 Mean misclassification error: 10.4% > set.seed(NULL) > iris.dms <- Classproj(iris[, -5], iris$Species) # shipunov > plot(iris.dms$proj, col=iris$Species) > text(iris.dms$centers, levels(iris$Species), col=1:3) > sam <- Class.sample(iris$Species, 10) # shipunov > newclasses <- iris$Species > newclasses[!sam] <- NA > iris.dms <- Classproj(iris[, -5], newclasses) # shipunov > plot(iris.dms$proj, col=iris$Species, pch=ifelse(sam, 19, 1)) > text(iris.dms$centers, levels(iris$Species), col=1:3) > library(ksharp) > iris.km <- kmeans(iris[, -5], centers=3) > Misclass(iris.km$cluster, iris$Species, best=TRUE) # shipunov Best classification table: obs pred setosa versicolor virginica 1 50 0 0 2 0 48 14 3 0 2 36 Misclassification errors (%): setosa versicolor virginica 0 4 28 Mean misclassification error: 10.7% > names(iris.km$cluster) <- row.names(iris) # ksharp requirement > iris.ksharp <- ksharp(iris.km, data=iris[, -5]) > 
plot(prcomp(iris[, -5])$x, col=iris$Species, + pch=ifelse(iris.ksharp$cluster == 0, -1, 14)+iris.km$cluster) > legend("topright", col=c(1:3, rep(1, 4)), + pch=c(rep(19, 3), 15:17, 1), + legend=c("I. setosa", "I. versicolor", "I. virginica", + "Cluster I", "Cluster II", "Cluster III", "'noise'")) > Misclass(iris.ksharp$cluster, iris$Species, ignore=0, best=TRUE) Best classification table: obs pred setosa versicolor virginica 1 50 0 0 2 0 44 7 3 0 0 34 Misclassification errors (%): setosa versicolor virginica 0.0 0.0 17.1 Mean misclassification error: 5.7% Note: data contains NAs > aad <- dist(t(atmospheres)) > aadu <- Updist(aad, unlink=list(c("Earth", "Mercury"))) # shipunov > plot(hclust(aadu)) > # \section{How to choose the right method} > > # \section{Answers to exercises} > > pdf(file="pics/37980.pdf"); oldpar <- par(mar=c(0, 1, 0, 1)) > eq.tree <- tree(eq[, 1] ~ ., eq[, -1]) # shipunov > plot(eq.tree); text(eq.tree) > par(oldpar); dev.off() pdf 2 > # \chapter{Example of \R session} > > # \section{Starting...} > > dir("data") [1] "argon_c.txt" "argon.txt" "beer_c.txt" [4] "beer.txt" "bigaln2.txt" "bigaln_c.txt" [7] "bigaln.tps" "bigaln.txt" "bpress_c.txt" [10] "bpress.txt" "bugs_c.txt" "bugs.txt" [13] "cashiers_c.txt" "cashiers.txt" "com12_c.txt" [16] "com12.rd" "com80_c.txt" "com80.txt" [19] "dact_c.txt" "dact.txt" "d_c.txt" [22] "distances.nts" "d.txt" "eggs_c.txt" [25] "eggs.txt" "elections_c.txt" "elections.txt" [28] "eq8_c.txt" "eq8.txt" "errors_c.txt" [31] "errors.txt" "exams_c.txt" "exams.txt" [34] "grades_c.txt" "grades.txt" "ipomopsis_c.txt" [37] "ipomopsis.txt" "kubricks_c.txt" "kubricks.txt" [40] "lakesea_abio.txt" "lakesea_bio_abio_c.txt" "lakesea_bio.txt" [43] "logit_c.txt" "logit.txt" "mydata1.txt" [46] "mydata2.txt" "mydata3.txt" "mydata_c.txt" [49] "mydata.txt" "nd_c.txt" "nd.txt" [52] "pear2d_bw.png" "phanerozoic_c.txt" "phanerozoic.txt" [55] "poisoning_c.txt" "poisoning.txt" "seedlings_c.txt" [58] "seedlings.txt" "seeing_c.txt" "seeing.txt" 
[61] "species_c.txt" "species.txt" "spur_c.txt" [64] "spur.txt" "stars_c.txt" "stars.txt" [67] "sundew_c.txt" "sundew.txt" "tea_c.txt" [70] "tea.txt" "teeth_c.txt" "teeth.txt" [73] "traits_c.txt" "traits.txt" > data <- read.table("data/bugs.txt", h=TRUE) > data <- read.table("http://ashipunov.me/data/bugs.txt", h=TRUE) > head(data) SEX COLOR WEIGHT LENGTH 1 0 1 10.68 9.43 2 1 1 10.02 10.66 3 0 2 10.18 10.41 4 1 1 8.01 9.00 5 0 3 10.23 8.98 6 1 3 9.70 9.71 > str(data) 'data.frame': 10 obs. of 4 variables: $ SEX : int 0 1 0 1 0 1 1 0 1 1 $ COLOR : int 1 1 2 1 3 3 2 3 1 2 $ WEIGHT: num 10.68 10.02 10.18 8.01 10.23 ... $ LENGTH: num 9.43 10.66 10.41 9 8.98 ... > data.f <- data[data$SEX == 0, ] > data.m.big <- data[data$SEX == 1 & data$LENGTH > 10, ] > data$WEIGHT.R <- data$WEIGHT/data$LENGTH > write.table(data, file="data/bugs_new.txt", quote=FALSE) > # \section{Describing...} > > summary(data) SEX COLOR WEIGHT LENGTH Min. :0.0 Min. :1.00 Min. : 8.010 Min. : 8.970 1st Qu.:0.0 1st Qu.:1.00 1st Qu.: 9.707 1st Qu.: 9.023 Median :1.0 Median :2.00 Median :10.100 Median : 9.330 Mean :0.6 Mean :1.90 Mean :10.041 Mean : 9.582 3rd Qu.:1.0 3rd Qu.:2.75 3rd Qu.:10.568 3rd Qu.:10.182 Max. :1.0 Max. :3.00 Max. :11.450 Max. :10.660 WEIGHT.R Min. :0.8900 1st Qu.:0.9832 Median :1.0475 Mean :1.0496 3rd Qu.:1.1263 Max. :1.2156 > data1 <- data > data1$SEX <- factor(data1$SEX, labels=c("female", "male")) > data1$COLOR <- factor(data1$COLOR, + labels=c("red", "blue", "green")) > summary(data$WEIGHT) Min. 1st Qu. Median Mean 3rd Qu. Max. 
8.010 9.707 10.100 10.041 10.568 11.450 > min(data$WEIGHT) [1] 8.01 > max(data$WEIGHT) [1] 11.45 > median(data$WEIGHT) [1] 10.1 > mean(data$WEIGHT) [1] 10.041 > colMeans(data) SEX COLOR WEIGHT LENGTH WEIGHT.R 0.600000 1.900000 10.041000 9.582000 1.049648 > round(colMeans(data), 1) SEX COLOR WEIGHT LENGTH WEIGHT.R 0.6 1.9 10.0 9.6 1.0 > data2 <- data > data2[3, 3] <- NA > mean(data2$WEIGHT) [1] NA > mean(data2$WEIGHT, na.rm=TRUE) [1] 10.02556 > data2.o <- na.omit(data2) > sum(data$WEIGHT) [1] 100.41 > sum(data[2, ]) [1] 23.61996 > apply(data, 1, sum) [1] 22.24256 23.61996 23.56791 19.90000 23.34920 24.40897 22.89041 24.66560 [9] 21.18453 25.89735 > table(data$SEX) 0 1 4 6 > table(data$COLOR) 1 2 3 4 3 3 > 100*prop.table(table(data$SEX)) 0 1 40 60 > sd(data$WEIGHT) [1] 0.994501 > sapply(data[, 3:4], sd) WEIGHT LENGTH 0.9945010 0.6585641 > sapply(data2[, 3:4], sd, na.rm=TRUE) WEIGHT LENGTH 1.0535548 0.6585641 > 100*sd(data$WEIGHT)/mean(data$WEIGHT) [1] 9.904402 > tapply(data$WEIGHT, data$SEX, mean) 0 1 10.577500 9.683333 > table(data$COLOR, data$SEX) 0 1 1 1 3 2 1 2 3 2 1 > 100*prop.table(table(data$COLOR, data$SEX)) 0 1 1 10 30 2 10 20 3 20 10 > tapply(data$WEIGHT, list(data$SEX, data$COLOR), mean) 1 2 3 0 10.680000 10.18 10.725 1 9.073333 10.59 9.700 > # \section{Plotting...} > > hist(data$WEIGHT, breaks=3) > boxplot(data$LENGTH) > boxplot(data$LENGTH ~ data$SEX) > qqnorm(data$WEIGHT); qqline(data$WEIGHT) > plot(data$LENGTH, data$WEIGHT, type="p") > plot(data$LENGTH, data$WEIGHT, type="p", cex=0.5) > plot(data$LENGTH, data$WEIGHT, type="p", cex=2) > pdf(file="pics/47690.pdf"); oldpar <- par(mar=c(2, 2, 0, 1)) > Ex.points() # shipunov > par(oldpar); dev.off() pdf 2 > Ex.lines() # shipunov [1] 0 1 2 3 4 5 6 > Ex.cols() # shipunov > Ex.fonts() # shipunov > Ex.types() # shipunov > plot(data$LENGTH, data$WEIGHT, type="p", pch=2) > plot(data$LENGTH, data$WEIGHT, type="n") > text(data$LENGTH, data$WEIGHT, labels=data$SEX) > plot(data$LENGTH, data$WEIGHT, 
pch=as.character(data$SEX)) > plot(data$LENGTH, data$WEIGHT, type="n") > text(data$LENGTH, data$WEIGHT, labels=data$SEX, col=data$SEX+1) > plot(data$LENGTH, data$WEIGHT, type="n") > points(data$LENGTH, data$WEIGHT, pch=data$SEX) > pdf(file="pics/52050.pdf"); oldpar <- par(mar=c(4, 4, 1, 1)) > plot(data$LENGTH^3, data$WEIGHT, type="n", + xlab=expression("Volume (cm"^3*")"), ylab="Weight") > text(data$LENGTH^3, data$WEIGHT, + labels=ifelse(data$SEX, "\\MA", "\\VE"), + vfont=c("serif","plain"), cex=1.5) > par(oldpar); dev.off() pdf 2 > plot(data$LENGTH, data$WEIGHT, type="n") > points(data$LENGTH, data$WEIGHT, pch=data$SEX*3, col=data$SEX+1) > legend("bottomright", legend=c("male", "female"), + pch=c(0, 3), col=1:2) > dev.copy(pdf, "graph.pdf") pdf 5 > dev.off() pdf 2 > pdf("graph.pdf") > plot(data$LENGTH, data$WEIGHT, type="n") > points(data$LENGTH, data$WEIGHT, pch=data$SEX*3, col=data$SEX+1) > legend("bottomright", legend=c("male", "female"), + pch=c(0, 3), col=1:2) > dev.off() pdf 2 > # \section{Testing...} > > t.test(data$WEIGHT, data$LENGTH, paired=TRUE) Paired t-test data: data$WEIGHT and data$LENGTH t = 1.529, df = 9, p-value = 0.1606 alternative hypothesis: true difference in means is not equal to 0 95 percent confidence interval: -0.2201116 1.1381116 sample estimates: mean of the differences 0.459 > t.test(data$WEIGHT, data$LENGTH, paired=FALSE) Welch Two Sample t-test data: data$WEIGHT and data$LENGTH t = 1.2169, df = 15.62, p-value = 0.2417 alternative hypothesis: true difference in means is not equal to 0 95 percent confidence interval: -0.3421944 1.2601944 sample estimates: mean of x mean of y 10.041 9.582 > t.test(data$WEIGHT ~ data$SEX) Welch Two Sample t-test data: data$WEIGHT by data$SEX t = 1.7277, df = 7.2425, p-value = 0.1262 alternative hypothesis: true difference in means is not equal to 0 95 percent confidence interval: -0.3213711 2.1097044 sample estimates: mean in group 0 mean in group 1 10.577500 9.683333 > data[, c("WEIGHT", "SEX")] WEIGHT 
SEX 1 10.68 0 2 10.02 1 3 10.18 0 4 8.01 1 5 10.23 0 6 9.70 1 7 9.73 1 8 11.22 0 9 9.19 1 10 11.45 1 > data3 <- unstack(data[, c("WEIGHT", "SEX")]) > t.test(data3[[1]], data3[[2]]) Welch Two Sample t-test data: data3[[1]] and data3[[2]] t = 1.7277, df = 7.2425, p-value = 0.1262 alternative hypothesis: true difference in means is not equal to 0 95 percent confidence interval: -0.3213711 2.1097044 sample estimates: mean of x mean of y 10.577500 9.683333 > wilcox.test(data$WEIGHT, data$LENGTH, paired=TRUE) Warning in wilcox.test.default(data$WEIGHT, data$LENGTH, paired = TRUE) : cannot compute exact p-value with ties Wilcoxon signed rank test with continuity correction data: data$WEIGHT and data$LENGTH V = 40.5, p-value = 0.202 alternative hypothesis: true location shift is not equal to 0 > wilcox.test(data$WEIGHT ~ data$SEX) Wilcoxon rank sum exact test data: data$WEIGHT by data$SEX W = 20, p-value = 0.1143 alternative hypothesis: true location shift is not equal to 0 > pairwise.t.test(data$WEIGHT, data$COLOR, p.adj="bonferroni") Pairwise comparisons using t tests with pooled SD data: data$WEIGHT and data$COLOR 1 2 2 0.7 - 3 0.8 1.0 P value adjustment method: bonferroni > kruskal.test(data$WEIGHT ~ data$COLOR) Kruskal-Wallis rank sum test data: data$WEIGHT by data$COLOR Kruskal-Wallis chi-squared = 1.6545, df = 2, p-value = 0.4372 > pairwise.wilcox.test(data$WEIGHT, data$COLOR) Pairwise comparisons using Wilcoxon rank sum exact test data: data$WEIGHT and data$COLOR 1 2 2 1 - 3 1 1 P value adjustment method: holm > chisq.test(data$COLOR, data$SEX) Warning in chisq.test(data$COLOR, data$SEX) : Chi-squared approximation may be incorrect Pearson's Chi-squared test data: data$COLOR and data$SEX X-squared = 1.3194, df = 2, p-value = 0.517 > prop.test(sum(data$SEX), length(data$SEX), 0.5) 1-sample proportions test with continuity correction data: sum(data$SEX) out of length(data$SEX), null probability 0.5 X-squared = 0.1, df = 1, p-value = 0.7518 alternative hypothesis: 
true p is not equal to 0.5 95 percent confidence interval: 0.2736697 0.8630694 sample estimates: p 0.6 > cor.test(data$WEIGHT, data$LENGTH, method="pearson") Pearson's product-moment correlation data: data$WEIGHT and data$LENGTH t = 1.2276, df = 8, p-value = 0.2545 alternative hypothesis: true correlation is not equal to 0 95 percent confidence interval: -0.3089373 0.8217631 sample estimates: cor 0.3981316 > cor.test(data$WEIGHT, data$LENGTH, method="spearman") Spearman's rank correlation rho data: data$WEIGHT and data$LENGTH S = 104, p-value = 0.2956 alternative hypothesis: true rho is not equal to 0 sample estimates: rho 0.369697 > summary(lm(data$LENGTH ~ data$SEX)) Call: lm(formula = data$LENGTH ~ data$SEX) Residuals: Min 1Q Median 3Q Max -0.6583 -0.5369 -0.1825 0.5542 1.0317 Coefficients: Estimate Std. Error t value Pr(>|t|) (Intercept) 9.5125 0.3478 27.349 3.44e-09 *** data$SEX 0.1158 0.4490 0.258 0.803 --- Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 Residual standard error: 0.6956 on 8 degrees of freedom Multiple R-squared: 0.00825, Adjusted R-squared: -0.1157 F-statistic: 0.06655 on 1 and 8 DF, p-value: 0.8029 > aov(lm(data$LENGTH ~ data$SEX)) Call: aov(formula = lm(data$LENGTH ~ data$SEX)) Terms: data$SEX Residuals Sum of Squares 0.032202 3.871158 Deg. 
of Freedom 1 8 Residual standard error: 0.6956255 Estimated effects may be unbalanced > # \section{Finishing...} > > # \section{Answers to exercises} > > # \chapter{Ten Years Later, or use \R script} > > pdf(file="pics/gridmoon.pdf", width=12); oldpar <- par(mar=c(0, 0, 0, 0)) > Gridmoon() # shipunov > par(oldpar); dev.off() pdf 2 > # \section{How to make your \R script} > > # \section{My \R script does not work!} > > plot(log(trees$Volume), 1:nrow(trees)) > log(trees$Volume) [1] 2.332144 2.332144 2.322388 2.797281 2.933857 2.980619 2.747271 2.901422 [9] 3.117950 2.990720 3.186353 3.044522 3.063391 3.058707 2.949688 3.100092 [17] 3.520461 3.310543 3.246491 3.214868 3.540959 3.456317 3.591818 3.645450 [25] 3.751854 4.014580 4.019980 4.065602 3.941582 3.931826 4.343805 > trees$Volume [1] 10.3 10.3 10.2 16.4 18.8 19.7 15.6 18.2 22.6 19.9 24.2 21.0 21.4 21.3 19.1 [16] 22.2 33.8 27.4 25.7 24.9 34.5 31.7 36.3 38.3 42.6 55.4 55.7 58.3 51.5 51.0 [31] 77.0 > trees Girth Height Volume 1 8.3 70 10.3 2 8.6 65 10.3 3 8.8 63 10.2 4 10.5 72 16.4 5 10.7 81 18.8 6 10.8 83 19.7 7 11.0 66 15.6 8 11.0 75 18.2 9 11.1 80 22.6 10 11.2 75 19.9 11 11.3 79 24.2 12 11.4 76 21.0 13 11.4 76 21.4 14 11.7 69 21.3 15 12.0 75 19.1 16 12.9 74 22.2 17 12.9 85 33.8 18 13.3 86 27.4 19 13.7 71 25.7 20 13.8 64 24.9 21 14.0 78 34.5 22 14.2 80 31.7 23 14.5 74 36.3 24 16.0 72 38.3 25 16.3 77 42.6 26 17.3 81 55.4 27 17.5 82 55.7 28 17.9 80 58.3 29 18.0 80 51.5 30 18.0 80 51.0 31 20.6 87 77.0 > 1:nrow(trees) [1] 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 [26] 26 27 28 29 30 31 > nrow(trees) [1] 31 > if(0) { + + ## print here anything _syntactically correct_ + + } > abc <- function(x) + { + for (i in 1:10) x <- x+1 + x + } > abc(5) [1] 15 > abc <- function(x) + { + for (i in 1:10) { x <- x+1; print(x) } + x + } > abc(5) [1] 6 [1] 7 [1] 8 [1] 9 [1] 10 [1] 11 [1] 12 [1] 13 [1] 14 [1] 15 [1] 15 > # \section{Common pitfalls in \R scripting} > > # \subsection{Advices} > > trees[, c("Height", 
"Volume")] Height Volume 1 70 10.3 2 65 10.3 3 63 10.2 4 72 16.4 5 81 18.8 6 83 19.7 7 66 15.6 8 75 18.2 9 80 22.6 10 75 19.9 11 79 24.2 12 76 21.0 13 76 21.4 14 69 21.3 15 75 19.1 16 74 22.2 17 85 33.8 18 86 27.4 19 71 25.7 20 64 24.9 21 78 34.5 22 80 31.7 23 74 36.3 24 72 38.3 25 77 42.6 26 81 55.4 27 82 55.7 28 80 58.3 29 80 51.5 30 80 51.0 31 87 77.0 > trees[, trees %-% c("Height", "Volume")] # shipunov [1] 8.3 8.6 8.8 10.5 10.7 10.8 11.0 11.0 11.1 11.2 11.3 11.4 11.4 11.7 12.0 [16] 12.9 12.9 13.3 13.7 13.8 14.0 14.2 14.5 16.0 16.3 17.3 17.5 17.9 18.0 18.0 [31] 20.6 > Pull(trees, Girth < 11) # shipunov Girth Height Volume 1 8.3 70 10.3 2 8.6 65 10.3 3 8.8 63 10.2 4 10.5 72 16.4 5 10.7 81 18.8 6 10.8 83 19.7 > # \subsection{The Case-book of Advanced \R user} > > mode(trees) [1] "list" > trees2 <- trees[, 2:3] > mode(trees2) [1] "list" > trees1 <- trees2[, 2] > mode(trees1) [1] "numeric" > trees.new <- trees[trees[, 1] < 0, ] > str(trees.new) 'data.frame': 0 obs. of 3 variables: $ Girth : num $ Height: num $ Volume: num > 1:nrow(trees.new) [1] 1 0 > seq_len(nrow(trees.new)) integer(0) > aa <- c(1, NA, 3) > aa[aa != 1] # bad idea [1] NA 3 > aa[aa != 1 & !is.na(aa)] # good idea [1] 3 > trees[2, 2] <- NA > trees[trees$Height == 80, "Girth"] # bad idea [1] NA 11.1 14.2 17.9 18.0 18.0 > trees[which(trees$Height == 80), "Girth"] # good idea [1] 11.1 14.2 17.9 18.0 18.0 > data(trees) # restores changed data > prop.test(3, 23) 1-sample proportions test with continuity correction data: 3 out of 23, null probability 0.5 X-squared = 11.13, df = 1, p-value = 0.0008492 alternative hypothesis: true p is not equal to 0.5 95 percent confidence interval: 0.03430797 0.34666883 sample estimates: p 0.1304348 > pval <- prop.test(3, 23)$pvalue > pval NULL > pval <- prop.test(3, 23)$p.value # correct identity! 
> pval [1] 0.0008492268 > aa <- sqrt(2) > aa * aa == 2 [1] FALSE > aa * aa - 2 [1] 4.440892e-16 > all.equal(aa * aa, 2) [1] TRUE > all.equal(aa * aa - 2, 0) [1] TRUE > Sum <- function(a, b) a + b > a <- 3 > Sum(a=5, b=4) [1] 9 > a [1] 3 > Sum(a <- 5, b=4) [1] 9 > a [1] 5 > c(factor(LETTERS[1:3]), factor(letters[1:3])) [1] 1 2 3 1 2 3 > c.factor <- function(..., recursive=TRUE) + unlist(list(...), recursive=recursive) > c(factor(LETTERS[1:3]), factor(letters[1:3])) [1] A B C a b c Levels: A B C a b c > ifelse("a" != "b", 1:10, c(0, "A", "B")) # the first surprise [1] 1 > ifelse(c(TRUE, FALSE, TRUE), 1, 10) # the second surprise [1] 1 10 1 > # \subsection{Good, Bad, and Not-too-bad} > > read.table("http://ashipunov.me/shipunov/open/wilting.txt", + h=TRUE, sep="\t") SPECIES TIME 1 Alnus glutinosa 165 2 Alnus glutinosa 225 3 Alnus glutinosa 265 4 Alnus glutinosa 300 5 Alnus glutinosa 325 6 Alnus incana 70 7 Alnus incana 110 8 Alnus incana 130 9 Alnus incana 195 10 Alnus incana 195 11 Alnus incana 210 12 Alnus incana 210 13 Alnus incana 290 14 Alnus incana 450 15 Alnus incana 470 16 Betula pendula 160 17 Betula pendula 270 18 Betula pendula 470 19 Betula pendula 510 20 Betula pendula 580 21 Betula pendula 610 22 Padus avium 120 23 Padus avium 160 24 Padus avium 185 25 Padus avium 200 26 Padus avium 220 27 Padus avium 225 28 Padus avium 255 29 Padus avium 270 30 Padus avium 280 31 Salix acutifolia 70 32 Salix acutifolia 170 33 Salix acutifolia 185 34 Salix acutifolia 195 35 Salix acutifolia 225 36 Salix acutifolia 270 37 Salix acutifolia 300 38 Salix acutifolia 325 39 Salix acutifolia 360 40 Salix acutifolia 500 41 Salix caprea 85 42 Salix caprea 145 43 Salix caprea 165 44 Salix caprea 200 45 Salix caprea 245 46 Salix caprea 256 47 Salix caprea 265 48 Salix caprea 325 49 Salix caprea 345 50 Salix caprea 360 51 Salix caprea 365 52 Salix caprea 485 53 Salix caprea 595 > # \section{Answers to exercises} > > pdf(file="pics/use_r_script.pdf", width=12); oldpar <- par(mar=c(0, 
0, 0, 0)) > Gridmoon(Nightsky=FALSE, Moon=FALSE, Stars=FALSE, # shipunov + Hillcol="forestgreen", Text="Use R script!", Textcol="yellow", + Textpos=c(0.35, 0.85), Textsize=96) > par(oldpar); dev.off() pdf 2 > # \chapter{\R fragments} > > # \section{\R and databases} > > head(eq_l) # locations N.POP WHERE SPECIES 1 1 Tver arvense 2 2 Tver arvense 3 3 Tver arvense 4 4 Tver arvense 5 5 Tver pratense 6 6 Tver palustre > head(eq_s) # measurements N.POP DL.R DIA.ST N.REB N.ZUB DL.OSN.Z DL.TR.V DL.BAZ DL.PER 1 1 424 2.3 13 12 2.0 5 3.0 25 2 1 339 2.0 11 12 1.0 4 2.5 13 3 1 321 2.5 15 14 2.0 5 2.3 13 4 1 509 3.0 14 14 1.5 5 2.2 23 5 1 462 2.5 12 13 1.1 4 2.1 12 6 1 350 1.8 9 9 1.1 4 2.0 15 > SP.NEW <- Recode(eq_s$N.POP, eq_l$N.POP, eq_l$SPECIES) > eq8 <- cbind(SPECIES=SP.NEW, eq_s[, -1]) > head(eq8) SPECIES DL.R DIA.ST N.REB N.ZUB DL.OSN.Z DL.TR.V DL.BAZ DL.PER 1 arvense 424 2.3 13 12 2.0 5 3.0 25 2 arvense 339 2.0 11 12 1.0 4 2.5 13 3 arvense 321 2.5 15 14 2.0 5 2.3 13 4 arvense 509 3.0 14 14 1.5 5 2.2 23 5 arvense 462 2.5 12 13 1.1 4 2.1 12 6 arvense 350 1.8 9 9 1.1 4 2.0 15 > mm <- c("Plantago major", "Littorella uniflora") > do.call(rbind, strsplit(mm, split=" ")) # exactly one space [,1] [,2] [1,] "Plantago" "major" [2,] "Littorella" "uniflora" > read.table(textConnection(mm), sep=" ") V1 V2 1 Plantago major 2 Littorella uniflora > nn <- c("a b", "c d\ne f") > read.table(textConnection(nn), sep=" ") V1 V2 1 a b 2 c d 3 e f > m.o [1] L S XL XXL S M L Levels: S < M < L < XL < XXL > model.matrix( ~ m.o - 1, data=data.frame(m.o)) m.oS m.oM m.oL m.oXL m.oXXL 1 0 0 1 0 0 2 1 0 0 0 0 3 0 0 0 1 0 4 0 0 0 0 1 5 1 0 0 0 0 6 0 1 0 0 0 7 0 0 1 0 0 attr(,"assign") [1] 1 1 1 1 1 attr(,"contrasts") attr(,"contrasts")$m.o [1] "contr.poly" > Tobin(m.o, convert.names=FALSE) # shipunov S M L XL XXL [1,] 0 0 1 0 0 [2,] 1 0 0 0 0 [3,] 0 0 0 1 0 [4,] 0 0 0 0 1 [5,] 1 0 0 0 0 [6,] 0 1 0 0 0 [7,] 0 0 1 0 0 > # \section{\R and time} > > dates.df <- 
data.frame(dates=c("2011-01-01","2011-01-02", + "2011-01-03","2011-01-04","2011-01-05")) > str(dates.df$dates) chr [1:5] "2011-01-01" "2011-01-02" "2011-01-03" "2011-01-04" "2011-01-05" > dates.1 <- as.Date(dates.df$dates, "%Y-%m-%d") > str(dates.1) Date[1:5], format: "2011-01-01" "2011-01-02" "2011-01-03" "2011-01-04" "2011-01-05" > d <- c(20130708, 19990203, 17650101) > as.Date(as.character(d), "%Y%m%d") [1] "2013-07-08" "1999-02-03" "1765-01-01" > ts(1:10, # sequence + frequency = 4, # by quartile + start = c(1959, 2)) # start in the second quartile 1959 Qtr1 Qtr2 Qtr3 Qtr4 1959 1 2 3 1960 4 5 6 7 1961 8 9 10 > z <- ts(matrix(rnorm(30), 10, 3), + start=c(1961, 1), # start in January 1961 + frequency=12) # by months > class(z) [1] "mts" "ts" "matrix" > pdf(file="pics/32340.pdf"); oldpar <- par(mar=c(2, 2, 0, 1)) > plot(z, + plot.type="single", # place all series on one plot + lty=1:3) > par(oldpar); dev.off() pdf 2 > leaf <- read.table("data/sundew.txt", h=TRUE, sep=";") > str(leaf) 'data.frame': 80 obs. of 2 variables: $ WET : int 2 1 1 2 1 1 1 1 1 1 ... $ SHAPE: int 1 1 1 2 2 2 2 2 2 2 ... > summary(leaf) WET SHAPE Min. :1.000 Min. :1.0 1st Qu.:1.000 1st Qu.:1.0 Median :1.000 Median :2.0 Mean :1.325 Mean :1.7 3rd Qu.:2.000 3rd Qu.:2.0 Max. :2.000 Max. :2.0 > shape <- ts(leaf$SHAPE, frequency=36) > str(shape) Time-Series [1:80] from 1 to 3.19: 1 1 1 2 2 2 2 2 2 2 ... 
> pdf(file="pics/33160.pdf"); oldpar <- par(mar=c(4, 4, 4, 1)) > (acf(shape, main=expression(italic("Drosera")*" leaf"))) Autocorrelations of series ‘shape’, by lag 0.0000 0.0278 0.0556 0.0833 0.1111 0.1389 0.1667 0.1944 0.2222 0.2500 0.2778 1.000 0.614 0.287 0.079 0.074 0.068 -0.038 -0.085 -0.132 -0.137 -0.024 0.3056 0.3333 0.3611 0.3889 0.4167 0.4444 0.4722 0.5000 0.5278 0.030 0.025 -0.082 -0.087 0.027 0.021 -0.043 -0.090 -0.137 > par(oldpar); dev.off() pdf 2 > pdf(file="pics/33310_to_crop.pdf") > plot(stl(shape, s.window="periodic")$time.series, main="") > dev.off() pdf 2 > # \section{\R and bootstrap} > > library(boot) Attaching package: ‘boot’ The following object is masked from ‘package:robustbase’: salinity The following object is masked from ‘package:lattice’: melanoma > ## Statistic to be bootstrapped: > > ave.b <- function (data, indices) + { + d <- data[indices] + return(mean(d)) + } > (result.b <- boot(data=trees$Height, statistic=ave.b, R=100)) ORDINARY NONPARAMETRIC BOOTSTRAP Call: boot(data = trees$Height, statistic = ave.b, R = 100) Bootstrap Statistics : original bias std. 
error t1* 76 0.0983871 1.069863 > pdf(file="pics/06074.pdf"); oldpar <- par(mar=c(4, 4, 1, 1)) > plot(result.b) > par(oldpar); dev.off() pdf 2 > boot.ci(result.b, type="bca") BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS Based on 100 bootstrap replicates CALL : boot.ci(boot.out = result.b, type = "bca") Intervals : Level BCa 95% (73.91, 77.76 ) Calculations and Intervals on Original Scale Some BCa intervals may be unstable > spur <- scan("data/spur.txt") Read 1511 items > library(bootstrap) Attaching package: ‘bootstrap’ The following object is masked from ‘package:mclust’: diabetes > result.b2 <- bootstrap(x=spur, 100, function(x) mean(x)) > ## Median of bootstrapped values: > > median(result.b2$thetastar) [1] 7.519159 > ## Confidence interval: > > quantile(result.b2$thetastar, probs=c(0.025, 0.975)) 2.5% 97.5% 7.444077 7.610356 > result.j <- jackknife(x=spur, function(x) mean(x)) > ## Median of jackknifed values: > > median(result.j$jack.values) [1] 7.516424 > ## Standard error: > > result.j$jack.se [1] 0.04258603 > ## Confidence interval: > > quantile(result.j$jack.values, probs=c(0.025, 0.975)) 2.5% 97.5% 7.513775 7.517748 > boot <- 100 > tt <- matrix(ncol=2, nrow=boot) > for (n in 1:boot) + + { + + spur.sample <- sample(spur, length(spur), replace=TRUE) + + tt[n, 1] <- mean(spur.sample) + + tt[n, 2] <- sd(spur.sample) + + } > (result <- data.frame(spur.mean=mean(spur), spur.sd=sd(spur), + boot.mean=mean(tt[, 1]), boot.sd=mean(tt[, 2]))) spur.mean spur.sd boot.mean boot.sd 1 7.516082 1.655386 7.517484 1.663059 > dact <- scan("data/dact.txt") Read 48 items > quantile(dact, c(0.025, 0.5, 0.975)) 2.5% 50% 97.5% 3.700 33.500 104.825 > apply(replicate(100, quantile(sample(dact, length(dact), + replace=TRUE), c(0.025, 0.5, 0.975))), 1, mean) 2.5% 50% 97.5% 4.9700 37.6600 102.0667 > sleep.K.rep <- replicate(100, K(extra ~ group, + data=sleep[sample(1:nrow(sleep), replace=TRUE), ])) > quantile(sleep.K.rep, c(0.025, 0.975)) 2.5% 97.5% 0.00304031 1.83676444 > library(coin) 
Loading required package: survival Attaching package: ‘survival’ The following object is masked from ‘package:boot’: aml The following object is masked from ‘package:robustbase’: heart Attaching package: ‘coin’ The following objects are masked from ‘package:arules’: size, support The following object is masked from ‘package:wavethresh’: support The following object is masked from ‘package:kernlab’: size > grades$V2 <- as.factor(grades$V2) > wilcox_test(V1 ~ V2, data=subset(grades, V2 %in% c("A1", "B1")), + conf.int=TRUE) Asymptotic Wilcoxon-Mann-Whitney Test data: V1 by V2 (A1, B1) Z = -1.9938, p-value = 0.04618 alternative hypothesis: true mu is not equal to 0 95 percent confidence interval: -1.000000e+00 -4.297215e-09 sample estimates: difference in location -5.132013e-09 > # \section{\R and shape} > > library(geomorph) Loading required package: RRPP Loading required package: rgl Attaching package: ‘rgl’ The following object is masked from ‘package:plotrix’: mtext3d > TangentSpace2 <- function(A) + { + x <- two.d.array(A) + pc.res <- prcomp(x) + pcdata <- pc.res$x + list(array=x, pc.summary=summary(pc.res), pc.scores=pcdata) + } > am <- read.table("data/bigaln.txt", sep=";", head=TRUE) > ag <- readland.tps("data/bigaln.tps", specID="imageID") No curves detected; all points appear to be fixed landmarks. 
Warning: not all specimens have scale adjustment (perhaps because they are already scaled); no rescaling will be performed in these cases > gpa.ag <- gpagen(ag) | | | 0% | |============== | 20% | |============================ | 40% | |========================================== | 60% | |======================================================================| 100% > ta.ag <- TangentSpace2(gpa.ag$coords) > screeplot(ta.ag$pc.summary) # importance of principal components > pca.ag <- ta.ag$pc.summary$x > pdf(file="pics/10668.pdf"); oldpar <- par(mar=c(4, 4, 0, 1)) > pca.ag.ids <- as.numeric(gsub(".png", "", row.names(pca.ag))) > branch <- cut(am$P.1, 3, labels=c("lower", "middle", "top")) > b.code <- as.numeric(Recode(pca.ag.ids, am$PIC, branch, + char=FALSE)) # shipunov > plot(pca.ag[, 1:2], xlab="PC1", ylab="PC2", pch=19, col=b.code) > legend("topright", legend=paste(levels(branch), "branches"), + pch=19, col=1:3) > par(oldpar); dev.off() pdf 2 > c.lower <- mshape(gpa.ag$coords[, , b.code == 1]) > c.top <- mshape(gpa.ag$coords[, , b.code == 3]) > all.mean <- mshape(gpa.ag$coords) > ag.links <- matrix(c(1, rep(c(2:7, 12:8), each=2), 1), + ncol=2, byrow=TRUE) > pdf(file="pics/10667_to_crop.pdf") > old.par <- par(mfrow=c(1, 2)) > GP <- gridPar(grid.col="grey", + tar.link.col="darkseagreen", tar.pt.bg=0) > plotRefToTarget(c.lower, all.mean, links=ag.links, gridPars=GP) > title(main="lower branches", line=-5, cex=0.8) > plotRefToTarget(c.top, all.mean, links=ag.links, gridPars=GP) > title(main="top branches", line=-5, cex=0.8) > par(old.par) > dev.off() pdf 2 > # \section{\R and Bayes} > > chickwts <- chickwts[chickwts$feed %in% c("horsebean", "linseed"), ] > chickwts$feed <- factor(chickwts$feed) > plot(weight ~ feed, data=chickwts, main="Chick weights") > t.test(weight ~ feed, data=chickwts, var.eq=TRUE) Two Sample t-test data: weight by feed t = -2.934, df = 20, p-value = 0.008205 alternative hypothesis: true difference in means is not equal to 0 95 percent confidence 
interval: -100.17618 -16.92382 sample estimates: mean in group horsebean mean in group linseed 160.20 218.75 > library(BayesFactor) Loading required package: coda Attaching package: ‘coda’ The following object is masked from ‘package:kernlab’: nvar ************ Welcome to BayesFactor 0.9.12-4.2. If you have questions, please contact Richard Morey (richarddmorey@gmail.com). Type BFManual() to open the manual. ************ Attaching package: ‘BayesFactor’ The following object is masked from ‘package:modeltools’: posterior > bf <- ttestBF(formula = weight ~ feed, data=chickwts) > bf Bayes factor analysis -------------- [1] Alt., r=0.707 : 5.975741 ±0% Against denominator: Null, mu1-mu2 = 0 --- Bayes factor type: BFindepSample, JZS > # \section{\R, DNA and evolution} > > # \section{\R and reporting} > > # \section{\R without graphics} > > library(txtplot) > with(mtcars, txtplot(cyl, mpg)) 35 +--+------------+-------------+-------------+------------+--+ | * | | | 30 + * + | * | 25 + * + | * | | * * | 20 + * + | * * | | * | 15 + * + | * | | * | 10 +--+------------+-------------+-------------+------------+--+ 4 5 6 7 8 > # \section{Answers to exercises} > > wet <- ts(leaf$WET, frequency=36) > str(wet) Time-Series [1:80] from 1 to 3.19: 2 1 1 2 1 1 1 1 1 1 ... > acf(wet) > plot(stl(wet, s.window="periodic")$time.series) > cu <- read.table( + "http://ashipunov.me/shipunov/open/cuscuta.txt", + h=TRUE, sep="\t") > str(cu) 'data.frame': 96 obs. of 3 variables: $ HOST : chr "Alchemilla" "Alchemilla" "Alchemilla" "Alchemilla" ... $ DEGREE: int 3 2 2 2 1 2 1 0 0 1 ... $ WHERE : int 3 3 3 3 3 1 1 0 0 1 ... 
> cu2 <- cu[cu$HOST %in% c("Alchemilla","Knautia"), ] > cu2$HOST <- factor(cu2$HOST) > cu2.s <- split(cu2$DEGREE, cu2$HOST) > boxplot(cu2.s) > sapply(cu2.s, median) Alchemilla Knautia 2 0 > cliff.delta(cu2.s$Alchemilla, cu2.s$Knautia) Cliff's Delta delta estimate: 0.5185185 (large) 95 percent confidence interval: lower upper -0.1043896 0.8492326 > wilcox.test(cu2.s$Alchemilla, cu2.s$Knautia) Warning in wilcox.test.default(cu2.s$Alchemilla, cu2.s$Knautia) : cannot compute exact p-value with ties Wilcoxon rank sum test with continuity correction data: cu2.s$Alchemilla and cu2.s$Knautia W = 41, p-value = 0.09256 alternative hypothesis: true location shift is not equal to 0 > pdf(file="pics/75380.pdf"); oldpar <- par(mar=c(2, 4, 2, 1)) > ansari.test(cu2.s$Alchemilla, cu2.s$Knautia) Warning in ansari.test.default(cu2.s$Alchemilla, cu2.s$Knautia) : cannot compute exact p-value with ties Ansari-Bradley test data: cu2.s$Alchemilla and cu2.s$Knautia AB = 40, p-value = 0.7127 alternative hypothesis: true ratio of scales is not equal to 1 > library(beeswarm) > la <- layout(matrix(c(1, 3, 2, 3), ncol=2, byrow=TRUE)) > for (i in 1:2) hist(cu2.s[[i]], main=names(cu2.s)[i], + xlab="", xlim=range(cu2.s)) > bxplot(cu2.s) ; beeswarm(cu2.s, cex=1.2, add=TRUE) > par(oldpar); dev.off() pdf 2 > Rro.test(cu2.s$Alchemilla, cu2.s$Knautia) # shipunov z p.value 1.9913639 0.0464409 > library(boot) > meddif.b <- function (data, ind) { d <- data[ind]; + median(d[cu2$HOST == "Alchemilla"]) - median( + d[cu2$HOST == "Knautia"]) } > meddif.boot <- boot(data=cu2$DEGREE, statistic=meddif.b, + strata=cu2$HOST, R=999) > boot.ci(meddif.boot, type="bca") BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS Based on 999 bootstrap replicates CALL : boot.ci(boot.out = meddif.boot, type = "bca") Intervals : Level BCa 95% ( 0, 2 ) Calculations and Intervals on Original Scale Some BCa intervals may be unstable > library(coin) > cu$HOST <- as.factor(cu$HOST) > kruskal_test(DEGREE ~ HOST, data=cu, 
distribution=approximate()) Approximative Kruskal-Wallis Test data: DEGREE by HOST (Alchemilla, Briza, Carex flava, Cirsium, Dactylis, Equisetum, Erodium, Galium, Hieracium, Hypericum, Knautia, Leontodon, Phleum, Plantago, Potentilla, Ranunculus acris, Stellaria, Trifolium, Vicia cracca) chi-squared = 24.063, p-value = 0.1175 > pairwise.Rro.test(cu$DEGREE, cu$HOST) # shipunov Pairwise comparisons using Robust rank order test data: cu$DEGREE and cu$HOST Alchemilla Briza Carex flava Cirsium Dactylis Equisetum Briza 0.0066 - - - - - Carex flava 0.3817 0.9310 - - - - Cirsium 0.8392 0.1680 0.3164 - - - Dactylis 0.3668 0.6628 0.8759 0.4834 - - Equisetum 0.8629 0.7886 0.8629 0.7460 0.9310 - Erodium 0.7485 0.0022 0.7485 0.8282 0.7959 0.8986 Galium 0.9427 0.0615 0.5541 0.8675 0.6320 0.8629 Hieracium 0.8715 0.0615 0.6628 0.7859 0.6780 0.8986 Hypericum 0.7859 < 2e-16 0.2910 0.9276 0.0269 0.8078 Knautia 0.2647 0.8629 0.9427 0.3164 0.8769 0.8715 Leontodon 0.9500 0.0005 0.3089 0.8629 0.3089 0.8078 Phleum 0.0411 0.9500 0.9310 0.2900 0.7886 0.8501 Plantago 0.3164 0.8629 0.9310 0.6113 0.9310 0.9310 Potentilla 0.3164 0.8629 0.9310 0.6113 0.9310 0.9310 Ranunculus acris 0.8629 8.4e-08 0.2418 0.9310 0.2910 0.6628 Stellaria 0.1248 0.9276 1.0000 0.3629 0.8629 0.8715 Trifolium 0.9276 0.1071 0.6171 0.8078 0.6780 0.8715 Vicia cracca 0.8629 0.7386 0.8629 0.8078 0.9310 0.9427 Erodium Galium Hieracium Hypericum Knautia Leontodon Phleum Briza - - - - - - - Carex flava - - - - - - - Cirsium - - - - - - - Dactylis - - - - - - - Equisetum - - - - - - - Erodium - - - - - - - Galium 0.8715 - - - - - - Hieracium 0.8715 0.9310 - - - - - Hypericum < 2e-16 0.8078 0.4834 - - - - Knautia 0.6113 0.3951 0.4864 0.0269 - - - Leontodon 0.7077 0.9310 0.8629 0.7859 0.1605 - - Phleum 0.1383 0.1844 0.1844 < 2e-16 0.9049 0.0112 - Plantago 0.7739 0.6320 0.6320 < 2e-16 0.9310 0.2910 0.8715 Potentilla 0.7739 0.6320 0.6320 < 2e-16 0.9310 0.2910 0.8715 Ranunculus acris 0.7739 0.8629 0.8078 1.0000 0.1093 0.8629 0.0014 
Stellaria 0.3668 0.3164 0.3164 < 2e-16 0.9385 0.0615 0.9310 Trifolium 0.9090 0.9310 0.9530 0.7393 0.4834 0.8769 0.2514 Vicia cracca 0.8715 0.8629 0.8715 0.7959 0.8629 0.8078 0.8078 Plantago Potentilla Ranunculus acris Stellaria Trifolium Briza - - - - - Carex flava - - - - - Cirsium - - - - - Dactylis - - - - - Equisetum - - - - - Erodium - - - - - Galium - - - - - Hieracium - - - - - Hypericum - - - - - Knautia - - - - - Leontodon - - - - - Phleum - - - - - Plantago - - - - - Potentilla 1.0000 - - - - Ranunculus acris 0.3089 0.3089 - - - Stellaria 0.9310 0.9310 0.0416 - - Trifolium 0.6732 0.6732 0.8078 0.3668 - Vicia cracca 0.9310 0.9310 0.6780 0.8629 0.8715 P value adjustment method: BH > # \chapter{Most essential \R commands} > > # \chapter{The short \R glossary} > > pdf(file="pics/76680.pdf"); oldpar <- par(mar=c(2, 2, 0, 1)) > plot(density(rnorm(10^5)), main="", xlab="", ylab="") > par(oldpar); dev.off() pdf 2 > pdf(file="pics/10643_to_crop.pdf") > library(plotrix) > plot(c(-1, 1), c(-1, 1), type="n", xlab="", ylab="", axes=FALSE) > for(n in seq(0.1, 0.9, 0.1)) draw.circle(0, 0, n) > set.seed(11); x <- rnorm(100, sd=.28); y <- rnorm(100, sd=.28) > points(x, y, pch=19) > dev.off() pdf 2 > # \chapter{References}