4
R

R GNU

UNIXLINUXMacOS
WINDOWS R
http://www.r-project.org R
R 8
CRAN http://cran.r-project.org
5
R

Open SourceR
S R

R
R

6
R

R GNU

R
R R

1
7
R

2
SAS
SPSSSplus

3 R

R
4

R R
R
R

8
R

1 R 1
1.1 R 1
1.2 R 2
1.3 R 3
1.4 R 4
1.5 R 6
2 7
2.1 7
2.2 9
2.3 26
2.4 31
3 40
3.1 40
3.2 46
3.3 49
4 59
4.1 59
4.2 69
4.3 78
4.4 85
5 101
5.1 102
5.2 103
9
R
5.3 109
5.4 111
5.5 112
6 114
6.1 115
6.2 119
6.3 125
7 133
7.1 133
7.2 144
8 154
8.1 154
8.2 158
8.3 160
9 169
9.1 169
9.2 170
9.3 173
9.3 175
9.4 178
10 R 181
10.1 181
10.2 191
10.3 202
10.4 R 209
10.5 R 214
220
10
R

1
.1 R

R
Bell Laboratories Rick BeckerJohn Chambers
Allan Wilks S
S S-PLUS
R Ross Ihaka Robert Gentleman
R R R
R

1
2
3
4

6R
7R
8R

9R R
11
R

R
http://www.r-project.org R
CRANhttp://www.cran.r-project.org
http://www.lmbe.seu.edu.cn/CRAN/
R

1
.2 R

R R
R

R
25 R
R
http://www.lmbe.seu.edu.cn/CRAN/
R
R

R
Object R
SASSPSS Minitab


12
R

R R
R
R

1
.3 R

R
R
R R
1-1 R R ConsoleR

> R
13
R

1-
1 R
R
q R
R

1
.4 R

R R
mean

>help(mean)

>?mean()
R 1-2
14
R

1-
2 R

R
help.start( )
R R
manualsR Reference R
miscellaneous materialR An Introduction to R
Writing R Extensions The R Language Definition R Data
Import/ExportR Installation and Administration
R 1-3R
Packages
Search Engine & Keywords

15
R

1-
3 R


mean example(mean)
R
mean> x <- c(0:10, 50)
mean> xm <- mean(x)
mean> c(xm, mean(x, trim = 0.1))
[1] 8.75 5.50
mean> mean(USArrests, trim = 0.2)
Murder Assault UrbanPop Rape
7.42 167.60 66.20 20.16
16
R

1
.5 R

R windowsUNIXMacOS
windows98
R
R 2.3.0 2006 4 24
ISBN 3-900051-07-0
17
R

2
.1

2.1. 1
R object

structures R
R console ls()

> ls()
character(0)

> x<-c(1,2,3,4,5,6) #
> ls()
[1] "x"
x R
.
R Orange orange

R clength

R

18
R
2.2 .
2
R

1numeric
integers
decimal fractionsscientific notation
integers
doubleprecision
doubleprecision
2logical
TTRUE FFALSE
3character

MR
4complex
a+bi
5raw

6missing value

not availablemissing value
NA not
available NA NA
Table 2-1  Type test and conversion functions

Type        Test              Conversion
character   is.character()    as.character()
complex     is.complex()      as.complex()
double      is.double()       as.double()
integer     is.integer()      as.integer()
logical     is.logical()      as.logical()
NA          is.na()           as.na()
numeric     is.numeric()      as.numeric()
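A short sketch of how these test and conversion functions behave (the particular values below are only illustrative):
> is.numeric(1:3)          # an integer vector is numeric
[1] TRUE
> as.character(3.14)       # numeric -> character
[1] "3.14"
> as.numeric("3.14")       # character -> numeric
[1] 3.14
> as.logical(c(0, 1, 2))   # 0 becomes FALSE, non-zero becomes TRUE
[1] FALSE  TRUE  TRUE
> as.integer(2.9)          # conversion to integer truncates
[1] 2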

2
.2

R vector
matrixarraylistdata frames
factor

2. 2.1
R
vector

5 x 13579R

> x <-c(1,3,5,7,9)
c( ) c( )

R <-,->
= R
<-->
> c(1,3,5,7,9) ->y
>y
20
R
[1] 1 3 5 7 9
> z = c(1,3,5,7,9)
>z
[1] 1 3 5 7 9
assign
> assign("w",c(1,3,5,7,9))
>w
[1] 1 3 5 7 9
length 1
lengthmode

> length(x)
[1] 5
> mode(x)
[1] "numeric"
R

> t=1:10
>t
[1] 1 2 3 4 5 6 7 8 9 10
5:1
> r=5:1
>r
[1] 5 4 3 2 1

> 2*1:5
[1] 2 4 6 8 10
seq
21
R
seq(from, to, by)from to
by
> seq(1,10,2)
[1] 1 3 5 7 9
by by 1
> seq(1,10)
[1] 1 2 3 4 5 6 7 8 9 10
110 seq
> seq(10,1,-1)
[1] 10 9 8 7 6 5 4 3 2 1

seq(,by=,length=)
> seq(1,by=2,length=10)
[1] 1 3 5 7 9 11 13 15 17 19
rep( )
rep(x, times, ...)x
times
> rep(c(1,3),4)
[1] 1 3 1 3 1 3 1 3
c(1,3) 4

> rep(c(1,3),each=4)
[1] 1 1 1 1 3 3 3 3
rep( )
> rep(1:3,rep(2,3))
[1] 1 1 2 2 3 3

22
R

R
R mean()
median()var()sd()
> x=c(1,3,5,7,9)
> mean(x)
[1] 5
> median(x)
[1] 5
> var(x)
[1] 10
> sd(x)
[1] 3.162
sortrev
rankprod

> y=c(2,6,7,3,5)
> sort(y)
[1] 2 3 5 6 7
> rev(y)
[1] 5 3 7 6 2
> rank(y)
[1] 1 4 5 2 3
> prod(y)
[1] 1260
append
> append(y,10:15,after=3)
[1] 2 6 7 10 11 12 13 14 15 3 5
23
R
y 10 15
after
> append(y,10:15)
[1] 2 6 7 3 5 10 11 12 13 14 15
R
Table 2-2  Common vector functions

sum      sum of the elements
max      maximum
min      minimum
range    minimum and maximum
mean     arithmetic mean
median   median
var      variance
sd       standard deviation
sort     sort in increasing order
rev      reverse the elements
rank     ranks of the elements
append   insert elements into a vector
replace  replace elements of a vector
match    positions of first matches
pmatch   partial matching
all      are all elements TRUE?
any      is any element TRUE?
prod     product of the elements

2 . 2 .2
matrix

R matrix
( )
matrix(data = NA, nrow = 1, ncol = 1, byrow = FALSE,
dimnames = NULL)
data nrow
ncol byrow FALSE
Tdimnames

> matrix(c(1,2,3,4,5,6),nrow=2,ncol=3)
[,1] [,2] [,3]
[1,] 1 3 5
[2,] 2 4 6
2 3
> matrix(c(1,2,3,4,5,6),nrow=2,ncol=3,byrow=TRUE)
[,1] [,2] [,3]
[1,] 1 2 3
[2,] 4 5 6
2 3 dimnames

> matrix(c(1,2,3,4,5,6), nrow=2, ncol=3,
         dimnames=list(c("R1","R2"), c("C1","C2","C3")))
C1 C2 C3
R1 1 3 5
R2 2 4 6
25
R
diag()
> diag(1:4)
[,1] [,2] [,3] [,4]
[1,] 1 0 0 0
[2,] 0 2 0 0
[3,] 0 0 3 0
[4,] 0 0 0 4
diag()

> A=matrix(1:16,4,4)
>A
[,1] [,2] [,3] [,4]
[1,] 1 5 9 13
[2,] 2 6 10 14
[3,] 3 7 11 15
[4,] 4 8 12 16
> diag(A)
[1] 1 6 11 16
diag()
> diag(4) # 4
[,1] [,2] [,3] [,4]
[1,] 1 0 0 0
[2,] 0 1 0 0
[3,] 0 0 1 0
[4,] 0 0 0 1

A[i,j]
> A[2,] # 2
[1] 2 6 10 14
26
R
> A[2,2] # 2 2
[1] 6
> A[2,] # 2
[1] 2 6 10 14
> A[2:3,1:3] # 2,3 1,2,3
[,1] [,2] [,3]
[1,] 2 6 10
[2,] 3 7 11

> A+10
[,1] [,2] [,3] [,4]
[1,] 11 15 19 23
[2,] 12 16 20 24
[3,] 13 17 21 25
[4,] 14 18 22 26
> A*2
[,1] [,2] [,3] [,4]
[1,] 2 10 18 26
[2,] 4 12 20 28
[3,] 6 14 22 30
[4,] 8 16 24 32
> A+A
[,1] [,2] [,3] [,4]
[1,] 2 10 18 26
[2,] 4 12 20 28
[3,] 6 14 22 30
[4,] 8 16 24 32
27
R
R A*B
A%*%B
> A*A
[,1] [,2] [,3] [,4]
[1,] 1 25 81 169
[2,] 4 36 100 196
[3,] 9 49 121 225
[4,] 16 64 144 256
> A%*%A
[,1] [,2] [,3] [,4]
[1,] 90 202 314 426
[2,] 100 228 356 484
[3,] 110 254 398 542
[4,] 120 280 440 600
dim() nrow()
ncol()solve()

> dim(A)
[1] 4 4
> nrow(A)
[1] 4
> ncol(A)
[1] 4
> solve(A)
Error in solve.default(A) : Lapack routine dgesv: system is exactly singular
(A is singular, so it has no inverse)
> solve(matrix(rnorm(16),4,4)) # 44

[,1] [,2] [,3] [,4]


28
R
[1,] -2.6929 -0.154 -0.968 -0.8672
[2,] 0.2736 0.112 0.372 0.2527
[3,] -0.1234 -0.548 0.610 -0.5030
[4,] 0.0129 1.875 -0.974 0.0118
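Besides inverting a matrix, solve() can solve a linear system directly. A small sketch (the matrix B and vector b below are made up for illustration):
> B = matrix(c(2,1,1,3), 2, 2)   # a 2x2 invertible matrix
> b = c(1, 2)
> solve(B, b)                    # solves B %*% x == b
[1] 0.2 0.6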
Table 2-3  Common matrix functions

as.matrix()   convert to a matrix
is.matrix()   test whether an object is a matrix
diag()        diagonal elements / diagonal matrix
eigen()       eigenvalues and eigenvectors
solve()       matrix inverse; solve linear equations
chol()        Choleski decomposition
svd()         singular value decomposition
qr()          QR decomposition
det()         determinant
dim()         dimensions of a matrix
t()           transpose
apply()       apply a function over rows or columns

R
colSums()
colMeans() rowSums()rowMeans()

> colSums(A)
[1] 10 26 42 58
> colMeans(A)
[1] 2.5 6.5 10.5 14.5
apply()
apply(X, MARGIN, FUN, ...) X
29
R
MARGIN 1
2 FUN
( 2-2 )
> apply(A,2,sum) #
[1] 10 26 42 58
> apply(A,2,mean) #
[1] 2.5 6.5 10.5 14.5
colSums()colMeans()

> apply(A,2,var) #
[1] 1.667 1.667 1.667 1.667
> apply(A,2,sd) #
[1] 1.291 1.291 1.291 1.291
rbind()cbind()
rbind()cbind()
> (B=matrix(1:6,2,3))
[,1] [,2] [,3]
[1,] 1 3 5
[2,] 2 4 6
> (C=matrix(6:1,2,3))
[,1] [,2] [,3]
[1,] 6 4 2
[2,] 5 3 1
> rbind(B,C) #
[,1] [,2] [,3]
[1,] 1 3 5
[2,] 2 4 6
[3,] 6 4 2
[4,] 5 3 1
30
R
> cbind(B,C) #
[,1] [,2] [,3] [,4] [,5] [,6]
[1,] 1 3 5 6 4 2
[2,] 2 4 6 5 3 1

rbind()
cbind()

> x=c(1,3,5,7,9)
> y=c(2,4,6,8,10)
> cbind(x,y)
x y
[1,] 1 2
[2,] 3 4
[3,] 5 6
[4,] 7 8
[5,] 9 10
> rbind(x,y)
[,1] [,2] [,3] [,4] [,5]
x 1 3 5 7 9
y 2 4 6 8 10

2.2 .3
array

array()array(data
= NA, dim = length(data), dimnames = NULL)
data dim dimnames
31
R

> xx=array(1:24,c(3,4,2)) # (3,4,2) 3


> xx
,,1
[,1] [,2] [,3] [,4]
[1,] 1 4 7 10
[2,] 2 5 8 11
[3,] 3 6 9 12
,,2
[,1] [,2] [,3] [,4]
[1,] 13 16 19 22
[2,] 14 17 20 23
[3,] 15 18 21 24
xx

> xx[2,3,2]
[1] 20
> xx[2,1:3,2]
[1] 14 17 20
> xx[,2,]
[,1] [,2]
[1,] 4 16
[2,] 5 17
[3,] 6 18
dim()
> dim(xx)
[1] 3 4 2
32
R
dim()
> zz=c(2,5,6,8,1,4,6,9,10,7,3,5)
> dim(zz)=c(2,2,3)
> zz
,,1
[,1] [,2]
[1,] 2 6
[2,] 5 8
,,2
[,1] [,2]
[1,] 1 6
[2,] 4 9
,,3
[,1] [,2]
[1,] 10 3
[2,] 7 5
+ - */

2.2.4

list

components

A list is created with list(), written as
list(name1 = object1, name2 = object2, ...)
xyz
33
R

> x=c(1,1,2,2,3,3,3)
> y=c("","","","","","","")
> z=c(80,85,92,76,61,95,83)
> LST=list(class=x,sex=y,score=z)
> LST
$class
[1] 1 1 2 2 3 3 3
$sex
[1] "" "" "" "" "" "" ""
$score
[1] 80 85 92 76 61 95 83
LST[[1]],LST[[2]]

> LST[[3]]
[1] 80 85 92 76 61 95 83
LST[[2]][1:3]
> LST[[3]][2:5]
[1] 85 92 76 61
$

> LST$score
[1] 80 85 92 76 61 95 83

LST$sLST$score
> LST$s
[1] 80 85 92 76 61 95 83
[]
> LST$s[2:5]
34
R
[1] 85 92 76 61
lengthmodenames

> length(LST)
[1] 3
> mode(LST)
[1] "list"
> names(LST)
[1] "class" "sex" "score"

2 .2.5
data frame

matrix
list R

R data.frame ( )
data.framedata1data2
> student=data.frame(x,y,z)
> student
35
R
x y z
1 1 80
2 1 85
3 2 92
4 2 76
5 3 61
6 3 95
7 3 83

> student=data.frame(class=x,sex=y,score=z)
> student
class sex score
1 1 80
2 1 85
3 2 92
4 2 76
5 3 61
6 3 95
7 3 83

> row.names(student)=c(" x"," x"," x"," x"," x"," x","
x")
> student
class sex score
x 1 80
x 1 85
x 2 92
36
R
x 2 76
x 3 61
x 3 95
x 3 83

2 .2. 6
factor

A factor is created with factor(data, levels, labels, ...), where data is the data vector, levels the set of level values and labels optional labels for the levels; if levels and labels are omitted they are taken from the distinct values of data.

> y=c("","","","","","","")
> f=factor(y)
>f
[1]
Levels:
levels

> levels(f)
[1] "" ""

ordered

> score_f=c("B","C","D","B","A","D","A")
> score_o=ordered(score_f,levels=c("D","C","B","A"))
> score_o
[1] B C D B A D A
Levels: D < C < B < A
37
R

2
.3

R dataset R

R R Data Import/Export

2 .3. 1 c
c

> x=c(1,2,3,4)
>x
[1] 1 2 3 4
> y=c("a","b","c")
>y
[1] "a" "b" "c"

2.3.2 scan
scan c scan
scan
38
R
c

> x = scan()
1: 1 2 3 4 5 6
7:
Read 6 items
scan
dat.txt
> x=scan(file="dat.txt")
scan

> x=scan(file="dat.txt", sep=",")

2. 3.3
data.entry

> data.entry(xx=c(NA))
xx
2-1
> data.entry(z) z () 2-1
39
R

2-
1

edit
edit
fix
> fix(student)

2. 3.4
1.
R
R
read.table

student.txt

sex
40
R

2-
2
> s1=read.table("student.txt")
> s1
V1 V2 V3
1 class sex score
2 1 80
3 1 85
4 2 92
5 2 76
6 3 61
7 3 95
8 3 83
2
-3

> s2=read.table("student.txt",header=T)
41
R
> s2
class sex score
1 1 80
2 1 85
3 2 92
4 2 76
5 3 61
6 3 95
7 3 83
headerT

2.
R url
read.table
> address="http://www.the-data-mine.com/bin/view/Misc/WebHome/sample.txt"
> read.table(file=url(address))
3.
foreign
R 8
library(foreign)
1SAS
SASR SAS Transport format(XPORT)
SAS .ssd .sas7bdat
Transport format(XPORT),read.xport( )
2SPSS
read.spss( ) SPSS
3Epi info :
42
R
R epi5 6
> read.epiinfo(".rec")                 # read an Epi Info .rec file, or assign the result:
> read.epiinfo("d:/ttt.rec") -> ttt
4Stata :
R Stata5,6,7
> read.dta(".dta")                     # read a Stata .dta file
$

>mean(data$age)
d at a age

2
.4

2 .4.1

name = function(arg1, arg2, ...)
{
    statements making up the function body
}

1.

R std
R sd
> std = function(x) { sqrt(var(x)) }
{}
> std = function(x) sqrt(var(x))
std

> x=c(1,3,5,7,9)
> std(x)
[1] 3.162278

> std
function(x) sqrt(var(x))
2. function
function R

3.

1

44
R
welcomewelcome use R
> welcome = function() print("welcome to use R")
> welcome()
[1] "welcome to use R"
2

> welcome.sb = function(names) print(paste("welcome", names, "to use R"))
> welcome.sb("Mr fang")
[1] "welcome Mr fang to use R"
> welcome.sb("Mr Wang")
[1] "welcome Mr Wang to use R"
3
welcome.sb

> welcome.sb()
Error in paste("welcome", names, "to use R") :
  argument "names" is missing, with no default
welcome.sb
R

> welcome.sb = function(names="Mr fang") print(paste("welcome", names, "to use R"))
> welcome.sb()
[1] "welcome Mr fang to use R"
=10=5
t
> sim.t=function(n){
    mu=10; sigma=5
    x=rnorm(n,mu,sigma)
    (mean(x)-mu)/(std(x)/sqrt(n))
}
> sim.t(5) # 5
[1] -5.794325
sim.t

> sim.t = function(n,mu=10,sigma=5){
    x=rnorm(n,mu,sigma)
    (mean(x)-mu)/(std(x)/sqrt(n))
}
> sim.t(5) # 5 10 5
[1] 2.363376
> sim.t(5,0,1) # 5 0 1
[1] -0.720194
> sim.t(5,4) # 5 4 5
[1] -0.4255511
> sim.t(5,sigma=100) # 5 10 100
[1] -10.22042
> sim.t(5,sigma=100,mu=1) # 5 1 100
[1] 1.624949


sim.t(5,0,1)5 n0 mu1 sigma
sim.t(5,4)5 n4 mu
sigma sigma 5
sim.t(5,sigma=100,mu=1)
46
R

R R

x y

> plot.f=function(f,a,b,...){
xvals=seq(a,b,length=100)
plot(xvals,f(xvals),type="l",...)
}
> plot.f(sin,0,2*pi) 0 2 2-4
R curve
> curve(sin,0,2*pi) 0 2 2-4

2-
4 R

plot.f f

> plot.f(exp,-1,1) #-1 1


> plot.f(log,0,1) # 0 1
47
R

2-
5 R
4.

> my.average = function(x) sum(x)/length(x)


> my.average(c(1,2,3))
[1] 2
{}
vms 5

> vms=function(x){
xx=rev(sort(x))
xx=xx[1:5]
mean(xx)
}
> y=c(5,15,32,25,26,28,65,48,3,37,45,54,23,44)
> vms(y)
[1] 51.2
48
R
2 . 4. 2 f or
for
R
for for

for
for (variable in vector) {
    statements
}
x
sum
> x.sum = function(x){
    s=0
    for(i in 1:length(x)) s = s + x[i]
    s
}
> x.sum(c(1,2,3))
[1] 6
i 12lengthx
s=s+x[i]{}
2510
25 t for
> par(mfrow=c(2,2))
> for(i in c(2,5,10,25)) hist(rt(100,df=i),breaks=10)
49
R

2-
6 t

2 .4 . 3

> abs.x = function(x){
    if (x<0) {x=-x}
    x
}
> abs.x(-3)
[1] 3
> abs.x(3)
[1] 3
> abs.x(c(3,-3))
[1] 3 -3
50
R
Warning message:
the condition has length > 1 and only the first element will be used in: if (x < 0) {
The if condition looks only at the first element, so the vector c(3,-3) is not handled correctly.

> abs.x=function(x){
    x[x<0] = -x[x<0]
    x
}
> abs.x(c(3,-3))
[1] 3 3

The general form of if / else is:

if (condition) {
    statements
} else {
    statements
}
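For element-wise conditions, ifelse() avoids the length-one restriction of if; a minimal sketch of a vectorised absolute value:
> abs.v = function(x) ifelse(x < 0, -x, x)   # works element by element
> abs.v(c(3, -3, 0))
[1] 3 3 0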
51
R

3
.1

3 .1 .1
R runif( )
runif(n, min=0, max=1)n min
max minmax
[0,1]
> runif(5,0,1) # 5 [0,1]
[1] 0.5993 0.7391 0.2617 0.5077 0.7199
> runif(3,1,3) # 3 [1,3]
[1] 1.204 1.359 2.653
> runif(5) # 5 [0,1]
[1] 0.2784 0.7755 0.4107 0.8392 0.7455
100

> x=runif(100)
> hist(x,prob=T,col=gray(.9),main="uniform on [0,1]")
> curve(dunif(x,0,1),add=T) #
52
R

3-
1

3 .1 .2

IQ 100 16
0 1
rnorm( )
rnorm(n, mean=0, sd=1)n mean
0sd
1
> rnorm(5,10,5) # 5 10 5
[1] 3.172 14.705 7.173 5.842 8.879
> rnorm(5) # 5
[1] -0.58204 0.04606 0.96016 -0.68698 -0.35504
100
53
R

> x=rnorm(100)
> hist(x,prob=T,main="normal mu=0,sigma=1")
> curve(dnorm(x),add=T)

3-
2

3.1.3
n Bernoulli trials

p x x
B(n,p)n p
rbinom( )
rbinom(n, size, prob)n size
prob

> n=1; p=0.5
> rbinom(10,n,p)
54
R
[1] 0 0 0 1 0 0 1 1 0
B(10,0.5)
> n=10; p=0.5
> rbinom(5,n,p) 5 B(10,0.5)
[1] 5 6 3 3 3
n
100 n 1015
50, p 0.25
> par(mfrow=c(1,3))
> p=0.25
> for( n in c(10,20,50))
{ x=rbinom(100,n,p)
hist(x,prob=T,main=paste("n =",n))
xvals=0:n
points(xvals,dbinom(xvals,n,p),type="h",lwd=3)
}
> par(mfrow=c(1,1))
3-3 n

3-
3
55
R

3. 1.4

2500 2500
x xexp( )

R rexp( )rexp(n,
lamda = 1)n lamda1/mean
> x=rexp(100,1/10)                 # 100 values from an exponential distribution with mean 10
> hist(x,prob=T,col=gray(0.9),main="exponential, mean=10")
> curve(dexp(x,1/10),add=T)

3-
4

3.1.5
poisson
t F
56
R
r
3-1
3-
1
R
Beta beta(a,b) shape1,
shape2
Binomial binom(n,p) size, prob
Cauchy cauchy( ) location, scale
Chi-square chisq(df) df
Exponential exp(lamda) rate
F F f(df1,df2) df1 df2
Gamma gamma() shape rate
Geometric geom() prob
Hypergeometric hyper() m,n,k
Logistic logis() location scale
Negative binomial nbinom() size prob
Normal norm() mean, sd
Multivariate normal mvnorm() mean,cov
Poisson pois() lambda
T t t() df
Uniform unif() min, max
Weibull weibull() shape, scale
Wilcoxon wilcox() m, n

Besides the r- prefix for random numbers, each distribution also has p-, q- and d- functions (Table 3-2).

Table 3-2  Prefixes of the distribution functions

r-   random numbers
d-   density (probability) function
p-   cumulative distribution function
q-   quantile function

For the normal distribution, for example, these are rnorm, dnorm, pnorm and qnorm.

For a standard normal variable X, P(X <= 2) is
> pnorm(2)
[1] 0.9772499
and the value a satisfying P(X <= a) = 0.95 is
> qnorm(0.95)
[1] 1.644854
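Combining the p- and q- functions gives interval probabilities and two-sided quantiles; a small sketch for the standard normal:
> pnorm(2) - pnorm(-2)       # P(-2 < X < 2)
[1] 0.9544997
> qnorm(c(0.025, 0.975))     # central 95% quantile points
[1] -1.959964  1.959964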

3
.2

3 .2.1
R R
sample( ) sample(x,
n, replace = F, prob = NULL)X
n replace = F
replace = T prob

58
R
R H T
10
> sample(c("H","T"),10,rep=T)
[1] "H" "T" "T" "H" "T" "H" "H" "H" "T" "H"
10
> sample(1:6,10,rep=T)
[1] 3 3 6 1 2 5 2 5 2 4
> sample(100,10) #10010
[1] 91 27 20 35 49 56 70 57 32 12
> sample(100,10,rep=T) #10010
[1] 40 7 9 63 38 98 27 7 83 86
> dice=as.vector(outer(1:6,1:6,paste)) #
> sample(dice,5,replace=T) # 5
[1] "2 2" "2 3" "6 3" "5 2" "3 1"
> sample(dice,10,replace=T) # 10
[1] "2 4" "3 3" "5 6" "5 1" "5 6" "2 3" "6 2" "2 4" "3 4" "1 4"
> dice=as.vector(outer(1:6,1:6,paste))#
> dice
[1] "1 1" "2 1" "3 1" "4 1" "5 1" "6 1" "1 2" "2 2" "3 2" "4 2" "5 2"
"6 2"
[13] "1 3" "2 3" "3 3" "4 3" "5 3" "6 3" "1 4" "2 4" "3 4" "4 4" "5
4" "6 4"
[25] "1 5" "2 5" "3 5" "4 5" "5 5" "6 5" "1 6" "2 6" "3 6" "4 6" "5
6" "6 6"
outera,b,function function ab
a %o% b function
paste X Y
X Y
59
R
X
outer(1:6,1:6,paste)
[,1] [,2] [,3] [,4] [,5] [,6]
[1,] "1 1" "1 2" "1 3" "1 4" "1 5" "1 6"
[2,] "2 1" "2 2" "2 3" "2 4" "2 5" "2 6"
[3,] "3 1" "3 2" "3 3" "3 4" "3 5" "3 6"
[4,] "4 1" "4 2" "4 3" "4 4" "4 5" "4 6"
[5,] "5 1" "5 2" "5 3" "5 4" "5 5" "5 6"
[6,] "6 1" "6 2" "6 3" "6 4" "6 5" "6 6"
as.vector(X) X

3.2.2 bootstrap
bootstrap Efron 1982
(Resampling)

n
1/n bootstrap
bootstrap
R faithful eruptions
bootstrap

> faithful #
eruptions waiting
1 3.60 79
2 1.80 54

271 1.82 46
272 4.47 74
60
R
> attach(faithful) #
> sample(eruptions,10,replace=T) #
10
[1] 4.617 4.800 4.667 2.367 3.833 1.883 4.350 2.017 4.467 4.183
> Sample=sample(eruptions,1000,rep=T) #
1000 bootstrap
> par(mfrow=c(1,2))
> hist(eruptions,breaks=25)
> hist(Sample,breaks=25)
> par(mfrow=c(1,1))
3-5 eruptions
bootstrap

3-
5
> detach()
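A sketch of using the same resampling idea to approximate the sampling distribution of the mean of eruptions (the 1000 replications and the 95% limits below are illustrative choices, not from the text):
> attach(faithful)
> boot.means = replicate(1000, mean(sample(eruptions, replace=TRUE)))  # bootstrap means
> quantile(boot.means, c(0.025, 0.975))    # approximate 95% limits for the mean
> detach(faithful)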
61
R

3
.3

3. 3.1
R

1. The central limit theorem

If {X_n} are i.i.d. with E(X_1) = μ and Var(X_1) = σ² (0 < σ² < ∞), then for large n

    Y_n* = (X_1 + X_2 + ... + X_n - nμ) / (σ√n) = (X̄_n - μ) / (σ/√n)

is approximately N(0,1).

2. Normal approximation to the binomial

If z ~ b(n, p), then for large n

    x = (z - np) / √(np(1-p))

is approximately N(0,1).

62
R

R
> n=10; p=0.25
> z=rbinom(1,n,p)
> x=(z-n*p)/sqrt(n*p*(1-p))
> x
[1] 0.3651484

100
R
> m =100 # m
> n = 10; p = 0.25
> z = rbinom(m,n,p) # 100
> x = (z-n*p)/sqrt(n*p*(1-p)) # 100
> hist(x,prob=T,main=paste("n =",n))
> curve(dnorm(x),add=T) #

3-
6
3.
m=100
63
R
n=10=0.25

> sim.clt <- function (m=100,n=10,p=0.25)
{ z = rbinom(m,n,p)
x = (z-n*p)/sqrt(n*p*(1-p))
hist(x,prob=T,breaks=20,main=paste("n =",n,"p =",p))
curve(dnorm(x),add=T)
}
> sim.clt() # m=100n=10p=0.25
> sim.clt(1000) # m=1000n=10p=0.25
> sim.clt(1000,30) # m=1000n=30p=0.25
> sim.clt(1000,30,0.5) # m=1000n=30p=0.5
3-7

3-
7
4.

64
R

Q1 Q3
q q 25
Q150%75 Q3

R
qqnorm qqline qqline

100 0 1
10 5 10
[0,1]

> par(mfrow=c(2,2))
> x=rnorm(100,0,1);qqnorm(x,main="N(0,1)");qqline(x)
> x=rnorm(100,10,25);qqnorm(x,main="N(10,25)");qqline(x)
> x=rexp(100,1/10);qqnorm(x,main="exp(0.1)");qqline(x)
> x=runif(100,0,1); qqnorm(x,main="U(0,1)");qqline(x)
> par(mfrow=c(1,1))
3-8

65
R

3-
8

3. 3. 2

sim.fun( )

sim.fun
sim.fun R

sim.fun <- function(m, f, ...)    # m simulations of the statistic computed by f
{
    sample <- 1:m
    for (i in 1:m) {
        sample[i] <- f(...)
    }
    sample
}
1

>f<- function(n=10,p=0.5){s=rbinom(1,n,p);(s-n*p)/sqrt(n*p*(1-p)) }
npn f
10p f 0.5
> x=sim.fun(1000,f) # 1000
> hist(x,prob=T)

3-
9 10
00
2

> f = function(n=10) (mean(runif(n))-1/2)/(1/sqrt(12*n))


> x=sim.fun(1000,f) # 1000
> hist(x,prob=T)
67
R

3-
10 1
000
3 1000
sim.fun( )

> f=function(n=10,mu=0,sigma=1){r=rnorm(n,mu,sigma); (mean(r)-mu)/(sigma/sqrt(n))}
> x = sim.fun(1000,f) # 1000 10 N(0,1)
> hist(x,breaks=10,prob=T)
68
R

3-
11 1
000

> x = sim.fun(1000,f,30,5,2) # 1000 30 N(5,4)

> hist(x,breaks=10,prob=T)

2
3-
12 1
000 N(5
,2 )
69
R
4
n
10
= =1/
> f <- function(n,mu=10) (mean(rexp(n,1/mu))-mu)/(mu/sqrt(n))
n 151030
100
> x=seq(-3,3,0.01)
> par(mfrow=c(2,2))
> hist(sim.fun(100,f,1,10),prob=T,main="n=1")
> points(x,dnorm(x,0,1),type="l")
> hist(sim.fun(100,f,5,10),prob=T,main="n=5")
> points(x,dnorm(x,0,1),type="l")
> hist(sim.fun(100,f,10,10),prob=T,main="n=10")
> points(x,dnorm(x,0,1),type="l")
> hist(sim.fun(100,f,30,10),prob=T,main="n=30")
> points(x,dnorm(x,0,1),type="l")
> par(mfrow=c(1,1))
3-13 n

70
R

3-
13
71
R

Exploratory Data Analysis EDA

4
.1


R
1barplot
2histdotchartstem

3boxplot

4qqnorm


EDA
4.1 24

72
R

4-
1
math stat math stat
1 81 72 13 83 78
2 90 90 14 81 94
3 91 96 15 77 73
4 74 68 16 60 66
5 70 82 17 66 58
6 73 78 18 84 87
7 88 89 19 80 86
8 78 82 20 85 84
9 95 96 21 70 82
10 63 75 22 54 56
11 85 86 23 93 98
12 60 71 24 68 76
"mathstat.txt"

> MS=read.table("mathstat.txt") # MS
> MS
maths stats
1 81 72
2 90 90

24 68 76

> stem(MS$maths)
The decimal point is 1 digit(s) to the right of the |
5|4
6 | 00368
7 | 003478
73
R
8 | 01134558
9 | 0135
> stem(MS$stats)
The decimal point is 1 digit(s) to the right of the |
5 | 68
6 | 68
7 | 1235688
8 | 22246679
9 | 04668



EDA
EDA
EDA <- function (x)
{ par(mfrow=c(2,2)) # 4
2 2
hist(x); #
dotchart(x); #
boxplot(x,horizontal=T); #
qqnorm(x);qqline(x) #
par(mfrow=c(1,1)) #
}
> EDA(MS$maths) #
> EDA(MS$stat) #
4-1 4-2

74
R

4-
1

4-
2
4.2 2005 66
10 10

75
R

>pay=c(11,19,14,22,14,28,13,81,12,43,11,16,31,16,23,42,22,26,17,22,
13,27,108,16,43,82,14,11,51,76,28,66,29,14,14,65,37,16,37,35,
39,27,14,17,13,38,28,40,85,32,25,26,16,12,54,40,18,27,16,14,
33,29,77,50,19,34)
> EDA(pay)

4-3

10
> log.pay =log10(pay)
> EDA(log.pay)

4.3 EWR ewr Newark


3 10
76
R

4-
4
> ewr # R
data()
Year Month AA CO DL HP NW TW UA US inorout
1 2000 Nov 8.6 8.3 8.6 10.4 8.1 9.1 8.4 7.6 in
2 2000 Oct 8.5 8.0 8.4 11.2 8.2 8.5 8.5 7.8 in

45 1999 Feb 24.1 26.7 21.9 24.8 20.7 21.1 23.8 20.6 out
46 1999 Jan 24.8 29.5 22.2 27.4 21.4 24.0 23.7 20.2 out
> boxplot(ewr[,3:10])
77
R

4-
5 N
ewark

> par(mfrow=c(2,4)) # 2 4
> for (i in 3:10) boxplot (ewr [,i] ~ as.factor (ewr$inorout), main =
names(ewr)[i])
> par(mfrow=c(1,1)) #
boxplot boxplot(y ~ x) x
as.factor
inorout for
8

4-
6 N
ewark
78
R

HP America West 10
EWR
30
(NW)

4.4
6

1.
1
> X = runif(100); EDA(X)

4-
7
2
> X = rnorm(100); EDA(X)
79
R

4-8
3 t
> X = rt(100,10); EDA(X)

4-9t
2.
1F
> X = rf(100,10,10); EDA(X)
80
R

4-
10
2
> X=abs(rnorm(200)); EDA(X)

4-
11
3
81
R
> X=rexp(200); EDA(X)

4-
12

4
.2

4 .2. 1

1. table
R table
X
tablex
4.5
c
82
R
table
> x=c("","","","","","","","","")
> table(x)
x

4 5
x 9
4 5
2. Barplot

R barplot()

4.6 25
1 2 3 4
3 4 1 1 3 4 3 3 1 3 2 1 2 1 2 3 2 3 1 1 1 1 4 3 1
> drink=c(3, 4,1,1,3,4,3,3,1,3,2,1,2,1,2,3,2,3,1,1,1,1,4,3,1)
> barplot(drink)
> barplot(table(drink))
4-13 Y
4-14

4
-13 Y 4
-14 Y

4-15 length
Y
83
R

4-16
> barplot(table(drink)/length(drink))
>barplot(table(drink), col = c("red","yellow","blue","white"))

4-
15 Y 4-
16 Y
3. Pie Graph

100
R
pie()

> drink.count=table(drink) #y drink.count


>pie(drink.count)
4-17 R
1234 R

>names(drink.count)=c("","","","")
>pie(drink.count)
84
R

4-
17
4-17 1234

>pie(drink.count,col=c("purple","green","cyan","white"))
# 4-17 4-17

4. 2.2
1.

mean()median()var()sd()
4.7
200021002200230023502450250027002900
28503500380026003000330032004000 31004200

> salary = c(2000,2100,2200,2300,2350,2450,2500 ,2700,2900,2850,
3500,3800,2600,3000,3300,3200,4000,3100,4200)
> mean(salary)
[1] 2900
> median(salary)
85
R
[1] 2850
> var(salary)
[1] 410000
> sd(salary)
[1] 640.3124
R fivenum
summary
> fivenum(salary)
[1] 2050 2400 2850 3250 4200
> summary(salary)
Min. 1st Qu. Median Mean 3rd Qu. Max.
2050 2400 2850 2900 3250 4200
2.


15 000

> salarym=c(salary,15000)
> mean(salarym)
[1] 3503
2900
3503

> median(salarym)
[1] 2875
R
86
R
mean trim
> mean(salarym,trim=0.2)
[1] 2871
20
> mean(salarym,0.5)
[1] 2875
50
trim


IQR(mad)
> IQR(salarym)
[1] 925
> mad(salarym)
[1] 704.2
3. Stem-and-Leaf Graph

R stem

> stem(salary)
The decimal point is 3 digit(s) to the right of the |
2 | 01234
2 | 556799
3 | 0123
3 | 58
4 | 02
87
R
2 000-4 000

> stem(salarym)
The decimal point is 4 digit(s) to the right of the |
0 | 2222223333333334444
0|
1|
1|5
15 000
4.
R
cut 2 0003
0003 0004 0004 000 table()

> salaryg=cut(salary,breaks=c(2000,3000,4000,max(salary)))
> table(salaryg)
salaryg
(2e+03,3e+03) (3e+03,4e+03) (4e+03,4.2e+03)
11 6 1

2 0003 000
11 30004000 6 4000
1
5. Histogram

88
R

1 R hist()
R
probability T F

>hist(salary)
>hist(salary,prob=T)

4-
18

R rug X
4-19
> rug(salary)
89
R

4-
19

6. Boxplot Graph

R
boxplot( )
horizontal T

> boxplot(salary)
> boxplot(salary,horizontal=T)
4-20
90
R

4-
20

7. Densitis
R
density() R
faithful datasets
2 eruptions waiting
272 4-21

> hist(faithful$eruptions,prob=T,breaks=25)   # faithful


> lines(density(faithful$eruptions),col='red')

4-
21
91
R

4
.3

4. 3.1
1.
R table()
table (
)
4.8
4-2
4-
2

Smoke Study
1 Y 5 <5h
2 N 5-10 5-10h
3 N 5-10 5-10h
4 Y 10 >10h
5 N 10 >10h
6 Y 5 <5h
7 Y 5-10 5-10h
8 Y 5 <5h
9 N 10 >10h
92
R
10 Y 5-10 5-10h
> smoke=c("Y","N","N","Y","N","Y","Y","Y","N","Y")
> study=c("<5h","5-10h","5-10h",">10h",">10h","<5h","5-10h","<5h",">10h","5-10h")
> table(smoke,study)
study
smoke <5h >10h 5-10h
N 0 2 2
Y 3 1 2



R
prop.table( )prop.table(x, margin)
margin=1 margin=2

> tab=table(smoke,study)
> prop.table(tab,1)
study
smoke <5h >10h 5-10h
N 0.0000 0.5000 0.5000
Y 0.5000 0.1667 0.3333
> prop.table(tab,2)
study
smoke <5h >10h 5-10h
N 0.0000 0.6667 0.5000
Y 1.0000 0.3333 0.5000
> prop.table(tab)
study
93
R
smoke <5h >10h 5-10h
N 0.0 0.2 0.2
Y 0.3 0.1 0.2
apply

> prop = function(x) x/sum(x)


> apply(tab,2,prop)
study
<5h >10h 5-10h
N 0 0.6667 0.5
Y 1 0.3333 0.5
1
apply(tab,1,prop)
t( )
> t(apply(tab,1,prop))
smoke <5h >10h 5-10h
N 0.0 0.5000 0.5000
Y 0.5 0.1667 0.3333
2.

R
barplot( )
barplot
> par(mfrow=c(1,3))
> barplot(table(smoke,study)) study
> barplot(table(study,smoke)) smoke
> barplot(table(study,smoke),beside=T,legend.text=c("<5h","5-10h",">10h"))
main sub
beside False
True R False
legend.text 4.3-1
4-22

4-
22

4 .3. 2

4.9
5 5 5 13 7 11 11 9 8 9
11 8 4 5 9 5 10 5 4 10

R

> x=c(5,5,5,13,7,11,11,9,8,9)
> y=c(11,8,4,5,9,5,10,5,4,10)
> boxplot(x,y)
4-23 x y
95
R
x y xy
x y

1 2
5 5 5 13 7 11 11 9 8 9 11 8 4 5 9 5 10 5 4 10
1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2
> d=c(5,5,5,13,7,11,11,9,8,9,11,8,4,5,9,5,10,5,4,10)
> g=c(1,1,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,2,2)
> boxplot(d~g)

4-
23
4-23 d~g

4. 3.3

1.
R
plot( )
4.10 19852001 y
96
R
x 4-3

4
-3

> data.entry(c(NA)) #
> plot(x,y) # xy
> abline(lm(y~x)) #

4-
24
97
R
4-24 xy

2.
The Pearson correlation coefficient is

    ρ = Cov(X, Y) / √( Var(X)·Var(Y) )

and its sample estimate is

    r = Σ(X_t - X̄)(Y_t - Ȳ) / √( Σ(X_t - X̄)² · Σ(Y_t - Ȳ)² )

The Pearson coefficient lies in [-1, 1]: -1 < r < 0 indicates a negative linear relation, 0 < r < 1 a positive one, r = -1 a perfect negative and r = 1 a perfect positive linear relation, and r = 0 no linear relation.

The Spearman rank correlation coefficient is

    r_s = 1 - 6 Σ d_i² / ( n(n² - 1) )

where d_i is the difference between the ranks of x_i and y_i and n is the number of pairs. It also lies in [-1, 1] and is interpreted in the same way: values in (-1, 0) indicate a negative association, values in (0, 1) a positive one, r_s = -1 or 1 a perfect monotone association, and r_s = 0 no association.

In R the correlation is computed with cor(); for the data of Example 4.10:
> cor(x,y)
98
R
[1] 0.9998
> cor(y,x)
[1] 0.9998

x y y x

pearson cor( ) spearman



> cor(x,y,method="spearman")
[1] 1
spearman
pearson R spearman R
rank( )
> cor(rank(x),rank(y))
[1] 1
spearman
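To attach a significance test to the correlation, cor.test() can be used (a sketch; it reports the estimate together with a p-value):
> cor.test(x, y)                       # Pearson test of H0: rho = 0
> cor.test(x, y, method="spearman")    # Spearman version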

4
.4

4.4.1

R data frame
4.3.3

99
R

1. attach
attach( ) detach( )

attach( )
$

reg1.txt R
> yx=read.table("reg1.txt",header=T)
> yx
t y x
1 1990 2937 2822
2 1991 3149 2990

11 2000 13395 12582
12 2001 16386 15301

$ attach( )

>x
"x"
> yx$x
[1] 2822 2990 3297 4255 5127 6038 6910 8234
9263 10683 12582 15301
> attach(yx)
>x
[1] 2822 2990 3297 4255 5127 6038 6910 8234
9263 10683 12582 15301
detach
> detach()
100
R
>x
"x"
R

1

2rm( ) 2

> x="That's wrong"


>x
[1] "That's wrong"
> rm(x)
>x
[1] 2822 2990 3297 4255 5127 6038 6910 8234
9263 10683 12582 15301
> detach()
>x
"x"
2.

4.3.3
[]
> yx[,"y"] # y
[1] 2937 3149 3483 4349 5218 6242 7408 8651
9876 11444 13395 16386
101
R
> yx[,1] #
[1] 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000
2001
> yx[1:5,1:3] # 1 5 1 3
t y x
1 1990 2937 2822
2 1991 3149 2990
3 1992 3483 3297
4 1993 4349 4255
5 1994 5218 5127
> yx[1,]
t y x
1 1990 2937 2822
> yx[1,2] # 2 1
[1] 2937
> yx[,] #
3.

> yx$x # x
[1] 2822 2990 3297 4255 5127 6038 6910 8234
9263 10683 12582 15301
$[[()]]

> yx[['y']] # y
[1] 2937 3149 3483 4349 5218 6242 7408 8651
102
R
9876 11444 13395 16386
> yx[[1]] # t
[1] 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000
2001

1998
> yx[yx$t>'1998',]
t y x
9 1998 9876 9263
10 1999 11444 10683
11 2000 13395 12582
12 2001 16386 15301

4. 4.2

R unstack( )
stack( )
4.11 dataset PlantGrowth

> data(PlantGrowth)
> PlantGrowth #
weight group
1 4.17 ctrl 11 4.81 trt1 21 6.31 trt2
2 5.58 ctrl 12 4.17 trt1 22 5.12 trt2
3 5.18 ctrl 13 4.41 trt1 23 5.54 trt2
4 6.11 ctrl 14 3.59 trt1 24 5.50 trt2
5 4.50 ctrl 15 5.87 trt1 25 5.37 trt2
6 4.61 ctrl 16 3.83 trt1 26 5.29 trt2
103
R
7 5.17 ctrl 17 6.03 trt1 27 4.92 trt2
8 4.53 ctrl 18 4.89 trt1 28 6.15 trt2
9 5.33 ctrl 19 4.32 trt1 29 5.80 trt2
10 5.14 ctrl 20 4.69 trt1 30 5.26 trt2
ctrltrt1
1 trt2 2 10

> attach(PlantGrowth)
> stripchart(weight~group,pc=1) #
> boxplot(weight~group, horizontal=T) #
> detach

4-
25


ctrltrt1
trt2
> unPG=unstack(PlantGrowth)
> unPG
ctrl trt1 trt2
1 4.17 4.81 6.31
2 5.58 4.17 5.12
104
R
3 5.18 4.41 5.54
4 6.11 3.59 5.50
5 4.50 5.87 5.37
6 4.61 3.83 5.29
7 5.17 6.03 4.92
8 4.53 4.89 6.15
9 5.33 4.32 5.80
10 5.14 4.69 5.26
> boxplot(unPG)
boxplotweight~group
stack( ) unPG
> sPG=stack(unPG)
> sPG
values ind
1 4.17 ctrl
2 5.58 ctrl
3 5.18 ctrl

29 5.80 trt2
30 5.26 trt2

4.4.3

R
1.
table( ) table( )
xyz table(x,y)
xy table(x,y,z) z xy
105
R


4.12 MASS Cars93
> library(MASS) # Car93 MASS
> data(Cars93)
> attach(Cars93)
> names(Cars93)
[1] "Manufacturer" "Model" "Type"
[4] "Min.Price" "Price" "Max.Price"
[7] "MPG.city" "MPG.highway" "AirBags"
[10] "DriveTrain" "Cylinders" "EngineSize"
[13] "Horsepower" "RPM" "Rev.per.mile"
[16] "Man.trans.avail" "Fuel.tank.capacity" "Passengers"
[19] "Length" "Wheelbase" "Width"
[22] "Turn.circle" "Rear.seat.room" "Luggage.room"
[25] "Weight" "Origin" "Make"
27 Price()(0
12 ] (1220 ](20max(Price))
MPGhighway
020 ] 2030 ] 30maxMPGhighway

cut
> price=cut(Price,c(0,12,20,max(Price))) Price
> levels(price)=c("cheap","okay","expensive") price
> mpg=cut(MPG.highway,c(0,20,30,max(MPG.highway)))
> levels(mpg)=c("gas guzzler","oky","miser")
Type price
Type priceType mpg
> table(Type)
106
R
Type
Compact Large Midsize Small Sporty Van
16 11 22 21 14 9
> table(price,Type)
Type
price Compact Large Midsize Small Sporty Van
cheap 3 0 0 18 1 0
okay 9 3 8 3 9 8
expensiv 4 8 14 0 4 1
> table(price,Type,mpg)
, , mpg = gas guzzler
Type
price Compact Large Midsize Small Sporty Van
cheap 0 0 0 0 0 0
okay 0 0 0 0 0 2
expensive 0 0 0 0 0 0

, , mpg = oky
Type
price Compact Large Midsize Small Sporty Van
cheap 1 0 0 4 0 0
okay 5 3 6 0 6 6
expensive 4 8 14 0 4 1

, , mpg = miser
Type
price Compact Large Midsize Small Sporty Van
cheap 2 0 0 14 1 0
107
R
okay 4 0 2 3 3 0
expensive 0 0 0 0 0 0

2.

table( ) barplot( )
R
besideTRUE
> barplot(table(price,Type))
> barplot(table(price,Type),beside=T)
price Type 4-26
4-26 Type CompactLarge
price cheapokayexpensive

4-
26 C
ars
93 pri
ce Ty
pe

> barplot(table(Type,price))
> barplot(table(Type,price),beside=T)
108
R
Type prcie 4-27
4-27 prcie cheapokayexpensive
CompactLarge

4-
27 C
ars
93 Typ
e p
rice

3.

R boxplot( )

Cars93 price Type


> boxplot(Price~Type)
4-28price Type CompactLarge

109
R

4-
28 C
ars
93 pri
ce Ty
pe
4.13 1000 10
10
> r1=rnorm(1000)
> f1=factor(rep(1:10,100))
> boxplot(r1~f1)
4-29 rep( ) 100 1
10 1000 rep( ) 2.2.1 factor( )
10 factor( ) 2.2.6
110
R
4-
29 10 1000

4. stripchart

R
stripchart( ) stripchartztz
t z x t
y
4.12 Cars93 price Type
4.12
> stripchart(Type~price)
4-30
x 123..6 Type CompactLarge
4-28 4.4-5

4-
30 C
ars
93 pri
ce Ty
pe
111
R
4.14 100 5

> r2 = rnorm(100)
> f2 = factor(rep(1:5,20))
> stripchart(r2~f2)
4-31
[-1,1]

4-
31 1
00 5

5.
4.15 datasets iris Fisher
150
> iris
Sepal.Length Sepal.Width Petal.Length Petal.Width Species
1 5.1 3.5 1.4 0.2 setosa
2 4.9 3.0 1.4 0.2 setosa
112
R

149 6.2 3.4 5.4 2.3 virginica
150 5.9 3.0 5.1 1.8 virginica
> levels(iris$Species) # Species
[1] "setosa" "versicolor" "virginica"
setosaversicolor virginica
12 3
> iris.lab = rep(c("1", "2", "3"), rep(50, 3))
1

> plot(iris[,1],iris[,3],type="n") # iris 1 3


type=n
> text(iris[,1],iris[,3],cex=0.6) #
cex=0.6
> plot(iris[,1],iris[,3],type="n")
> text(iris[,1],iris[,3],iris.lab,cex=0.7) #
cex=0.7

4-
32 F
ish
er ir
is
113
R
2

R pairs

4.16
> pairs(iris)

4-
33 F
ish
er ir
is
> pairs(iris[1:4],pch=21,bg=iris.lab) # iris.lab
114
R

4-
34 F
ish
er ir
is

115
R


x̄  →  μ     (sample mean estimates the population mean)
s² →  σ²    (sample variance estimates the population variance)
s  →  σ     (sample standard deviation estimates σ)
p̂  →  P     (sample proportion estimates the population proportion)
95%

parmeter estimation

5
.1


1.
point estimation

116
R
2.
interval estimation



For a sample from a normal population with known σ, the sample mean x̄ satisfies:

    the interval  x̄ - σ/√n   to  x̄ + σ/√n    covers μ with probability 68.27%
    the interval  x̄ - 2σ/√n  to  x̄ + 2σ/√n   covers μ with probability 95.45%
    the interval  x̄ - 3σ/√n  to  x̄ + 3σ/√n   covers μ with probability 99.73%

Since z = (x̄ - μ)/(σ/√n) ~ N(0,1), a confidence interval at confidence level 1-α is

    x̄ ± z_{α/2} · σ/√n

For a 90% confidence level z = 1.65:   x̄ - 1.65·σ/√n  ≤ μ ≤  x̄ + 1.65·σ/√n
For a 95% confidence level z = 1.96:   x̄ - 1.96·σ/√n  ≤ μ ≤  x̄ + 1.96·σ/√n
For a 99% confidence level z = 2.58:   x̄ - 2.58·σ/√n  ≤ μ ≤  x̄ + 2.58·σ/√n

For a population proportion P, the sample proportion p̂ is an unbiased estimate (E p̂ = P), and an approximate 95% confidence interval is

    p̂ - 1.96·√( p̂(1-p̂)/n )  ≤ P ≤  p̂ + 1.96·√( p̂(1-p̂)/n )

In R, prop.test(), t.test() and wilcox.test() report confidence intervals of this kind.

5.2

5.2.1 The case of known σ

When σ is known, the statistic

    u = (x̄ - μ) / (σ/√n)

follows the standard normal distribution N(0,1); here x̄ is the sample mean and n the sample size. The 95% quantile points of u can be displayed as follows:
> a=0.05
> ua=qnorm(c(a/2,1-a/2))
> plot(ua,dnorm(ua),xlim=c(-3,3),ylim=c(0,0.5),type="h")
> x=seq(-3,3,0.1)
> curve(dnorm(x),-3,3,add=T)       # standard normal density
> legend(-0.5,0.3,"a=0.95",bty="n")

5-
1 u 9
5%

Example 5.1  A sample of 12 observations is drawn from a normal population with known σ = 2:
    7.2  3.5  4.3  6.2  10.1  5.4  6.8  4.5  5.1  6.6  3.8  8.2
Find a 95% confidence interval for the population mean.

> u.conf.int <- function(x, sigma, conf.level=0.95) {   # x is the sample
    n = length(x)
    xbar = mean(x)
    a = 1-conf.level
    ua = qnorm(1-a/2)
    Se = sigma/sqrt(n)
    xbar + c(-ua*Se, ua*Se)
}
> x = c(7.2,3.5,4.3,6.2,10.1,5.4,6.8,4.5,5.1,6.6,3.8,8.2)
> u.conf.int(x,2)
119
R
[1] 4.843 7.107
The 95% confidence interval is (4.843, 7.107).

5.2.2 The case of unknown σ

When σ is unknown it is replaced by the sample standard deviation s, and

    t = (x̄ - μ) / (s/√n)

follows a t distribution with n-1 degrees of freedom; for large n the t distribution is close to the standard normal.
The 95% quantile points of the t distribution (here with 12 degrees of freedom):
> a=0.05
> ta=qt(c(a/2,1-a/2),12)
> plot(ta,dt(ta,12),xlim=c(-4,4),ylim=c(0,0.5),type="h")
> x=seq(-4,4,0.1)
> curve(dt(x,12),-4,4,add=T)       # t density
> legend(-1,0.3,"a=0.95",bty="n")
120
R

5-
2 t 95
%

t
> t.conf.int <- function(x, conf.level=0.95) {
    n = length(x)
    xbar = mean(x)
    a = 1-conf.level
    ta = qt(1-a/2, n-1)
    s = sd(x)
    Se = s/sqrt(n)
    xbar + c(-ta*Se, ta*Se)
}
> x = c(7.2,3.5,4.3,6.2,10.1,5.4,6.8,4.5,5.1,6.6,3.8,8.2)
> t.conf.int(x)
[1] 4.741 7.209
The 95% confidence interval is (4.741, 7.209), slightly wider than the interval (4.843, 7.107) obtained with the u statistic and known σ.
In R the built-in t.test() function gives this t-based 95% confidence interval directly:

> t.test(x)
One Sample t-test
data: x
t = 10.65, df = 11, p-value = 3.914e-07
alternative hypothesis: true mean is not equal to 0
95 percent confidence interval:
4.741 7.209
sample estimates:
mean of x
5.975

5.2.3
t > z t
z t Se s
s t
z
t z

> x=rnorm(100); y=rt(100,10)       # 100 standard normal and 100 t(10) values
> par(mfrow=c(1,3))
> boxplot(x,y)
> hist(x)
> hist(y)
> par(mfrow=c(1,1))
122
R

5-
3 100 t

> x=seq(-4,4,.01)
> plot(x,dnorm(x),type="l",lty=1)
> for(i in c(1,5,10)) points(x,dt(x,df=i),type="l",lty=i+1)
> legend(-1,0.15,c("N(0,1)","t(10)","t(5)","t(1)"),lty=1:4,bty="n")

5-
4 t
123
R

5
.3


1
2
3

8

R
R
wilcox.test
5 . 2

21240  4632  22836  5484  5052  5064  6972  7596  14760  15012  18720  9480  4728  67200  52788.  Find a 95% confidence interval.

> x = c(21240,4632,22836,5484,5052,5064,6972,7596,14760,15012,
        18720,9480,4728,67200,52788)
> EDA(x)
124
R

5-
5

> wilcox.test(x, conf.int=T)
Wilcoxon signed rank test
data: x
V = 120, p-value = 6.104e-05
alternative hypothesis: true mu is not equal to 0
95 percent confidence interval:
6972 28926
sample estimates: pseudomedian
13065
95697228926

t.test R

125
R
t

5
.4


100 42
A
42% A
42% A 9%

42% A 95% 9%

For a Bernoulli population with success probability P, suppose x successes are observed in n trials. The sample proportion is p̂ = x/n, and an approximate 100(1-α)% confidence interval for P is

    p̂ ± u_{α/2}·√( p̂(1-p̂)/n ),   i.e.   ( p̂ - u_{α/2}√(p̂(1-p̂)/n),  p̂ + u_{α/2}√(p̂(1-p̂)/n) )
R
In R this is computed by prop.test(x, n), where x is the number of successes and n the number of trials; by default a 95% confidence interval is reported.
> prop.test(42,100)
1-sample proportions test with continuity correction
data: 42 out of 100, null probability 0.5
X-squared = 2.25, df = 1, p-value = 0.1336
alternative hypothesis: true p is not equal to 0.5
126
R
95 percent confidence interval:
0.3233 0.5229
sample estimates:
p
0.42
The 95% confidence interval is (0.32, 0.52). For a 90% interval:
> prop.test(42,100,conf.level=0.90)
1-sample proportions test with continuity correction
data: 42 out of 100, null probability 0.5
X-squared = 2.25, df = 1, p-value = 0.1336
alternative hypothesis: true p is not equal to 0.5
90 percent confidence interval:
0.3372 0.5072
sample estimates:
p
0.42
The 90% confidence interval is (0.33, 0.50).

5
.5

5-6
5.3 :100 95%
127
R
p

> m=100; n=20; p=0.5               # 100 samples, each of 20 Bernoulli trials
> phat=rbinom(m,n,p)/n             # sample proportions
> Se=sqrt(phat*(1-phat)/n)         # estimated standard errors
> a=0.05; zstar=qnorm(1-a/2)
> matplot(rbind(phat-zstar*Se,phat+zstar*Se),rbind(1:m,1:m),type="l",lty=1)
> abline(v=p)                      # the true value p=0.5

5-
6 100 9
5% 95 p

2
3
128
R

Xi

6
129
R

P<0.01
P<0.05
H0
,

130
R

6
.1

6. 1.1
1.
2

(1) Hypotheses: H0: μ = μ0,  H1: μ ≠ μ0 (or a one-sided alternative).
(2) Choose the significance level α.
(3) Compute the test statistic  u = (x̄ - μ0) / (σ/√n).
(4) From u obtain the P value.
(5) If P ≤ α, reject H0 and accept H1; if P > α, do not reject H0.
Example 6.1  A population has nominal mean 25 and standard deviation σ = 2.4. A sample of n = 10 gives mean x̄ = 23. Test
    H0: μ ≥ 25    against    H1: μ < 25.
In R the t-based version of such a test is t.test(); for the u test with known σ we write a small function that returns u and the one-sided p value:

u.test <- function(n, xbar, sigma, mu)
{
    Se = sigma/sqrt(n)
    u = (xbar-mu)/Se
    p = pnorm(u)
    c(u=u, p=p)
}
> u.test(10,23,2.4,25)     # the u statistic and its one-sided p value
      u       p
-2.6352  0.0042
P 0.0042

2.
2

(1) Hypotheses: H0: μ = μ0,  H1: μ ≠ μ0 (or a one-sided alternative).
(2) Choose the significance level α.
(3) Compute the test statistic  t = (x̄ - μ0) / (s/√n).
(4) From t obtain the P value.
(5) If P ≤ α, reject H0 and accept H1; if P > α, do not reject H0.
6.2

()
50012
502496510508506498512
497515503510506
?
R t.test
> x=c(502,496,510,508,506,498,512,497,515,503,510,506)
> t.test(x,mu=500,alternative="greater")     # H0: μ = 500 vs H1: μ > 500
One Sample t-test
data: x
132
R
t = 2.956, df = 11, p-value = 0.006529
alternative hypothesis: true mean is greater than 500
95 percent confidence interval:
502.1 Inf
sample estimates:
mean of x
505.3
Since p = 0.0065 < α = 0.05, H0 is rejected in favour of H1: the population mean exceeds 500.

6.1.2 Test of a proportion

Example: in a sample of 100, 45 have the attribute of interest. Is the population proportion 50%?

    H0: p = 0.5    H1: p ≠ 0.5

If X ~ b(1, p) and x1, x2, ..., xn is a sample with x successes in n trials, then for large n the sample proportion p̂ = x/n is approximately N(p, p(1-p)/n), so

    u = (p̂ - p) / √( p(1-p)/n )  ~  N(0,1)

This u test is carried out by prop.test():

> prop.test(45,100,p=.5)
1-sample proportions test with continuity correction
data: 45 out of 100, null probability 0.5
X-squared = 0.81, df = 1, p-value = 0.3681
alternative hypothesis: true p is not equal to 0.5
95 percent confidence interval:
0.3514 0.5525
133
R
sample estimates:
p
0.45
p 0.3681 p

p
45 55
p 100 45

1000 450
p=0.5
> prop.test(450,1000,p=.5)
1-sample proportions test with continuity correction
data: 450 out of 1000, null probability 0.5
X-squared = 9.801, df = 1, p-value = 0.001744
alternative hypothesis: true p is not equal to 0.5
95 percent confidence interval:
0.4189 0.4815
sample estimates:
p
0.45
p 0.001744
p n n

134
R

6
.2

6. 2.1 t
6.3

10
48
474445464743474248
36454738394236424635

5%

> x1=c(48,47,44,45,46,47,43,47,42,48)
> x2=c(36,45,47,38,39,42,36,42,46,35)
> qqnorm(x1,xlab="x1"); qqline(x1)     # Q-Q plot for x1
> qqnorm(x2,xlab="x2"); qqline(x2)     # Q-Q plot for x2
135
R

6
-1 6.
2Q
-Q

homogeneity of variance

1. Test of equality of two variances (F test)

Let X1 ~ N(μ1, σ1²) and X2 ~ N(μ2, σ2²) be two independent normal samples with sample variances S1², S2² and sizes n1, n2. To test

    H0: σ1² = σ2²    against    H1: σ1² ≠ σ2²

use

    F = (S1²/σ1²) / (S2²/σ2²) ~ F(n1-1, n2-1),

which under H0 reduces to

    F = S1² / S2² ~ F(n1-1, n2-1).

The rejection region is F > F_{α/2}(n1-1, n2-1) or F < F_{1-α/2}(n1-1, n2-1), where F_{1-α/2}(n1-1, n2-1) = 1/F_{α/2}(n2-1, n1-1). Equivalently, if P < α reject H0 (the variances differ); if P ≥ α do not reject H0. In R this is var.test():
> var.test(x1,x2)
F test to compare two variances
data: x1 and x2
F = 0.2273, num df = 9, denom df = 9, p-value = 0.03793
alternative hypothesis: true ratio of variances is not equal to 1
95 percent confidence interval:
0.05646 0.91521
sample estimates:
ratio of variances
0.2273
Since p = 0.0379 < 0.05, the hypothesis of equal variances is rejected: the two variances differ.
2. Two means with unequal variances (Welch t test)

This is the default of t.test() in R (and of S-PLUS); the pooled-variance test below requires var.equal=TRUE. When σ1² ≠ σ2², to test H0: μ1 = μ2 use

    t = (X̄ - Ȳ) / √( s1²/n + s2²/m )

Under H0 this is approximately a t statistic with degrees of freedom

    l = ( s1²/n + s2²/m )² / ( s1⁴/(n²(n-1)) + s2⁴/(m²(m-1)) )

> t.test(x1,x2)
Welch Two Sample t-test
137
R
data: x1 and x2
t = 3.288, df = 12.89, p-value = 0.005939
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
1.746 8.454
sample estimates:
mean of x mean of y
45.7 40.6
p = 0.0059 < 0.05

3. Two means with equal variances (pooled t test)

To test H0: μ1 = μ2 against H1: μ1 ≠ μ2 use

    t = ( (X̄1 - X̄2) - (μ1 - μ2) ) / S_{X̄1-X̄2}  ~  t(n1 + n2 - 2),

where

    S_{X̄1-X̄2} = Sc · √( 1/n1 + 1/n2 )

and Sc² is the pooled variance

    Sc² = ( (n1-1)s1² + (n2-1)s2² ) / ( (n1-1) + (n2-1) ),

s1² and s2² being the two sample variances. Under H0,

    t = (X̄1 - X̄2) / S_{X̄1-X̄2}  ~  t(n1 + n2 - 2).

For a two-sided test at level α the rejection region is |t| > t_{α/2}(n1+n2-2): if |t| > t_{α/2} reject H0, otherwise do not reject H0.

In R this pooled test is obtained from t.test() with the argument var.equal=TRUE:
> t.test(x1,x2,var.equal=T)
Two Sample t-test
data: x1 and x2
t = 3.288, df = 18, p-value = 0.004086
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
1.842 8.358
sample estimates:
mean of x mean of y
45.7 40.6
p = 0.004 < 0.05

6.2.2 t
t

Yi=Xi+ii i
0 Y X t
R t.test paired=TRUE

6.4
8.5kg
10
139
R

94.5 101 110 103.5 97 88.5 96.5 101 104 116.5


85 89.5 101.5 96 86 80.5 87 93.5 93 102
H 0H
1

t
> before = c(94.5,101,110,103.5,97,88.5,96.5,101,104,116.5)
> after = c(85,89.5,101.5,96,86,80.5,87,93.5,93,102)
> t.test(before,after,paired=T)
Paired t-test
data: before and after
t = 14.2, df = 9, p-value = 1.854e-07
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
8.28 11.42
sample estimates:
mean of the differences
9.85

8.5kg

6.2.3
prop.test

6.5


140
R
45 56
35 47
H01 = 2 H11 2
prop.test prop.testx,nx n
x

> prop.test(c(45,56),c(45+35,56+47))
2-sample test for equality of proportions with continuity correction
data: c(45, 56) out of c(45 + 35, 56 + 47)
X-squared = 0.0108, df = 1, p-value = 0.9172
alternative hypothesis: two.sided
95 percent confidence interval:
-0.1374478 0.1750692
sample estimates:
prop 1 prop 2
0.5625000 0.5436893
P 0.9172 1 = 2

6
.3

If Z1, Z2, ..., Zn are independent N(0,1) variables, then

    χ² = Σ_{i=1}^{n} Zi²

follows the chi-squared distribution with n degrees of freedom, written χ²(n).
141
R
6-2 df
> x=seq(0,20,0.1)
> curve(dchisq(x,2),0,20,ylab="p(x)")
> curve(dchisq(x,4),add=T,lty=2)
> curve(dchisq(x,6),add=T,lty=3)
> curve(dchisq(x,8),add=T,lty=4)
> curve(dchisq(x,10),add=T,lty=5)
> legend(13,0.4,c("df=2","df=4","df=6","df=8","df=10"),lty=1:5,bty="n")

6-
2df

6-2
df 6-3 5 100

> X5=rchisq(100,5)
> EDA(X5)
142
R

2
6-
3 5

6-3
6.3-1

6.3.1
Chi-squared goodness of fit tests

1/6
6.6 ?
150

1 2 3 4 5 6
143
R
22 21 22 27 22 36 150

1/6 150 25
6 36


Let fi be the observed count in class i and ei the expected count under the hypothesised distribution. The test statistic is

    χ² = Σ_{i=1}^{n} (fi - ei)² / ei

Provided the expected counts are not too small (as a rule, at least 80% of the ei should be ≥ 5), this statistic is approximately χ²(n-1), where n is the number of classes.
> freq = c(22,21,22,27,22,36)
> probs = c(1,1,1,1,1,1)/6        # the hypothesised probabilities
> chisq.test(freq,p=probs)
Chi-squared test for given probabilities
data: freq
X-squared = 6.72, df = 5, p-value = 0.2423
i p i

pi=1/6 pi
144
R
χ² = 6.72 with df = 6-1 = 5 and p = 0.2423, so there is no evidence that the die is unfair.

6.7
5
26 5

letter E T N R O
freq 29 21 17 17 16
ETNRO
100 29 E 4

ETNR O
letter E T N R O
freq 100 110 80 55 14
E =0.29, T =0.21,
N=0.17, R =0.17 O=0.16

> freq = c(100,110,80,55,14)
> probs = c(29,21,17,17,16)/100
> chisq.test(freq,p=probs)
Chi-squared test for given probabilities
data: freq
X-squared = 55.3955, df = 4, p-value = 2.685e-11

6.3.2
145
R
Chi-squared tests of independence

6.8


12813 647 359 42


65963 4000 2642 303

Pnone and yes = Pnone


Pyes n
n
n n

> yesbelt = c(12813,647,359,42)
> nobelt = c(65963,4000,2642,303)
> chisq.test(rbind(yesbelt,nobelt))
Pearson's Chi-squared test
data: rbind(yesbelt, nobelt)
X-squared = 59.22, df = 3, p-value = 8.61e-13

P

146
R
6. 3. 3

Chi-squared tests for homogeneity


6.9

sample

100 100

> die.fair = sample(1:6,100,p=c(1,1,1,1,1,1)/6,rep=T)      # a fair die
> die.bias = sample(1:6,100,p=c(.5,.5,1,1,1,2)/6,rep=T)    # a biased die
> res.fair = table(die.fair); res.bias = table(die.bias)
> count = rbind(res.fair,res.bias)
> count
1 2 3 4 5 6
res.fair 15 21 22 17 15 10
res.bias 7 4 18 22 16 33

6 2

147
R

Under the hypothesis that the two dice share one distribution, the expected count in a cell is (row total × column total) / grand total. For face 2, for example, the combined proportion is (21 + 4)/(100 + 100) = 25/200, so the expected count in each row is 100 × 25/200 = 12.5, compared with the observed 21 and 4. The statistic is again

    χ² = Σ (fi - ei)² / ei

with (2-1)(6-1) = 5 degrees of freedom. chisq.test() applied to the two-row table does all of this:
> chisq.test(count)
Pearson's Chi-squared test
data: count
X-squared = 27.84, df = 5, p-value = 3.903e-05
The very small p value shows the two dice do not follow a common distribution. The expected counts are stored in the component expected:
> chisq.test(count)$exp        # the expected counts ei
1 2 3 4 5 6
res.fair 11 12.5 20 19.5 15.5 21.5
res.bias 11 12.5 20 19.5 15.5 21.5
148
R

Regression Analysis

7
.1
149
R

7.1.1

The simple linear regression line is

    ŷ = a + b·x

where ŷ is the predicted value of y at x, a is the intercept and b the slope (the change in y per unit change in x). The estimates of a and b are obtained by least squares: for observations (xi, yi), i = 1, 2, ..., n, with fitted values ŷi, the residuals are yi - ŷi, and we minimise

    Q = Σ_{i=1}^{n} (yi - ŷi)² = Σ_{i=1}^{n} [ yi - (a + b·xi) ]²

Setting the partial derivatives of Q with respect to a and b to zero gives

    b = Σ(xi - x̄)(yi - ȳ) / Σ(xi - x̄)²  =  lxy / lxx
    a = ȳ - b·x̄
7.1 R
MaxRate=220-Age
15
Agex18 23 25 35 65 54 34 56 72 19 23 42 18 39 37
MaxRatey202 186 187 180 156 169 174 172 153 199 193 174 198
183 178
lm
plot abline
> x = c(18,23,25,35,65,54,34,56,72,19,23,42,18,39,37)
> y = c(202,186,187,180,156,169,174,172,153,199,193,174,198,183,178)
> plot(x,y)        # scatter plot

7-
1
> fm = lm(y ~ x)   # fit the regression
> abline(fm)       # add the fitted line to the plot
> fm
Call: lm(formula = y ~ x)
Coefficients:
(Intercept)        x
    210.048   -0.798
fm summary

resid coef predict


b0 b1

> coef(fm)         # or fm$coef
(Intercept)          x
   210.0485    -0.7977
> resid(fm)        # or fm$resid
1 2 3 4 5 6 7 8 9
6.3106 -5.7007 -3.1053 -2.1280 -2.1962 2.0288 -8.9258 6.6242 0.3879
10 11 12 13 14 15
4.1083 1.2993 -2.5439 2.3106 4.0629 -2.5326

7.1.2
EDA

EDA
152
R

> EDA(resid(fm))

7-
2
Applying plot() to the fitted lm object produces four diagnostic plots; with par(mfrow=c(2,2)) R places all four on one page:

1Residuals vs. fitted


y = 0
2QQ QQplot

3Scale

4Cook Cook's distance


153
R

> plot(fm)

7-
3 pl
otl
mMa
xRa
te~Ag
e
7.1.3

The residual variance σ² is estimated by

    s² = (1/(n-2)) Σ (yi - ŷi)² = (1/(n-2)) Σ ei²,

and s² has n-2 degrees of freedom (two parameters, β0 and β1, have been estimated). The standard error of the slope estimate b1 is

    SE(b1) = s / √( Σ(xi - x̄)² )

and

    t = (b1 - β1) / SE(b1)

follows a t distribution with n-2 degrees of freedom. To test H0: β1 = a against H1: β1 ≠ a compute

    t = (b1 - a) / SE(b1)

and its p value. Here, instead of the default hypothesis β1 = 0, we test H0: β1 = -1 against H1: β1 ≠ -1 (the slope suggested by the rule MaxRate = 220 - Age), computing the p value by hand:

> n = length(x)
> res = resid(fm)                       # residuals
> b1 = coef(fm)[2]                      # the Age coefficient
> s = sqrt(sum(res^2)/(n-2))
> SE = s/sqrt(sum((x-mean(x))^2))
> t = (b1-(-1))/SE
> pt(t,n-2,lower.tail=FALSE)/2          # p value from the t distribution with 15-2 df
[1] 0.003155
p
155
R
-1
R H01 = 0
p summary Pr>|t|

> summary(fm)
Call: lm(formula = y ~ x)
Residuals:
    Min      1Q  Median      3Q     Max
 -8.926  -2.538   0.388   3.187   6.624
Coefficients:
            Estimate Std. Error t value Pr(>|t|)
(Intercept)  210.048      2.867    73.3  < 2e-16 ***
x             -0.798      0.070   -11.4  3.8e-08 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 4.58 on 13 degrees of freedom
Multiple R-Squared: 0.909, Adjusted R-squared: 0.902
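Confidence intervals for both coefficients can also be obtained directly with confint() (a sketch; the default level is 95%):
> confint(fm)              # 95% confidence intervals for the intercept and slope
> confint(fm, level=0.99)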

Inference for the intercept β0 works the same way (by default R tests β0 = 0). The standard error of b0 is

    SE(b0) = s·√( Σxi² / ( n·Σ(xi - x̄)² ) ) = s·√( 1/n + x̄²/Σ(xi - x̄)² )

and

    t = (b0 - β0) / SE(b0)

follows a t distribution with n-2 degrees of freedom.

To test whether the intercept equals 220 (H0: β0 = 220 against H1: β0 ≠ 220), using the s computed above:
> b0 = coef(fm)[1]                      # the intercept estimate
> SEb0 = s * sqrt(sum(x^2)/(n*sum((x-mean(x))^2)))
> t = (b0 - 220)/SEb0
> pt(t,n-2)
[1] 1.929e-22
p () 220

7.1.4

Given a new value x, the fitted line predicts y by ŷ = b0 + b1·x, and an interval of the form ŷ ± t·SE is attached to it. For estimating the mean response at x the standard error is

    SE = s·√( 1/n + (x - x̄)²/Σ(xi - x̄)² )

while for predicting an individual new observation at x it is

    SE = s·√( 1 + 1/n + (x - x̄)²/Σ(xi - x̄)² )
R
R predict

1
> predict(fm,data.frame(x=50))
[1] 170.2
x = 50 y 170.2
95% 7-4

2
> sx=sort(x)                                                  # sort the x values
> pred = predict(fm,data.frame(x=sx),interval="confidence")
> conf = predict(fm,data.frame(x=sx),interval="prediction")
> plot(x,y); abline(fm)
> lines(sx,conf[,2]); lines(sx,conf[,3])                      # 95% prediction band
> lines(sx,pred[,2],lty=3); lines(sx,pred[,3],lty=3)          # 95% confidence band
158
R
7-
4

7.1.5

lm
(1) Fit the model with lm(); for a response y and predictor x:
> fm = lm(y ~ x)            # the fitted model is stored in fm
(2) Summarise the fit:
> summary(fm)
(3) Draw the data and the fitted line with abline():
> plot(x,y)
> abline(fm)
(4) Extract the residuals with resid():
> resid(fm)
(5) Extract the coefficients with coef():
> coef(fm)
> coef(fm)[1]               # the intercept
> coef(fm)[2]               # the slope
(6) The fitted values ŷi = b0 + b1·xi are obtained with fitted() (component fitted.values):
> fitted(fm)
7SEb0 SEb1 summary

Coefficients summary

159
R
> coef(summary(fm))
             Estimate Std. Error t value  Pr(>|t|)
(Intercept) 210.0485    2.86694   73.27  2.124e-18
x            -0.7977    0.06996  -11.40  3.848e-08
The standard error of the slope is in row 2, column 2:
> coef(summary(fm))[2,2]
[1] 0.06996
8 predict

x 50 60
> predict(fm,data.frame(x=c(50,60)))
1 2
170.2 162.2
9
predict
x
x

90%

> pred = predict(fm,data.frame(x=sort(x)), level=.9, interval="confidence")
fit lwr upr
1 195.7 192.5 198.9
2 195.7 192.5 198.9

15 152.6 147.8 157.4
160
R
x 3 2
[,2] plot
lines
> pred = predict(fm,data.frame(x=sort(x)),level=.9,interval="confidence")
> plot(x,y); abline(fm)
> lines(sort(x),pred[,2],type="l")     # lower limit
> lines(sort(x),pred[,3],type="l")     # upper limit

7
.2

1 000

30 000 15 000
10 000

161
R
GPA SAT

7.2.1

For a single predictor the model is

    yi = β0 + β1·xi + εi,    εi ~ N(0, σ²),

where (xi, yi) are the observations and β0, β1, σ² the unknown parameters. With p predictors the multiple linear regression model is

    yi = β0 + β1·xi1 + β2·xi2 + ... + βp·xip + εi,    εi ~ N(0, σ²),

and a polynomial regression such as

    yi = β0 + β1·xi + β2·xi² + εi

is a special case. In matrix form the model is Y = Xβ + ε. The least squares estimates b minimise Σ(yi - ŷi)², where

    ŷi = b0 + b1·xi1 + b2·xi2 + ... + bp·xip,    ei = yi - ŷi.

Each bi estimates βi with standard error SE(bi), and (bi - βi)/SE(bi) follows a t distribution with n-(p+1) degrees of freedom (p+1 parameters β0, β1, ..., βp are estimated).

162
R
7.2 .2
7.2

lm

> x1 = 1:20
> x2 = sample(1:100,20)
> y = x1+x2                  # an exact linear relation, no error
> lm(y ~ x1+x2)
Call: lm(formula = y ~ x1 + x2)
Coefficients:
(Intercept)          x1           x2
  -5.19e-15    1.00e+00     1.00e+00
b0 = 0, b1 = 1, b2 = 1
> y = x1+x2 + rnorm(20,0,2)      # add N(0,2) errors
> lm(y ~ x1+x2)
Call: lm(formula = y ~ x1 + x2)
Coefficients:
(Intercept)          x1           x2
     -0.487       1.031        0.998
so b0 = -0.487, b1 = 1.031, b2 = 0.998.
> y = x1+x2 + rnorm(20,0,10)     # add N(0,10) errors
> lm(y ~ x1+x2)
Call: lm(formula = y ~ x1 + x2)
Coefficients:
(Intercept)          x1           x2
    -14.431       2.287        0.975
163
R

bi i bi
bi i 2


y y~ x1 + x2
y ~ x0
= 0-1
> lm(y ~ x1+x2-1)                # fit without an intercept
Call: lm(formula = y ~ x1 + x2 - 1)
Coefficients:
  x1    x2
1.71  0.86
lm summary
anova summary

> summary(lm(y ~ x1+x2))


Call: lm(formula = y ~ x1 + x2)
Residuals:
    Min      1Q  Median      3Q     Max
-22.816  -9.267  -0.377  10.292  17.566
Coefficients:
            Estimate Std. Error t value Pr(>|t|)
(Intercept)  -14.431      8.372   -1.72  0.10291
x1             2.287      0.490    4.67  0.00022 ***
x2             0.975      0.093   10.49  7.7e-09 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 12.6 on 17 degrees of freedom
164
R
Multiple R-Squared: 0.878, Adjusted R-squared: 0.864
F-statistic: 61.1 on 2 and 17 DF, p-value: 1.72e-08
summary lm
residuals

Estimate Std. Error


i0 t t value p
y p ***

residuals
R2R2
F p
1 = 2 = p = 0
ANOVA

7.2.3
7.3
19902001 12
yx1 x2
x3 x4

7-
1
y x1 x2 x3 x4
1990 29.3710 185.984 28.2187 55.601 653.23
1991 31.4948 216.625 29.9017 72.258 660.91
1992 34.8337 266.519 32.9691 91.196 667.82
165
R
1993 43.4895 345.605 42.5530 112.710 674.68
1994 52.1810 466.700 51.2688 203.819 681.35
1995 62.4220 574.949 60.3804 234.999 688.55
1996 74.0799 668.505 69.0982 241.338 697.65
1997 86.5114 731.427 82.3404 269.672 708.00
1998 98.7595 769.672 92.6280 268.577 720.87
1999 114.4408 805.794 106.8258 298.963 727.91
2000 133.9523 882.281 125.8151 392.742 739.92
2001 163.8604 943.464 153.0138 421.933 744.32

reg2.txt
reg2
> reg2=read.table("reg2.txt")
> reg2
y x1 x2 x3 x4
1990 29.37 186.0 28.22 55.60 653.2
1991 31.49 216.6 29.90 72.26 660.9

2001 163.86 943.5 153.01 421.93 744.3

pairs

1
> pairs(reg2)
166
R

7-
5
2
> fm=lm(y~x1+x2+x3+x4, data=reg2)
> fm
Call: lm(formula = y ~ x1 + x2 + x3 + x4, data = reg2)
Coefficients:
(Intercept)       x1        x2        x3        x4
    4.09009  0.00498   1.13221  -0.02648  -0.00932
3
> anova(fm)
Analysis of Variance Table
Response: y
          Df  Sum Sq Mean Sq  F value   Pr(>F)
x1 1 18698 18698 25761.74 9.6e-14 ***
x2 1 2125 2125 2927.53 1.9e-10 ***
x3 1 3 3 4.17 0.08 .
x4 1 0.012 0.012 0.02 0.90
167
R
Residuals 7 5 1
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
4t
> summary(fm)$coef
              Estimate Std. Error  t value  Pr(>|t|)
(Intercept)   4.090091  45.723550  0.08945 9.312e-01
x1 0.004980 0.007213 0.69040 5.122e-01
x2 1.132210 0.041402 27.34685 2.243e-08
x3 -0.026478 0.013519 -1.95856 9.101e-02
x4 -0.009324 0.072078 -0.12937 9.007e-01
5
> plot(as.numeric(rownames(reg2)),reg2$y,type="p",xlab="year")
> lines(as.numeric(rownames(reg2)),fm$fit)

7-
6
7.4
1609
168
R

x 100 200 300 450 600 800 1000


y 253 337 395 451 495 534 574

> x = c(100,200,300,450,600,800,1000)
> y = c(253,337,395,451,495,534,574)
> lm.1 = lm(y ~ x)                      # y = a + bx
> lm.2 = lm(y ~ x + I(x^2))             # y = a + bx + cx^2
> lm.3 = lm(y ~ x + I(x^2) + I(x^3))    # y = a + bx + cx^2 + dx^3
summary
> summary(lm.1)$coef
            Estimate Std. Error t value  Pr(>|t|)
(Intercept) 269.4661   24.18421  11.142 0.0001015
x             0.3341    0.04181   7.992 0.0004951
> summary(lm.2)$coef
              Estimate Std. Error t value  Pr(>|t|)
(Intercept) 200.211950  1.695e+01  11.811 0.0002941
x             0.706182  7.568e-02   9.332 0.0007342
I(x^2)       -0.000341  6.754e-05  -5.049 0.0072374
> summary(lm.3)$coef
              Estimate Std. Error t value  Pr(>|t|)
(Intercept)  1.555e+02  8.182e+00  19.003 0.0003182
x            1.119e+00  6.454e-02  17.332 0.0004185
I(x^2)      -1.254e-03  1.360e-04  -9.220 0.0026986
I(x^3)       5.550e-07  8.184e-08   6.782 0.0065519
In the model formulas, I(x^2) makes R treat x^2 literally as a term; the I() function protects the expression from being interpreted as formula syntax.

> plot(x,y)
> lines(x,fitted(lm.1),lty=1)
> lines(x,fitted(lm.2),lty=2)
> lines(x,fitted(lm.3),lty=3)
> legend(700,400,c("linear","quadratic","cubic"),lty=1:3)

7-
7

R2
> summary(lm.1)$r.squared
[1] 0.9274
> summary(lm.2)$r.squared
[1] 0.9902
> summary(lm.3)$r.squared
[1] 0.9994
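Besides comparing the R² values, the nested polynomial fits can be compared with an F test through anova() (a sketch):
> anova(lm.1, lm.2, lm.3)   # does each added term significantly reduce the residual SS?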

171
R

Analysis of Variance ANOVA

8.
1

()


= A + B
AB

2
k k
172
R
K

Each group is assumed normal with a common variance: Xi ~ N(μi, σ²), i = 1, 2, ..., k.

Suppose there are k groups X1, X2, ..., Xk, the j-th group containing nj observations Xij (i = 1, ..., nj). Writing μj for the mean of group j, the model is

    Xij = μj + εij,    εij i.i.d. ~ N(0, σ²),

and the hypothesis of interest is

    H0: μ1 = μ2 = ... = μk    against    H1: the μj are not all equal.
8-1
3 XY Z X
40 Y Z 5060
9 X
Y Z

XY Z

173
R

> X = rnorm(5,40,3)     # X ~ N(40, 3^2)
> Y = rnorm(5,50,3)     # Y ~ N(50, 3^2)
> Z = rnorm(5,60,3)     # Z ~ N(60, 3^2)
> XYZ = c(X,Y,Z)
> Mean = c(mean(X),mean(Y),mean(Z))
> boxplot(X,Y,Z,XYZ,Mean,horizontal=T,names=c("X","Y","Z","XYZ","Mean"))

8-
1 3

The within-group (error) sum of squares is

    SSE = Σ_{j=1}^{k} Σ_{i=1}^{nj} (Xij - X̄.j)²,    where X̄.j = (1/nj) Σ_i Xij is the mean of group j.

The total sum of squares, measuring the spread of all observations around the grand mean X̄, is

    SST = Σ_{j=1}^{k} Σ_{i=1}^{nj} (Xij - X̄)²,

and the between-group sum of squares is

    SSA = Σ_{j=1}^{k} nj (X̄.j - X̄)².

These satisfy the decomposition SST = SSA + SSE. Under H0 the ratio of the mean squares,

    F = ( SSA/(k-1) ) / ( SSE/(n-k) ),

follows an F distribution with k-1 and n-k degrees of freedom, where n = Σ nj is the total number of observations and k the number of groups.


175
R

8
.2


H0: μ1 = μ2 = ... = μk    H1: the μ1, μ2, ..., μk are not all equal.

(1) State the hypotheses.
(2) Compute the sums of squares and degrees of freedom:
    SSE with fE = n-k,  SSA with fA = k-1,  SST with fT = n-1,
    where SST = SSA + SSE and fT = fA + fE, i.e. n-1 = (k-1) + (n-k).
(3) Compute the mean squares:
    MSA = SSA/fA = SSA/(k-1),    MSE = SSE/fE = SSE/(n-k).
(4) Compute the F statistic:
    F = MSA/MSE ~ F(k-1, n-k).

Table 8.1  One-way ANOVA table

Source             SS    df   MS    F    P
Between groups A   SSA   fA   MSA   FA   PA
Within groups  E   SSE   fE   MSE
Total          T   SST   fT

(5) Decision from F and its P value:
    if PA > α, do not reject H0 (factor A has no significant effect);
    if PA ≤ α, reject H0 (factor A has a significant effect).
R
oneway.test
anova
8.1 dataset PlantGrowth
4.11
> data(PlantGrowth)
ctrltrt1
1 trt2 2 10

4.11
R one-way analysis
of variance oneway.test
weight group
> oneway.test(weight ~ group, data=PlantGrowth, var.equal=T)
One-way analysis of means
data: weight and group
F = 4.846, num df = 2, denom df = 27, p-value = 0.01591
177
R
p 0.0159

anova aov
anova lm

> anova(lm(weight ~ group, data=PlantGrowth))
Analysis of Variance Table
Response: weight
          Df Sum Sq Mean Sq F value Pr(>F)
group      2   3.77    1.88    4.85  0.016 *
Residuals 27  10.49    0.39
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
group k-1
k=3 Mean Sq. Sum sq
F p
oneway.test
aov anovalm
aov summary

> summary(aov(weight ~ group, data=PlantGrowth))
          Df Sum Sq Mean Sq F value Pr(>F)
group      2   3.77    1.88    4.85  0.016 *
Residuals 27  10.49    0.39
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
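After a significant one-way ANOVA, pairwise comparisons of the group means can be obtained with TukeyHSD() applied to the aov fit (a sketch; the object name plant.aov is ours):
> plant.aov = aov(weight ~ group, data=PlantGrowth)
> TukeyHSD(plant.aov)          # simultaneous 95% intervals for all pairwise differences
> plot(TukeyHSD(plant.aov))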
178
R

8
.3

8.3.1

Suppose factor A has k levels and factor B has h levels, with one observation Xij for each combination (Ai, Bj). The hypotheses are

    for factor A:  H0A: μ1· = μ2· = ... = μk·    H1A: the row means μi· are not all equal
    for factor B:  H0B: μ·1 = μ·2 = ... = μ·h    H1B: the column means μ·j are not all equal

(1) Compute the level means and the grand mean:

    X̄i· = (1/h) Σ_{j=1}^{h} Xij  (i = 1, ..., k),    X̄·j = (1/k) Σ_{i=1}^{k} Xij  (j = 1, ..., h),
    X̄  = (1/(kh)) Σ_{i=1}^{k} Σ_{j=1}^{h} Xij.

(2) Compute the sums of squares and degrees of freedom:

    SST = Σ_i Σ_j (Xij - X̄)²,                  fT = kh - 1
    SSA = h Σ_i (X̄i· - X̄)²,                    fA = k - 1
    SSB = k Σ_j (X̄·j - X̄)²,                    fB = h - 1
    SSE = Σ_i Σ_j (Xij - X̄i· - X̄·j + X̄)²,      fE = (k-1)(h-1)

    with SST = SSA + SSB + SSE and fT = fA + fB + fE.

(3) Compute the mean squares:

    MSA = SSA/fA = SSA/(k-1),   MSB = SSB/fB = SSB/(h-1),   MSE = SSE/fE = SSE/((k-1)(h-1)).

(4) Compute the F statistics:

    FA = (SSA/fA)/(SSE/fE) = MSA/MSE,    FB = (SSB/fB)/(SSE/fE) = MSB/MSE.
Table 8.2  Two-way ANOVA table (no interaction)

Source    SS    df   MS    F    P
A         SSA   fA   MSA   FA   PA
B         SSB   fB   MSB   FB   PB
Error E   SSE   fE   MSE
Total T   SST   fT

(5) Decisions from the F statistics and their P values:
    if PA > α, do not reject H0A (factor A has no significant effect);
    if PA ≤ α, reject H0A (factor A has a significant effect);
    if PB > α, do not reject H0B (factor B has no significant effect);
    if PB ≤ α, reject H0B (factor B has a significant effect).
8.2 4 3
180
R
range
A B
8.
3 A B
AB A1 A2 A3 A4
B1 582 491 601 758
B2 562 541 709 582
B3 653 516 392 487
> range=c(582,562,653,491,541,516,601,709,392,758,582,487)
> A=c(1,1,1,2,2,2,3,3,3,4,4,4)
> B=c(1,2,3,1,2,3,1,2,3,1,2,3)
x yz
yz x
pchplot character
> plot(range~A,pch=B)                 # plotting character pch distinguishes the B levels
> legend(1.5,750,legend=1:3,pch=B)    # legend placed at (1.5, 750)

8-
2
181
R

> par(mfrow=c(1,2))
> boxplot(range~A,xlab="A"); boxplot(range~B,xlab="B")
> par(mfrow=c(1,1))

8-
3


> A = factor(A); B = factor(B)        # treat A and B as factors
> range.aov <- aov(range ~ A + B)     # two-way ANOVA without interaction
> range.aov
Call: aov(formula = range ~ A + B)
Terms:
                    A     B Residuals
Sum of Squares  15759 22385     73198
Deg. of Freedom     3     2         6
Residual standard error: 110.5
Estimated effects are balanced
> summary(range.aov)
            Df Sum Sq Mean Sq F value Pr(>F)
A 3 15759 5253 0.43 0.74
182
R
B 2 22385 11192 0.92 0.45
Residuals 6 73198 12200

4 3

8 .3.2
ANOVA
ANOVA

ANOVA

AB
Ai
Bj

AiBj m
Xijli=1,2,,kj=1,2,,hl=1,2,,m
A
H0A A
H1A A
B
H0B B
H1B B
AB
H0AB AB
H1AB AB

(1) Compute the level means, cell means and grand mean:

    X̄i·· = (1/(hm)) Σ_{j=1}^{h} Σ_{l=1}^{m} Xijl        (mean at level Ai)
    X̄·j· = (1/(km)) Σ_{i=1}^{k} Σ_{l=1}^{m} Xijl        (mean at level Bj)
    X̄ij· = (1/m) Σ_{l=1}^{m} Xijl                       (mean of cell (Ai, Bj))
    X̄    = (1/(khm)) Σ_i Σ_j Σ_l Xijl                   (grand mean)

(2) Compute the sums of squares and degrees of freedom:

    SSA  = hm Σ_i (X̄i·· - X̄)²,                          fA  = k - 1
    SSB  = km Σ_j (X̄·j· - X̄)²,                          fB  = h - 1
    SSAB = m Σ_i Σ_j (X̄ij· - X̄i·· - X̄·j· + X̄)²,         fAB = (k-1)(h-1)
    SSE  = Σ_i Σ_j Σ_l (Xijl - X̄ij·)²,                  fE  = kh(m-1)
    SST  = Σ_i Σ_j Σ_l (Xijl - X̄)²,                     fT  = khm - 1

    with SST = SSA + SSB + SSAB + SSE and fT = fA + fB + fAB + fE.

(3) Compute the mean squares:

    MSA = SSA/fA,   MSB = SSB/fB,   MSAB = SSAB/fAB,   MSE = SSE/fE.

(4) Compute the F statistics:

    FA = MSA/MSE,   FB = MSB/MSE,   FAB = MSAB/MSE.
Table 8.4  Two-way ANOVA table with interaction

Source     SS     df    MS     F     P
A          SSA    fA    MSA    FA    PA
B          SSB    fB    MSB    FB    PB
A x B      SSAB   fAB   MSAB   FAB   PAB
Error E    SSE    fE    MSE
Total T    SST    fT

(5) Decisions from the F statistics and their P values:
    if PA > α, factor A has no significant effect; if PA ≤ α, reject H0A (A is significant);
    if PB > α, factor B has no significant effect; if PB ≤ α, reject H0B (B is significant);
    if PAB > α, the A x B interaction is not significant; if PAB ≤ α, reject H0AB (the interaction is significant).

8 .3
AB

A: A15kg/h A215kg/h A325kg/h


B: B15% B210% B315% B420%
8.
5 /

B1 B2 B3 B4
A1 60.7 61.5 61.6 61.7
185
R
61.1 61.3 62.0 61.1
61.5 61.7 62.2 62.1
A2
60.8 61.2 62.8 61.7
60.6 60.6 61.4 60.7
A3
60.3 61.0 61.5 60.9
R
> Y=c(60.7,61.1,61.5,61.3,61.6,62.0,61.7,61.1,61.5,60.8,61.7,61.2,
      62.2,62.8,62.1,61.7,60.6,60.3,60.6,61.0,61.4,61.5,60.7,60.9)
> A=c(1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3)
> B=c(1,1,2,2,3,3,4,4,1,1,2,2,3,3,4,4,1,1,2,2,3,3,4,4)
> A=factor(A); B=factor(B)
> rate.aov <- aov(Y ~ A + B + A*B)     # or equivalently: aov(Y ~ A + B + A:B)
> rate.aov
Call: aov(formula = Y ~ A + B + A * B)
Terms:
A B A:B Residuals
Sum of Squares 3.083 3.630 0.300 1.140
Deg. of Freedom 2 3 6 12
Residual standard error: 0.3082
Estimated effects are balanced
> summary(rate.aov)
            Df Sum Sq Mean Sq F value  Pr(>F)
A 2 3.08 1.54 16.23 0.00039 ***
B 3 3.63 1.21 12.74 0.00049 ***
A:B 6 0.30 0.05 0.53 0.77829
Residuals 12 1.14 0.09
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
186
R

Since PA < 0.05 and PB < 0.05 while PAB > 0.05, factors A and B both have significant effects but the A x B interaction does not.
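The absence of an A x B interaction can also be inspected graphically with interaction.plot(), which draws the mean of Y for each combination of levels (a sketch using the factors defined above):
> interaction.plot(A, B, Y)    # roughly parallel lines suggest no interaction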

9
.1

9.1.1


t-F-2

187
R

9 .1.2

rank

9.1 5.2
> x = c(21240,4632,22836,5484,5052,5064,6972,7596,14760,15012,
        18720,9480,4728,67200,52788)
> ri = rank(x)
> ri
[1] 12  1 13  5  3  4  6  7  9 10 11  8  2 15 14
ri x


188
R

9
.2

9.2.1 K-
S
Kolmogorov-Smirnov K-S

Q-Q
K-S
R
Poisson R

D = nmax(|S(Xi-1)-F0(Xi)|,|S(Xi)-F0(Xi)|

> ks.test(x,"pnorm")
One-sample Kolmogorov-Smirnov test
data: x
D = 1, p-value = 2.220e-16
alternative hypothesis: two.sided

P- 0.05 0.05

189
R
9.2. 2

t wilcoxon
R wilcox.test
9.2
5080


> x = c(21240,4632,22836,5484,5052,5064,6972,7596,14760,15012,
        18720,9480,4728,67200,52788)
> stem(x)
The decimal point is 4 digits to the right of the |
0 | 55555789559
2 | 13
4|3
6|7
t
H0 5080
5080
> wilcox.test(x,mu=5080)
Wilcoxon signed rank test
data: x
V = 109, p-value = 0.003357
alternative hypothesis: true mu is not equal to 5080
p
wilcox.test
R
median.test p
190
R
median.test <- function(x, median = NA){
    x <- as.vector(x)
    n <- length(x)
    bigger <- sum(x > median)
    equal <- sum(x == median)
    count <- bigger + equal/2
    count <- min(count, n - count)
    p <- 2 * pbinom(count, n, 0.5)
    c(Positive=bigger, Negative=count, P=p)
}

> median.test(x,median=5080)
Positive Negative P
11 4 0.1185
Positive 11 Negative 4
p 0.1185

9
.3

9.3.1 K -S

Kolmogorov-Smirnov

Let the two samples have sizes n1 and n2 with empirical distribution functions S1(X) and S2(X). With Dj = S1(Xj) - S2(Xj), the test statistic is

    Z = √( n1·n2/(n1 + n2) ) · max_j |Dj|
6.2
> x1=c(48,47,44,45,46,47,43,47,42,48)
> x2=c(36,45,47,38,39,42,36,42,46,35)
> boxplot(x1,x2,horizontal=T,names=c("x1","x2"))

9-
1 x1 x2
> ks.test(x1,x2)
Two-sample Kolmogorov-Smirnov test
data: x1 and x2
D = 0.6, p-value = 0.05465
alternative hypothesis: two.sided
Warning message:
cannot compute correct p-values with ties in: ks.test(x1, x2)
0.05
192
R

9.3 .2
wilcoxon wilcox.test

m n
m+n

WX WY
Wilcoxon
p-
WX WY

> wilcox.test(x1,x2)
Wilcoxon rank sum test with continuity correction
data: x1 and x2
W = 84.5, p-value = 0.009661
alternative hypothesis: true mu is not equal to 0
Warning message:
cannot compute exact p-value with ties in: wilcox.test.default(x1, x2)
wilcox.test

9
.3
193
R

Kruskal-Wallis

Wilcoxonkk
Ri
i=1,,kRi

i
FixxFx+i
i F1,,Fk
H0F1==FkHaFix
=Fx+ii=1,,kFi
Kruskal-Wallis
The Kruskal-Wallis statistic is

    H = ( 12/(N(N+1)) ) Σ_{i=1}^{k} ni (R̄i - R̄)²  =  ( 12/(N(N+1)) ) Σ_{i=1}^{k} Ri²/ni - 3(N+1),

where ni is the size of group i, N = Σ ni is the total sample size, Ri is the sum of the ranks in group i, R̄i = Ri/ni its mean rank, and

    R̄ = ( Σ_{i=1}^{k} Ri ) / N = (N+1)/2

is the overall mean rank. Large values of H, i.e. group mean ranks R̄i far from R̄, argue against H0; for moderate sample sizes H is approximately χ² with k-1 degrees of freedom, which is how the Kruskal-Wallis test computes its p value.

Kruskal-Wallis
Wilcoxen
t
194
R


In R the Kruskal-Wallis test is carried out by kruskal.test().
Example 9.3  Apply the Kruskal-Wallis test to the PlantGrowth data of Example 4.11:
> kruskal.test(weight ~ group, data=PlantGrowth)
Kruskal-Wallis rank sum test
data: weight by group
Kruskal-Wallis chi-squared = 7.988, df = 2, p-value = 0.01842
kruskal.test
dtatframe p
p
9.4
300
6

50
6

300 6 27
AB C3 5 5

14 3 4 5 2 3 4 5
24 4 5 5 4 5 4 4
33 4 2 4 5 5 4 4
R
195
R
> scores = c(4,3,4,5,2,3,4,5,4,4,5,5,4,5,4,4,3,4,2,4,5,5,4,4)
> person = c(1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3)
> boxplot(scores~person)

9-
2
2
13 Kruskal-Wallis

> kruskal.test(scores ~ person)


Kruskal-Wallis rank sum test
data: scores by person
Kruskal-Wallis chi-squared = 1.939, df = 2, p-value = 0.3793
p 0.3793
> oneway.test(scores ~ person)
One-way analysis of means not assuming equal variances
data: scores and person
F = 1.554, num df = 2.00, denom df = 12.53, p-value = 0.2495
p

196
R

9
.4

Friedman Friedman

Suppose k treatments are compared in b blocks and the observations are ranked within each block. Let Ri be the sum of the ranks of treatment i over the b blocks and R̄i = Ri/b its mean rank. If H0 (no treatment differences) holds, each R̄i should be close to the overall mean rank (k+1)/2, since R1 + R2 + ... + Rk = b(1 + 2 + ... + k) = bk(k+1)/2. The Friedman statistic measures how far the mean ranks are from (k+1)/2:

    Friedman = ( 12b / (k(k+1)) ) Σ_{i=1}^{k} ( R̄i - (k+1)/2 )²

z
Friedman

Friedman p Friedman
b k
k!b
1/k!b
k!b
197
R
Friedman b k
b k

k
b k b+Friedman
X2k


Rij, mi
A = R ij
2

(b 1)( B bk (k + 1) 2 / 4
B= 1
b mi2 F F=
A B

v1=k-1v2=b-1
k-1
Kruskal-Wallis Friedman

9.5 5
9-1
9-1 3 5

20.3 21.2 18.2 18.6 18.5
25.6 24.7 19.3 19.3 20.7
24.0 23.1 20.6 19.8 21.4

Friedman
> X=matrix(c(20.3,21.2,18.2,18.6,18.5,25.6,24.7,19.3,19.3,20.7,
             24.0,23.1,20.6,19.8,21.4),5)
> friedman.test(X)
Friedman rank sum test
data: X
Friedman chi-squared = 7.6, df = 2, p-value = 0.02237
Friedman 5

199
R

10 R

1
0.1

10.1 120 63
sex
edu
income 10.210.2 R 120

1. income

> dat=read.table("income.txt")
> dat
sex edu income
1 3299
2 2378

200
R
119 3168
120 4019
> attach(dat)
2.
> table(sex)
sex

56 64
> table(edu)
edu

31 34 55
> par(mfrow=c(1,2)); barplot(table(sex)); barplot(table(edu)); par(mfrow=c(1,1))

10
-1

> table(sex,edu)
edu
sex
18 19 19
201
R
13 15 36
> barplot(table(sex,edu),legend=c("",""))
Figure 10-2 Bar plot of the sex-by-education table
3. The distribution of income
> freq = hist(income)   # histogram; the return value records the binning
Figure 10-3 Histogram of income
> freq$breaks    # class boundaries
[1]  500 1000 1500 2000 2500 3000 3500 4000 4500 5000 5500 6000 6500
> freq$mids      # class midpoints
[1]  750 1250 1750 2250 2750 3250 3750 4250 4750 5250 5750 6250
> freq$counts    # class frequencies
[1]  1  7 10 27 21 19 17 10  6  1  0  1
> cbind(m=freq$mids, f=freq$counts)   # frequency table of midpoints and counts
m f
[1,] 750 1
[2,] 1250 7
[3,] 1750 10
[4,] 2250 27
[5,] 2750 21
[6,] 3250 19
[7,] 3750 17
[8,] 4250 10
[9,] 4750 6
[10,] 5250 1
[11,] 5750 0
[12,] 6250 1
> stem(income)   # stem-and-leaf plot
The decimal point is 3 digits to the right of the |
0 |
1 | 01112334
1 | 556666679
2 | 0000111112222223333334444
2 | 5555666778888888889999
3 | 0000222222233344444
3 | 55666667788999999
203
R
4 | 000000111123
4 | 567889
5 | 4
5 |
6 | 2
> qqnorm(income); qqline(income)   # normal Q-Q plot
Figure 10-4 Normal Q-Q plot of income
> summary(income)   # numerical summary
Min. 1st Qu. Median Mean 3rd Qu. Max.
976 2210 2850 2940 3640 6230
4. Compare income between the sexes
> by(income, sex, mean)   # group means of income by sex
INDICES:
[1] 3013
---------------------------------------------------
INDICES:
[1] 2875
> by(income, sex, sd)     # group standard deviations
INDICES:
[1] 1156
---------------------------------------------------
INDICES:
[1] 855
> by(income, sex, summary)   # group summaries (five-number summary and mean)

INDICES:
Min. 1st Qu. Median Mean 3rd Qu. Max.
976 2210 2940 3010 3840 6230
---------------------------------------------------
INDICES:
Min. 1st Qu. Median Mean 3rd Qu. Max.
1290 2200 2840 2880 3430 4800
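The same group statistics can be obtained more compactly with tapply(), which returns them as a named vector (an equivalent alternative to the by() calls above):
> tapply(income, sex, mean)   # group means
> tapply(income, sex, sd)     # group standard deviations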
> boxplot(income~sex, notch=T)   # notched boxplots of income by sex
Figure 10-5 Notched boxplots of income by sex
> by(income, list(sex, edu), summary)   # summaries for each sex-by-education group
: :
Min. 1st Qu. Median Mean 3rd Qu. Max.
976 1610 2280 2170 2720 3320
---------------------------------------------------
: :
Min. 1st Qu. Median Mean 3rd Qu. Max.
1620 2040 2160 2330 2380 3420
---------------------------------------------------
: :
Min. 1st Qu. Median Mean 3rd Qu. Max.
1110 3410 3810 3600 4220 4880
---------------------------------------------------
: :
Min. 1st Qu. Median Mean 3rd Qu. Max.
1510 2190 2650 2800 3520 4080
---------------------------------------------------
: :
Min. 1st Qu. Median Mean 3rd Qu. Max.
1200 2350 3210 3230 3760 6230
---------------------------------------------------
: :
Min. 1st Qu. Median Mean 3rd Qu. Max.
1290 2690 3110 3100 3900 4800
> boxplot(income~sex+edu, notch=T)   # boxplots by sex and education
Figure 10-6 Boxplots of income by sex and education
5. Chi-squared test of independence between sex and education

The two-way table is tested with chisq.test():
> result = table(sex, edu)
> result
     edu
sex
      18 19 19
      13 15 36
> chisq.test(result)
Pearson's Chi-squared test
data: result
X-squared = 6.03, df = 2, p-value = 0.04917
Since p = 0.04917 < 0.05, sex and education level are not independent: the distribution of education differs between the sexes.
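The expected counts under independence can be read from the returned test object, which is useful for judging whether the chi-squared approximation is adequate (an extra check, not part of the original analysis):
> test = chisq.test(result)
> test$expected    # expected cell counts under independence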
6. Two-sample t test of income by sex

Before the t test, test whether the two variances can be treated as equal:
> var.test(income[sex==""], income[sex==""])
F test to compare two variances
data: income[sex == ""] and income[sex == ""]
F = 1.83, num df = 55, denom df = 63, p-value = 0.02101
alternative hypothesis: true ratio of variances is not equal to 1
95 percent confidence interval:
1.10 3.08
sample estimates:
ratio of variances
1.83
Since p = 0.02101 < 0.05, the two variances cannot be regarded as equal, so we use the Welch two-sample t test, which does not assume equal variances:
> t.test(income[sex==""], income[sex==""])
Welch Two Sample t-test
data: income[sex == ""] and income[sex == ""]
t = 0.736, df = 100, p-value = 0.4637
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
-235 511
sample estimates:
mean of x mean of y
3013 2875
Since p = 0.4637 > 0.05, the mean incomes of the two sexes do not differ significantly.
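Because the income distribution is skewed, a rank-based alternative to the t test is the Wilcoxon rank-sum test of Chapter 9 (added here as a supplementary check):
> wilcox.test(income ~ sex)   # nonparametric comparison of the two income distributions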
7. Comparison of income across education levels

As for the t test, first check the homogeneity of variances:
> bartlett.test(income~edu)   # Bartlett test of homogeneity of variances
Bartlett test of homogeneity of variances
data: income by edu
Bartlett's K-squared = 8.35, df = 2, p-value = 0.01537
Since p = 0.01537 < 0.05, the variances are not homogeneous, so the comparison of means uses oneway.test(), which does not assume equal variances:
> oneway.test(income ~ edu)
One-way analysis of means not assuming equal variances
data: income and edu
F = 18.6, num df = 2.0, denom df = 71.4, p-value = 3.048e-07
Since p = 3.048e-07 < 0.01, mean income differs highly significantly across the education levels. Group summaries show the pattern:
> by(income, edu, summary)
INDICES:
Min. 1st Qu. Median Mean 3rd Qu. Max.
976 1950 2210 2240 2680 3420
---------------------------------------------------
INDICES:
Min. 1st Qu. Median Mean 3rd Qu. Max.
1110 2440 3530 3250 3990 4880
---------------------------------------------------
INDICES:
Min. 1st Qu. Median Mean 3rd Qu. Max.
1200 2500 3160 3150 3900 6230
Figure 10-7 Income by education level
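A rank-based alternative that requires neither normality nor equal variances is the Kruskal-Wallis test of Chapter 9 (a supplementary check, not part of the original analysis):
> kruskal.test(income ~ edu)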
8. Clean up
> detach(dat)   # detach the data frame
> rm(dat)       # remove dat from the workspace
10.2 Analysis of the mtcars data

Example 10.2  The mtcars data were extracted from the 1974 Motor Trend US magazine and comprise fuel consumption and 10 other aspects of automobile design and performance for 32 automobiles (1973-74 models). The 11 variables are:
 1  mpg   Miles/(US) gallon
 2  cyl   Number of cylinders
 3  disp  Displacement (cu.in.)
 4  hp    Gross horsepower
 5  drat  Rear axle ratio
 6  wt    Weight (lb/1000)
 7  qsec  1/4 mile time
 8  vs    V/S
 9  am    Transmission (0 = automatic, 1 = manual)
10  gear  Number of forward gears
11  carb  Number of carburetors
1. The data

mtcars is one of the data sets built into R, so it can be used directly; its documentation is shown by ?mtcars.
> ?mtcars
> mtcars
                 mpg cyl  disp  hp drat    wt  qsec vs am gear carb
Mazda RX4       21.0   6 160.0 110 3.90 2.620 16.46  0  1    4    4
Mazda RX4 Wag   21.0   6 160.0 110 3.90 2.875 17.02  0  1    4    4
...
Volvo 142E      21.4   4 121.0 109 4.11 2.780 18.60  1  1    4    2
2. Accessing a single variable

The mpg column can be extracted with the $ operator as mtcars$mpg, with mtcars[['mpg']], or by position as mtcars[,1]. Attaching the data frame makes the column names available directly:
> mpg                # before attaching, mpg is not visible
Error: object "mpg" not found
> attach(mtcars)     # attach mtcars to the search path
> mpg                # now the column can be used by name
 [1] 21.0 21.0 22.8 21.4 18.7 18.1 14.3 24.4 22.8 19.2 17.8 16.4 17.3 15.2 10.4
[16] 10.4 14.7 32.4 30.4 33.9 21.5 15.5 15.2 13.3 19.2 27.3 26.0 30.4 15.8 19.7
[31] 15.0 21.4
3. A categorical variable: cyl

Use table() to count the cars with each number of cylinders and barplot() to display the counts:
> table(cyl)
cyl
 4  6  8
11  7 14
> barplot(cyl)          # Figure 10-8: bars of the raw values (usually not what is wanted)
> barplot(table(cyl))   # Figure 10-8: bars of the frequencies
Figure 10-8 Bar plots of cyl
The frequency table shows that 8-cylinder cars are the most common among these 1974 models; more background on the data is given by help(mtcars) or ?mtcars.
4. A numeric variable: mpg

Examine the distribution of mpg with a stem-and-leaf plot, a histogram and a boxplot:
> stem(mpg)
The decimal point is at the |
10 | 44
12 | 3
14 | 3702258
16 | 438
18 | 17227
20 | 00445
22 | 88
24 | 4
26 | 03
28 |
30 | 44
32 | 49
> hist(mpg)
> boxplot(mpg)
Figure 10-9 Histogram and boxplot of mpg
The largest value, 33.9, stands out at the upper end of the distribution; a finer stem-and-leaf plot can be obtained with stem(mpg, scale=3).
> mean(mpg)
[1] 20.09062
> mean(mpg, trim=.1)   # trimmed mean, dropping 10% from each end
[1] 19.69615
> summary(mpg)
Min. 1st Qu. Median Mean 3rd Qu. Max.
10.40 15.43 19.20 20.09 22.80 33.90
The median is 19.20. Besides the standard deviation, the interquartile range IQR() and the median absolute deviation mad() give robust measures of spread:
> sd(mpg)
[1] 6.026948
> IQR(mpg)
[1] 7.375
> mad(mpg)
[1] 5.41149
5. Conditioning plots

coplot() displays the relation between mpg and disp separately for each value of cyl:
> coplot(mpg ~ disp | as.factor(cyl), data=mtcars, panel=panel.smooth, rows=1)
Figure 10-10 Conditioning plot of mpg against disp given cyl
6. Relations between variables

Start by looking at mpg within a single group, say the 4-cylinder cars:
> mpg[cyl == 4]
[1] 22.8 24.4 22.8 32.4 30.4 33.9 21.5 27.3 26.0 30.4 21.4
> mean(mpg[cyl == 4])
[1] 26.66364
Note that the comparison operator is ==, not the assignment =, and that the two equal signs must not be separated by a space.
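The same subset can be selected without relying on attach(), for example with the subset() function or by indexing the data frame directly (equivalent alternatives, shown only for illustration):
> subset(mtcars, cyl == 4, select = mpg)   # the mpg values of the 4-cylinder cars
> mean(mtcars$mpg[mtcars$cyl == 4])        # the same mean as above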
> plot(cyl, mpg)
Figure 10-11 Scatterplot of mpg against cyl

Figure 10-11 shows that mpg decreases as the number of cylinders increases. Fit a linear regression of mpg on cyl:
> lm(mpg~cyl)
Call: lm(formula = mpg ~ cyl)
Coefficients:
(Intercept)          cyl
      37.88        -2.88
> plot(cyl, mpg)
> abline(lm(mpg~cyl))
Figure 10-12 Regression line of mpg on cyl
The slope is -2.88: on average each additional cylinder lowers mpg by 2.88, so moving from 4 to 6 cylinders (or from 6 to 8) reduces mpg by about 5.752. Since cyl takes only the values 4, 6 and 8, the group means computed with tapply() describe the relation just as directly:
> tapply(mpg, cyl, mean)
4 6 8
26.66 19.74 15.10
Horsepower is a continuous predictor; regress mpg on hp in the same way:
> lm(mpg~hp)
Call: lm(formula = mpg ~ hp)
Coefficients:
(Intercept)           hp
    30.0989      -0.0682
> plot(hp, mpg)
> abline(lm(mpg~hp))
Figure 10-13 Regression of mpg on hp
The strength of these linear relationships is measured by the correlation coefficient, computed with cor():
> cor(hp, mpg)
[1] -0.7761
> cor(cyl, mpg)
[1] -0.8521
Squaring the correlation coefficient R gives R², the proportion of variance explained by the regression:
> cor(hp, mpg)^2
[1] 0.6024
> cor(cyl, mpg)^2
[1] 0.7262
Thus about 72% of the variation in mpg is accounted for by the number of cylinders (and about 60% by horsepower).
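The same R² values can be read directly from the fitted models (shown as a cross-check):
> summary(lm(mpg ~ hp))$r.squared    # about 0.60
> summary(lm(mpg ~ cyl))$r.squared   # about 0.73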
The plotting symbol is chosen with the pch argument, so the cylinder groups can be distinguished in a single scatterplot:
> plot(hp, mpg, pch=cyl)   # use the number of cylinders as the plotting symbol
> legend(250, 30, pch=c(4,6,8),
+        legend=c("4 cylinders","6 cylinders","8 cylinders"))
Figure 10-14 mpg against hp, plotting symbol showing the number of cylinders
7. Residual diagnostics

The residuals of a fitted lm model are extracted with resid():
> fm = lm(mpg~hp)
> resids = resid(fm)   # residuals of the fit
> plot(resids)         # residuals in data order
Figure 10-15 Residuals of the regression of mpg on hp
> hist(resids)         # histogram of the residuals
Figure 10-16 Histogram of the residuals
> qqnorm(resids); qqline(resids)   # normal Q-Q plot of the residuals
Figure 10-17 Normal Q-Q plot of the residuals
Figure 10-17 shows how closely the residuals follow a normal distribution; marked curvature would cast doubt on the normality assumption behind the regression inferences.
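R can also produce its standard set of regression diagnostic plots for the fitted model in one call (an additional option, not used in the original text):
> par(mfrow=c(2,2)); plot(fm); par(mfrow=c(1,1))   # residuals vs fitted, normal Q-Q, scale-location, leverage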
8. Clean up

When the analysis is finished, detach the data set so that its column names no longer mask other objects:
> detach()
10.3 Simulating the distribution of the t statistic
For a sample X1, X2, ..., Xn with mean X̄ and standard deviation s, the t statistic is

$$t=\frac{\bar X-\mu}{s/\sqrt{n}}.$$

When the Xi are normally distributed, t follows a t distribution with n - 1 degrees of freedom. Simulation lets us see how the statistic behaves when the Xi are not normal.
1. A function for the t statistic

The statistic is programmed directly from the formula:
> t.stat = function(x, mu) (mean(x)-mu) / sqrt(var(x)/length(x))
First try it on a normal sample:
> mu = 0; x = rnorm(100, mu, 1)
> t.stat(x, mu)
[1] -1.552077
For a normal sample of size 100 the value is a single draw from the t distribution with 99 degrees of freedom. Now take the Xi from a markedly non-normal population, the exponential distribution:
> mu = 10; x = rexp(100, 1/mu)
> t.stat(x, mu)
[1] 1.737937
A single value says little; repeat the experiment 200 times and store the statistics in a vector:
> results = c()                                        # empty vector to collect the values
> for (i in 1:200) results[i] = t.stat(rexp(100, 1/mu), mu)
> hist(results)                                        # histogram of the 200 simulated statistics
Figure 10-18 Histogram of the simulated t statistics (exponential data, n = 100)
> boxplot(results)   # boxplot
Figure 10-19 Boxplot of the simulated t statistics
> qqnorm(results)    # normal Q-Q plot
Figure 10-20 Normal Q-Q plot of the simulated t statistics
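The simulated values can also be compared directly with the reference t distribution, in the same way as is done below for the small-sample case (an extra check, not in the original text):
> qqplot(results, rt(200, 99)); abline(0, 1)   # compare with t(99); points near the line indicate agreement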
With n = 100 the sample size is large enough that, even for exponential data, the statistic is close to a t distribution with n - 1 degrees of freedom. Repeat the simulation with a much smaller sample size, n = 8:
> for (i in 1:200) results[i] = t.stat(rexp(8, 1/mu), mu)
> hist(results)      # histogram
Figure 10-21 Histogram of the simulated statistics, n = 8
> boxplot(results)   # boxplot
Figure 10-22 Boxplot of the simulated statistics, n = 8
> qqnorm(results)    # normal Q-Q plot
Figure 10-23 Normal Q-Q plot of the simulated statistics, n = 8
With n = 8 the histogram and Q-Q plot are clearly skewed: for such a small sample from a skewed population the statistic no longer follows a t distribution, so t-based inference would be unreliable. The behaviour can also be examined when the data themselves come from a t distribution: draw samples of size n = 8 from a t distribution with 5 degrees of freedom and compute the statistic for each sample (the helper make.t used below is assumed to compute the same statistic as t.stat):
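The definition of make.t does not appear in the text shown here; a minimal sketch consistent with the call make.t(rt(8,5), 0) below, assuming it is equivalent to t.stat, is:
> make.t = function(x, mu) (mean(x) - mu) / sqrt(var(x)/length(x))   # assumed equivalent to t.stat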
> for (i in 1:200) results[i] = make.t(rt(8,5), 0)
> hist(results)      # histogram
Figure 10-24 Histogram of the statistics for t(5) samples, n = 8
> boxplot(results)   # boxplot
Figure 10-25 Boxplot of the statistics for t(5) samples
> qqnorm(results)    # normal Q-Q plot
Figure 10-26 Normal Q-Q plot of the statistics for t(5) samples
> qqplot(results, rt(200,7))   # compare with a t distribution with 7 degrees of freedom
Figure 10-27 Q-Q plot of the simulated statistics against the t distribution with 7 degrees of freedom
The points lie close to a straight line: for samples from a t(5) population the statistic is still well described by a t distribution with n - 1 = 7 degrees of freedom, so the t procedures remain appropriate.
10.4 R packages

All R functions and data sets are organised into packages. The packages currently installed on the system are listed with library():
> library()
An installed package, for example boot (bootstrap functions; Davison & Hinkley, 1997), is loaded into the session with
> library(boot)
Packages that are not yet installed can be downloaded from CRAN with install.packages(); under Windows (and in the RAqua GUI on MacOS) the Packages menu offers the same functionality.
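For example, a contributed package can be installed from CRAN and then loaded (the package chosen here is only an illustration; an internet connection is required):
> install.packages("tseries")   # download and install the package from CRAN
> library(tseries)              # load the installed package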
The packages currently attached to the search path are shown with search():
> search()
[1] ".GlobalEnv" "package:methods" "package:stats"
[4] "package:graphics" "package:grDevices" "package:utils"
[7] "package:datasets" "Autoloads" "package:base"
These are the packages attached by default in R 2.3.0.
> help.start()
opens the HTML help system, whose Packages link (in the Reference section) lists the documentation of every installed package.
R packages fall into two groups: the standard packages that ship with every R installation, and the contributed packages written by R users around the world.
10.4.1 Standard packages

The standard packages are part of the R source distribution. They are available in every installation and contain the basic functions that allow R to work, together with standard data sets and methods.
Table 10-1 lists the standard packages of R.
base       The R Base Package
datasets   The R Datasets Package
grDevices  Graphics devices and colour/font support for base and grid graphics
graphics   The R Graphics Package (base graphics)
grid       The Grid Graphics Package
methods    Formal methods and classes for R objects
splines    Regression spline functions and classes
stats      The R Stats Package
stats4     Statistical functions using S4 classes
tcltk      Tcl/Tk interface (GUI elements)
tools      Tools for package development
utils      The R Utils Package
10.4.2 Contributed packages
Thousands of contributed packages extend R in every direction. The two main repositories are CRAN (http://CRAN.R-project.org/) and Bioconductor (http://www.bioconductor.org/); further sources are listed in the R FAQ.
Table 10-2 lists some frequently used contributed packages (together with a few of the standard packages) and their descriptions.
acepack ace and avas for selecting regression transformations
base The R Base Package
boot Bootstrap R (S-Plus) Functions (Canty)
car Companion to Applied Regression
chron Chronological objects which can handle dates and times
class Functions for Classification
cluster Cluster Analysis Extended Rousseeuw et al.
DAAG Data Analysis And Graphics
datasets The R Datasets Package
dynlm Dynamic Linear Regression
e1071 Misc Functions of the Department of Statistics (e1071), TU Wien
fBasics Rmetrics - Markets and Basic Statistics
fCalendar Rmetrics - Chronological Objects
foreign Read Data Stored by Minitab, S, SAS, SPSS, Stata, Systat, dBase, ...
graphics The R Graphics Package
grDevices The R Graphics Devices and Support for Colours and Fonts
grid The Grid Graphics Package
Hmisc Harrell Miscellaneous
its Irregular Time Series
KernSmooth Functions for kernel smoothing for Wand & Jones 1995
lattice Lattice Graphics
leaps regression subset selection
lmtest Testing Linear Regression Models
MASS Main Package of Venables and Ripley's MASS
methods Formal Methods and Classes
mgcv GAMs with GCV smoothness estimation and GAMMs by REML/PQL
mlbench Machine Learning Benchmark Problems
nlme Linear and nonlinear mixed effects models
nnet Feed-forward Neural Networks and Multinomial Log-Linear Models
oz Plot the Australian coastline and states
pls Partial Least Squares Regression (PLSR) and Principal Component Regression (PCR)
quadprog Functions to solve Quadratic Programming Problems.
randomForest Breiman and Cutler's random forests for classification and regression
RGtk2 R bindings for Gtk 2.0
rpart Recursive Partitioning
RWinEdt R-WinEdt
sandwich Robust Covariance Matrix Estimators
scatterplot3d 3D Scatter Plot
Simple functions and data to accompany simpleR
SparseM Sparse Linear Algebra
spatial Functions for Kriging and Point Pattern Analysis
splines Regression Spline Functions and Classes
stats The R Stats Package
stats4 Statistical Functions using S4 Classes
strucchange Testing for Structural Change
survival Survival analysis, including penalised likelihood.
tcltk Tcl/Tk Interface
tools Tools for Package Development
tseries Time series analysis and computational finance
utils The R Utils Package
xtable Export tables to LaTeX or HTML
zoo Z's ordered observations
10.4.3 Namespaces
Most packages, including the standard ones such as datasets, have a namespace. A namespace specifies which of the package's objects are exported for users and keeps the remaining objects internal, so that inter-dependent packages do not accidentally mask each other's functions. An exported object can always be referred to unambiguously with the double-colon operator ::; for example, base::t is the transpose function t() in the base package. The operator ::: additionally reaches objects that a package does not export, and getAnywhere() finds an object wherever it is defined, whether or not its package is attached.
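A small illustration of these operators (the objects used are only examples):
> base::t(matrix(1:6, 2))       # the exported transpose function t() from package base
> stats::median(c(1, 3, 5))     # median() from package stats
> getAnywhere("t.data.frame")   # locates the data-frame method of t(), which is not exported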
10.5 Enhancing the R working environment

The plain R console is rather spartan. Several add-on tools give R a friendlier working environment; two widely used ones under Windows are R Commander (Rcmdr), a menu-driven graphical interface, and R-WinEdt, which turns the WinEdt editor into an editor for R code.
10.5.1 R Commander
(1) Author: John Fox (jfox@mcmaster.ca)
(2) Version: Rcmdr 1.1.7
(3) Notes:
    (a) Rcmdr is an ordinary R package; it depends on several other packages, among them car.
    (b) Rcmdr and the packages it requires (such as car) must all be installed.
    (c) Rcmdr works best when RGui runs in SDI (single-document interface) mode, which is set in RGui under Edit => GUI preferences.
(4) Using R Commander:
    (a) install Rcmdr together with the packages it depends on (such as car);
    (b) in RGui choose Packages => Load packages and load Rcmdr, or type library(Rcmdr);
    (c) commands can be typed or edited in the Script Window and executed with the Submit button.
A minimal way to install and start it from the console is shown below.
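The following sketch assumes an internet connection; depending on the install.packages() settings, the packages Rcmdr requires (such as car) may need to be installed as well:
> install.packages("Rcmdr")   # install R Commander from CRAN
> library(Rcmdr)              # loading the package opens the R Commander window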
Figure 10-28 The R Commander (Rcmdr) window
(5) R Commander provides menu/dialog-box interfaces to most common statistical analyses, so they can be carried out without typing commands.
(6) Tips:
    (a) every analysis performed through the menus is written to the log/script window as an ordinary R command;
    (b) these generated commands can be edited in the script window and re-run with Submit, or copied back into the RGui console;
    (c) data sets are most easily brought in through Data => Import data, which is considerably more convenient than typing the corresponding commands in the R GUI.
10.5.2 R-WinEdt

(1) Author: Uwe Ligges (ligges@statistik.uni-dortmund.de)
(2) Version: RWinEdt 1.7.6
(3) Downloads:
    (a) WinEdt 5.x: http://www.winedt.com/
    (b) R-WinEdt: http://cran.r-project.org/contrib/extra/winedt/
(4) Installing R-WinEdt:
    (a) copy the R-WinEdt files into WinEdt's plugins\ directory (sub-directory R-WinEdt);
    (b) run the supplied install.exe so that WinEdt registers the plugin (it adjusts R.ini; do not simply start winedt.exe).
(5) Starting R-WinEdt from R:
> library(RWinEdt)
Figure 10-29 R-WinEdt

Figure 10-30 R-WinEdt
Pressing Alt+P in WinEdt sends the selected code to R and runs it; for example:
> x = c(1,3,5,7,9)
> mean(x)
[1] 5
> median(x)
[1] 5
> var(x)
[1] 10
> sd(x)
[1] 3.162278
(6) Main features of R-WinEdt include: delimiter/bracket checking and syntax highlighting for R code in WinEdt; sending single lines, pasted selections or whole scripts from WinEdt to the running RGui session; access to the .Rhistory file; and code templates for function, for (_ in _) { _ }, if and ifelse, together with a keyboard shortcut for the assignment arrow <-.
(7) The R-WinEdt commands, hot keys and toolbar icons are summarised in Table 10-3.
Table 10-3  R-WinEdt commands and hot keys

Command              Hot Key        Menu Icon
Brackets Check       Ctrl+F12       {}
R History            Alt+H          R HIST.
R-line - and go      Alt+L
R-line               Ctrl+Alt+L
R-paste - and go     Alt+P          R<-PASTE
R-paste              Ctrl+Alt+P
R-source - and go    Alt+S          R<-SOURCE
R-script             Ctrl+Alt+S     R SCRIPT
function             Ctrl+Alt+F
for                  Ctrl+Alt+O
if                   Ctrl+Alt+I
ifelse               Ctrl+Alt+E
<-                   Ctrl+-