This version of the tutorial 4 script shows how you can use RMarkdown to present your work. Most of it is self-explanatory, with the exception of generating tables. In this script I use both the xtable and the kable functions to make the printed tables look nicer in the HTML file. We also need to set the results='asis' chunk option so that the table prints correctly.

For more info on Markdown and the available commands, check out John Gruber’s homepage and google RMarkdown.
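As a minimal sketch of the table trick described above (using the built-in mtcars data rather than this tutorial's train data): inside a chunk whose header sets results='asis', you print the raw HTML so the renderer passes it through untouched.

```r
# Minimal sketch: rendering a data frame as an HTML table with kable.
# In RMarkdown the chunk header would be {r, results='asis'} so the
# raw HTML below is emitted as-is instead of being escaped.
library(knitr)

df <- head(mtcars[, 1:3])               # any small data frame
html_tab <- kable(df, format = "html")  # returns raw HTML markup
cat(html_tab)                           # print it unescaped
```

The same pattern works with print(xtable(df), type = "html"), as used throughout this script.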

Preprocessing

Examining the data

We’ll start by examining the data…

hd <- head(train)
print(xtable(hd),type="html", html.table.attributes = tab.attributes)
     X admit gre  gpa rank
1  254     1 540 3.55    4
2   90     1 660 4.00    2
3  177     0 520 2.62    2
4  195     1 600 3.47    2
5  124     0 500 2.98    3
6   96     0 660 3.33    2
summ <- summary(train)
print(xtable(summary(train)),type="html", html.table.attributes = tab.attributes)
        X            admit         gre           gpa            rank
1  Min.   :  1   Min.   :0.0   Min.   :220   Min.   :2.42   Min.   :1.00
2  1st Qu.:101   1st Qu.:0.0   1st Qu.:520   1st Qu.:3.12   1st Qu.:2.00
3  Median :205   Median :0.0   Median :580   Median :3.36   Median :2.00
4  Mean   :203   Mean   :0.3   Mean   :586   Mean   :3.37   Mean   :2.46
5  3rd Qu.:307   3rd Qu.:1.0   3rd Qu.:660   3rd Qu.:3.65   3rd Qu.:3.00
6  Max.   :399   Max.   :1.0   Max.   :800   Max.   :4.00   Max.   :4.00

Contingency tables

Contingency tables make more sense for categorical data…

admit \ rank    1    2    3    4
0              20   83   68   39
1              23   42   15   10

admit \ gre  220 300 340 360 380 400 420 440 460 480 500 520 540 560 580 600 620 640 660 680 700 720 740 760 780 800
0              1   0   3   4   5   8   4   6   6   8  15  12  15  13  18  11  13  13  10  10  14   4   4   1   0  12
1              0   1   1   0   0   2   0   1   2   6   1   8   7   5   4   6   9   2   8   6   3   1   2   4   4   7
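Tables like these come from the table function; a sketch on a made-up stand-in data frame (toy_train is invented here for illustration, the tables above use the real train data):

```r
# Sketch: cross-tabulating admit against rank with table().
# toy_train is a made-up stand-in for the real train data frame.
toy_train <- data.frame(
  admit = c(0, 0, 1, 1, 0, 1),
  rank  = c(1, 2, 2, 3, 3, 1)
)

tab <- table(toy_train$admit, toy_train$rank)
print(tab)  # rows are admit (0/1), columns are rank
```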

Convert numerical data to categorical data

Recall that when reading in the data, R tries to infer the type of each column, so columns with integer values are treated as numeric by default. We need them as categories, so we convert them with the factor function.

train$rank <- factor(train$rank)
head(train$rank)
## [1] 4 2 2 2 3 2
## Levels: 1 2 3 4
test$rank <- factor(test$rank)
train$admit <- factor(train$admit)
test$admit <- factor(test$admit)
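Why this matters for the model: when a predictor is a factor, glm expands it into one dummy variable per level beyond the baseline, whereas a numeric column gets a single slope. A small sketch with model.matrix:

```r
# rank as a number: a single slope column in the design matrix.
num_rank <- data.frame(rank = c(1, 2, 3, 4))
colnames(model.matrix(~ rank, data = num_rank))   # "(Intercept)" "rank"

# rank as a factor: one dummy column per non-baseline level.
fac_rank <- data.frame(rank = factor(c(1, 2, 3, 4)))
colnames(model.matrix(~ rank, data = fac_rank))   # "(Intercept)" "rank2" "rank3" "rank4"
```

This is where the rank2, rank3, rank4 rows in the model 2 summary come from.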

Analysis

Training the first logistic regression model

model1 <- glm(admit ~ gre + gpa, data = train, family="binomial")
predict1 <- predict(model1, newdata = test, type = "response")
print(xtable(summary(model1)),type="html", html.table.attributes = tab.attributes)
Estimate Std. Error z value Pr(>|z|)
(Intercept) -3.3716 1.2228 -2.76 0.0058
gre 0.0027 0.0012 2.22 0.0266
gpa 0.2783 0.3608 0.77 0.4404
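The estimates are on the log-odds scale; exponentiating turns them into odds ratios. For example, using the gre estimate from the table above (0.0027 per point):

```r
# Odds ratio for a 100-point increase in GRE, from the fitted coefficient.
beta_gre <- 0.0027        # estimate from the summary table above
exp(beta_gre * 100)       # ~1.31: odds of admission rise ~31% per 100 GRE points
```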

Look at the predict1 values. Can you guess what they represent?

#head(predict1, 50)
print(xtable(data.frame(Predict1 = head(predict1, 25))),type="html", html.table.attributes = 'border="0" style="border-collapse: collapse; text-align: right; width: 25%; "')
Predict1
1 0.26
2 0.29
3 0.19
4 0.25
5 0.30
6 0.35
7 0.29
8 0.44
9 0.21
10 0.47
11 0.32
12 0.16
13 0.37
14 0.47
15 0.33
16 0.31
17 0.31
18 0.30
19 0.43
20 0.37
21 0.19
22 0.47
23 0.26
24 0.38
25 0.27

Sensitivity/Specificity

In order to turn the probabilities into binary decisions, we need to find a cutoff. We want a cutoff that gives a good balance between sensitivity and specificity.

perf <- function(cut, mod, y) {
    yhat <- (mod$fitted.values > cut)
    w <- which(y == 1)
    sensitivity <- mean(yhat[w] == 1)
    specificity <- mean(yhat[-w] == 0)
    out <- t(as.matrix(c(sensitivity, specificity)))
    colnames(out) <- c("sensitivity", "specificity")
    return(out)
}

perf(0.1,model1,train$admit)
##      sensitivity specificity
## [1,]           1           0
perf(0.5,model1,train$admit)
##      sensitivity specificity
## [1,]           0           1
perf(0.8,model1,train$admit)
##      sensitivity specificity
## [1,]           0           1

Let’s plot the sensitivity and specificity…

sensitivity <- c()
specificity <- c()

cutoffs <- seq(0.1, 0.9, by = 0.1)
for (i in cutoffs) {
  sensitivity <- c(sensitivity, perf(i, model1, train$admit)[1])
  specificity <- c(specificity, perf(i, model1, train$admit)[2])
}

# find the cutoff
plot(cutoffs, sensitivity, type = "l", lty = 1, col = "blue",
     xlab = "cutoff points", ylab = "sensitivity/specificity", lwd = 2)
lines(cutoffs, specificity, type = "l", lty = 1, col = "red", lwd = 2)

[Plot: sensitivity (blue) and specificity (red) versus cutoff for model1]
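The loop above can also be written more compactly with sapply. A self-contained sketch on simulated data (the real script would use model1 and train$admit instead of the toy mod and y below):

```r
# Compact alternative to the cutoff loop, demonstrated on simulated data.
set.seed(1)
n <- 200
x <- rnorm(n)
y <- rbinom(n, 1, plogis(-0.5 + x))        # toy binary outcome
mod <- glm(y ~ x, family = "binomial")

# Same logic as the tutorial's perf(), returning a named vector.
perf <- function(cut, mod, y) {
  yhat <- (mod$fitted.values > cut)
  w <- which(y == 1)
  c(sensitivity = mean(yhat[w] == 1),
    specificity = mean(yhat[-w] == 0))
}

cutoffs <- seq(0.1, 0.9, by = 0.1)
curves <- sapply(cutoffs, perf, mod = mod, y = y)  # 2 x 9 matrix
rownames(curves)                                   # "sensitivity" "specificity"
```

Each column of curves corresponds to one cutoff, so both lines can be drawn from it directly with matplot.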

Confusion Matrix

#prediction
predict1 <- ifelse(predict(model1, newdata=test,type="response")>0.3, 1, 0)
confusionmatrix1 <- table(predict1, test$admit)
print(xtable(confusionmatrix1),type="html", html.table.attributes = tab.attributes)
          actual
predict1    0   1
       0   37  10
       1   26  27

Accuracy of the model on the test data

#accuracy of model over the test data:
sum(diag(confusionmatrix1))/sum(confusionmatrix1)
## [1] 0.64
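The accuracy is simply the proportion of correct predictions: the diagonal of the confusion matrix over the total count. Reproducing the computation with the counts from the table above:

```r
# Confusion matrix counts from the table above (rows: predicted, cols: actual).
cm <- matrix(c(37, 26, 10, 27), nrow = 2,
             dimnames = list(predicted = c(0, 1), actual = c(0, 1)))

sum(diag(cm)) / sum(cm)   # (37 + 27) / 100 = 0.64
```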

Model 2

We now fit a richer model that includes the rank of the school.

model2 <- glm(admit ~ gre + gpa +  rank, data = train, family="binomial")
print(xtable(summary(model2)),type="html", html.table.attributes = tab.attributes)
Estimate Std. Error z value Pr(>|z|)
(Intercept) -2.2490 1.3082 -1.72 0.0856
gre 0.0023 0.0012 1.87 0.0618
gpa 0.2891 0.3741 0.77 0.4397
rank2 -0.7935 0.3655 -2.17 0.0299
rank3 -1.6016 0.4233 -3.78 0.0002
rank4 -1.4306 0.4739 -3.02 0.0025
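Because rank is a factor, each rank coefficient compares that level against the baseline (rank 1). Exponentiating gives odds ratios; for example, using the rank3 estimate from the table above:

```r
# Odds ratio for rank-3 vs. rank-1 schools, from the fitted coefficient.
beta_rank3 <- -1.6016   # estimate from the summary table above
exp(beta_rank3)         # ~0.20: about one fifth the odds of admission vs. rank 1
```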

Find the cutoff

sensitivity <- c()
specificity <- c()
cutoffs <- seq(0.1, 0.9, by = 0.1)
for (i in cutoffs) {
  sensitivity <- c(sensitivity, perf(i, model2, train$admit)[1])
  specificity <- c(specificity, perf(i, model2, train$admit)[2])
}

plot(cutoffs, sensitivity, type = "l", lty = 1, col = "blue",
     xlab = "cutoff points", ylab = "sensitivity/specificity", lwd = 2)
lines(cutoffs, specificity, type = "l", lty = 1, col = "red", lwd = 2)

[Plot: sensitivity (blue) and specificity (red) versus cutoff for model2]

Prediction

predict2 <- ifelse(predict(model2, newdata=test,type="response")>0.3, 1, 0)
confusionmatrix2 <- table(predict2, test$admit)
print(xtable(confusionmatrix2),type="html", html.table.attributes = tab.attributes)

Accuracy of the model on the test data

sum(diag(confusionmatrix2))/sum(confusionmatrix2)
## [1] 0.66