You are on page 1 of 11

Datmin

Group 4

January 9, 2018
library("party")

## Loading required package: grid

## Loading required package: mvtnorm

## Loading required package: modeltools

## Loading required package: stats4

## Loading required package: strucchange

## Loading required package: zoo

##
## Attaching package: 'zoo'

## The following objects are masked from 'package:base':


##
## as.Date, as.Date.numeric

## Loading required package: sandwich

library(caret)

## Warning: package 'caret' was built under R version 3.4.3

## Loading required package: lattice

## Loading required package: ggplot2

## Warning: package 'ggplot2' was built under R version 3.4.3

library(pROC)

## Warning: package 'pROC' was built under R version 3.4.3

## Type 'citation("pROC")' for a citation.

##
## Attaching package: 'pROC'

## The following objects are masked from 'package:stats':


##
## cov, smooth, var
library(randomForest)

## randomForest 4.6-12

## Type rfNews() to see new features/changes/bug fixes.

##
## Attaching package: 'randomForest'

## The following object is masked from 'package:ggplot2':


##
## margin

# Load the bank-marketing data and carve out the training / test partitions.
# Fix the RNG seed first so the random half-sample below (and any model
# fitted afterwards in the same session) can be reproduced from run to run.
set.seed(2018)
# stringsAsFactors = TRUE keeps the text columns as factors on R >= 4.0,
# where read.csv() no longer converts strings to factors by default.
bank <- read.csv("bank-additional-full.csv", stringsAsFactors = TRUE)
bank$y <- as.factor(bank$y)
# Draw half of the rows (a whole number of them) without replacement.
random <- sample(seq_len(nrow(bank)), size = floor(0.5 * nrow(bank)),
                 replace = FALSE)
bank1 <- bank[random, ]
# Fixed positional split of the shuffled half: 14240 rows for training and
# 2235 rows for testing. Rows of bank1 past position 16475 are not used.
strain <- bank1[c(1:672, 1000:14567), ]
stest <- bank1[c(673:999, 14568:16475), ]

# Fit a random forest for y on all other columns of the training partition.
# importance = TRUE asks for predictor importance to be assessed;
# proximity = TRUE also stores a row-by-row proximity matrix in the fit.
fit3 <- randomForest(
  y ~ .,
  data = strain,
  importance = TRUE,
  proximity = TRUE
)


# Show the importanceSD component stored in the fitted forest.
fit3[["importanceSD"]]

## no yes MeanDecreaseAccuracy
## age 1.627090e-04 0.0006525434 1.527186e-04
## job 1.457192e-04 0.0007457360 1.463129e-04
## marital 7.628708e-05 0.0004135245 8.018263e-05
## education 1.307121e-04 0.0006393608 1.362917e-04
## default 5.824307e-05 0.0003637880 5.356985e-05
## housing 6.076103e-05 0.0003546303 6.691713e-05
## loan 4.676190e-05 0.0002534952 5.014895e-05
## contact 4.954949e-04 0.0006227242 4.395141e-04
## month 1.871339e-03 0.0010763271 1.617304e-03
## day_of_week 1.562253e-04 0.0006181445 1.548155e-04
## duration 2.033313e-04 0.0012763713 2.511003e-04
## campaign 8.408737e-05 0.0004707685 9.046708e-05
## pdays 1.647175e-04 0.0011392946 1.936220e-04
## previous 1.922123e-04 0.0005925649 1.759257e-04
## poutcome 1.588325e-04 0.0011293179 1.912898e-04
## emp.var.rate 2.235286e-03 0.0022622801 1.990297e-03
## cons.price.idx 1.696626e-03 0.0009206261 1.491864e-03
## cons.conf.idx 1.249113e-03 0.0009723164 1.072625e-03
## euribor3m 1.891732e-03 0.0022420016 1.629565e-03
## nr.employed 1.516485e-03 0.0023723058 1.366323e-03

# summary() on the fitted forest lists its components (length, class, mode);
# it does not report model performance.
summary(fit3)

## Length Class Mode


## call 5 -none- call
## type 1 -none- character
## predicted 14240 factor numeric
## err.rate 1500 -none- numeric
## confusion 6 -none- numeric
## votes 28480 matrix numeric
## oob.times 14240 -none- numeric
## classes 2 -none- character
## importance 80 -none- numeric
## importanceSD 60 -none- numeric
## localImportance 0 -none- NULL
## proximity 202777600 -none- numeric
## ntree 1 -none- numeric
## mtry 1 -none- numeric
## forest 14 -none- list
## y 14240 factor numeric
## test 0 -none- NULL
## inbag 0 -none- NULL
## terms 3 terms call

# Draw the default plot for the fitted forest.
plot(fit3)

# Predict the class of every row in the test partition with the forest.
prediksi_bank <- predict(object = fit3, newdata = stest)


# Confusion matrix for the random forest: rows are the observed classes,
# columns are the predicted classes. The response is addressed by name
# (stest$y) instead of by column position, so the code keeps working if
# columns of stest are added or reordered.
CM_bank <- table(stest$y, prediksi_bank)
# Accuracy = correctly classified cases (diagonal) / all test cases.
akurasi_bank <- sum(diag(CM_bank)) / sum(CM_bank)
akurasi_bank

## [1] 0.9181208

# Precision with the first class as the positive class: cell [1, 1] of
# CM_bank divided by the total of its first column.
presisi_bank <- CM_bank[1, 1] / sum(CM_bank[, 1])


presisi_bank
## [1] 0.9418093

# Recall with the first class as the positive class: cell [1, 1] of
# CM_bank divided by the total of its first row.
recall_bank <- CM_bank[1, 1] / sum(CM_bank[1, ])
recall_bank

## [1] 0.9678392

# F1 score: twice the product of precision and recall over their sum.
F1_bank <- local({
  twice_product <- 2 * presisi_bank * recall_bank
  total <- presisi_bank + recall_bank
  twice_product / total
})
F1_bank

## [1] 0.9546468

2) Naive Bayes


library(caret)
library("naivebayes")
library(klaR)

## Loading required package: MASS

# Train a naive Bayes classifier for y on all other columns of strain.
fit2 <- naive_bayes(y ~ ., data = strain)

# Print the fitted model: prior class probabilities and per-feature tables.
print(fit2)

## ===================== Naive Bayes =====================


## Call:
## naive_bayes.formula(formula = y ~ ., data = strain)
##
## A priori probabilities:
##
## no yes
## 0.8867978 0.1132022
##
## Tables:
##
## age no yes
## mean 39.90410 40.37345
## sd 9.90093 13.45713
##
##
## job no yes
## admin. 0.248337029 0.273573201
## blue-collar 0.237963256 0.133374690
## entrepreneur 0.037298068 0.028535980
## housemaid 0.026607539 0.024813896
## management 0.071745328 0.068238213
## retired 0.032942667 0.086228288
## self-employed 0.033576180 0.030397022
## services 0.098907190 0.075062035
## student 0.017263225 0.063275434
## technician 0.164159012 0.173697270
## unemployed 0.024311055 0.034119107
## unknown 0.006889452 0.008684864
##
##
## marital no yes
## divorced 0.115457713 0.107940447
## married 0.612131771 0.540942928
## single 0.270430789 0.348635236
## unknown 0.001979728 0.002481390
##
##
## education no yes
## basic.4y 0.1053215078 0.0936724566
## basic.6y 0.0581248020 0.0434243176
## basic.9y 0.1518847007 0.0955334988
## high.school 0.2309154260 0.2326302730
## illiterate 0.0004751346 0.0006203474
## professional.course 0.1275736459 0.1439205955
## university.degree 0.2863477985 0.3411910670
## unknown 0.0393569845 0.0490074442
##
##
## default no yes
## no 0.7770826734 0.9081885856
## unknown 0.2228381375 0.0918114144
## yes 0.0000791891 0.0000000000
##
## # ... and 15 more tables

# Class predictions of the naive Bayes model for the test partition.
prediksi_fit2 <- predict(fit2, stest)


# Keep the predictions alongside the observations in stest.
stest$prediksi <- unlist(prediksi_fit2)
# confusionMatrix() takes the predicted classes first (data) and the observed
# classes second (reference). Passing them the other way round transposes the
# table and mislabels sensitivity, specificity, the predictive values and
# prevalence.
# NOTE: the statistics printed after this chunk came from a run with the two
# arguments swapped; re-knit the report to refresh them.
confusionMatrix(data = prediksi_fit2, reference = stest$y)

## Confusion Matrix and Statistics


##
## Reference
## Prediction no yes
## no 1811 179
## yes 77 168
##
## Accuracy : 0.8855
## 95% CI : (0.8715, 0.8984)
## No Information Rate : 0.8447
## P-Value [Acc > NIR] : 1.984e-08
##
## Kappa : 0.5038
## Mcnemar's Test P-Value : 2.746e-10
##
## Sensitivity : 0.9592
## Specificity : 0.4841
## Pos Pred Value : 0.9101
## Neg Pred Value : 0.6857
## Prevalence : 0.8447
## Detection Rate : 0.8103
## Detection Prevalence : 0.8904
## Balanced Accuracy : 0.7217
##
## 'Positive' Class : no
##

library(pROC)
# Score each test row with the posterior probability of "yes". Hard class
# labels give only one operating point, so a ROC/AUC built from them merely
# restates the confusion matrix instead of describing the ranking quality.
des <- predict(fit2, stest, type = "prob")[, "yes"]
# Build the ROC object once, with class order and direction stated
# explicitly ("no" = controls, "yes" = cases, higher score means "yes"),
# then reuse it for the plot and the printed summary.
roc_fit2 <- roc(response = stest$y, predictor = des,
                levels = c("no", "yes"), direction = "<")
plot(roc_fit2)

roc_fit2

##
## Call:
## roc.default(response = stest$y, predictor = des)
##
## Data: des in 1990 controls (stest$y no) < 245 cases (stest$y yes).
## Area under the curve: 0.7979

3) Decision Tree
library("party")
library(caret)
library(ggplot2)
# Grow a conditional inference tree for y on all other columns of strain.
fit1 <- ctree(formula = y ~ ., data = strain)
# Print the tree's split structure and terminal nodes.
print(fit1)

##
## Conditional inference tree with 34 terminal nodes
##
## Response: y
## Inputs: age, job, marital, education, default, housing, loan, contact, month, day_of_week, duration, campaign, pdays, previous, poutcome, emp.var.rate, cons.price.idx, cons.conf.idx, euribor3m, nr.employed
## Number of observations: 14240
##
## 1) duration <= 505; criterion = 1, statistic = 2470.718
## 2) poutcome == {success}; criterion = 1, statistic = 2274.884
## 3) nr.employed <= 5076.2; criterion = 1, statistic = 50.438
## 4) duration <= 167; criterion = 1, statistic = 25.793
## 5)* weights = 80
## 4) duration > 167
## 6)* weights = 253
## 3) nr.employed > 5076.2
## 7) duration <= 156; criterion = 0.999, statistic = 20.702
## 8)* weights = 25
## 7) duration > 156
## 9) cons.conf.idx <= -47.1; criterion = 0.971, statistic = 12.569
## 10)* weights = 19
## 9) cons.conf.idx > -47.1
## 11)* weights = 34
## 2) poutcome == {failure, nonexistent}
## 12) nr.employed <= 5076.2; criterion = 1, statistic = 1366.082
## 13) duration <= 172; criterion = 1, statistic = 195.165
## 14) duration <= 77; criterion = 1, statistic = 20.23
## 15)* weights = 131
## 14) duration > 77
## 16)* weights = 414
## 13) duration > 172
## 17) duration <= 264; criterion = 0.999, statistic = 16.192
## 18)* weights = 281
## 17) duration > 264
## 19)* weights = 301
## 12) nr.employed > 5076.2
## 20) month == {apr, dec, mar, oct}; criterion = 1, statistic = 1327.131
## 21) month == {mar, oct}; criterion = 1, statistic = 64.885
## 22) duration <= 166; criterion = 1, statistic = 33.264
## 23)* weights = 71
## 22) duration > 166
## 24)* weights = 45
## 21) month == {apr, dec}
## 25) day_of_week == {fri, mon}; criterion = 1, statistic = 42.993
## 26) age <= 27; criterion = 0.996, statistic = 25.188
## 27)* weights = 32
## 26) age > 27
## 28)* weights = 326
## 25) day_of_week == {thu, tue, wed}
## 29) duration <= 277; criterion = 1, statistic = 29.294
## 30)* weights = 239
## 29) duration > 277
## 31)* weights = 103
## 20) month == {aug, jul, jun, may, nov}
## 32) duration <= 393; criterion = 1, statistic = 281.864
## 33) emp.var.rate <= -0.1; criterion = 1, statistic = 79.272
## 34) duration <= 282; criterion = 1, statistic = 45.444
## 35) contact == {telephone}; criterion = 0.998, statistic = 18.177
## 36) euribor3m <= 4.191; criterion = 0.985, statistic = 23.877
## 37) job == {admin., blue-collar, entrepreneur, management, services, student, technician, unemployed}; criterion = 1, statistic = 108
## 38)* weights = 207
## 37) job == {housemaid, retired, self-employed}
## 39)* weights = 11
## 36) euribor3m > 4.191
## 40)* weights = 18
## 35) contact == {cellular}
## 41) duration <= 130; criterion = 0.984, statistic = 14.293
## 42)* weights = 1068
## 41) duration > 130
## 43)* weights = 996
## 34) duration > 282
## 44)* weights = 388
## 33) emp.var.rate > -0.1
## 45) duration <= 355; criterion = 1, statistic = 31.705
## 46)* weights = 6566
## 45) duration > 355
## 47)* weights = 256
## 32) duration > 393
## 48) cons.price.idx <= 93.444; criterion = 1, statistic = 19.96
## 49)* weights = 272
## 48) cons.price.idx > 93.444
## 50)* weights = 401
## 1) duration > 505
## 51) duration <= 836; criterion = 1, statistic = 80.449
## 52) nr.employed <= 5076.2; criterion = 1, statistic = 74.224
## 53) poutcome == {success}; criterion = 0.996, statistic = 16.856
## 54)* weights = 45
## 53) poutcome == {failure, nonexistent}
## 55)* weights = 112
## 52) nr.employed > 5076.2
## 56) duration <= 647; criterion = 1, statistic = 21.914
## 57) cons.price.idx <= 92.893; criterion = 1, statistic = 19.682
## 58)* weights = 103
## 57) cons.price.idx > 92.893
## 59) month == {aug, oct}; criterion = 0.981, statistic = 22.597
## 60)* weights = 64
## 59) month == {apr, jul, jun, may, nov}
## 61)* weights = 424
## 56) duration > 647
## 62) contact == {telephone}; criterion = 0.987, statistic = 11.674
## 63)* weights = 145
## 62) contact == {cellular}
## 64)* weights = 275
## 51) duration > 836
## 65) education == {basic.4y, basic.9y, university.degree}; criterion = 0.974, statistic = 21.818
## 66)* weights = 279
## 65) education == {basic.6y, high.school, professional.course, unknown}
## 67)* weights = 256

# summary() on the tree only reports its length, class and mode (a single
# BinaryTree S4 object); use print() or plot() to inspect the splits.
summary(fit1)

## Length Class Mode


## 1 BinaryTree S4

# Draw the fitted tree, then predict the class of every test row with it.
plot(fit1)
prediksi_fit1 <- predict(fit1, stest)
# Keep the predictions alongside the observations in stest.
stest$prediksi <- unlist(prediksi_fit1)
# confusionMatrix() takes the predicted classes first (data) and the observed
# classes second (reference). Passing them the other way round transposes the
# table and mislabels sensitivity, specificity, the predictive values and
# prevalence.
# NOTE: the statistics printed after this chunk came from a run with the two
# arguments swapped; re-knit the report to refresh them.
confusionMatrix(data = prediksi_fit1, reference = stest$y)

## Confusion Matrix and Statistics


##
## Reference
## Prediction no yes
## no 1928 62
## yes 121 124
##
## Accuracy : 0.9181
## 95% CI : (0.906, 0.9292)
## No Information Rate : 0.9168
## P-Value [Acc > NIR] : 0.4281
##
## Kappa : 0.531
## Mcnemar's Test P-Value : 1.807e-05
##
## Sensitivity : 0.9409
## Specificity : 0.6667
## Pos Pred Value : 0.9688
## Neg Pred Value : 0.5061
## Prevalence : 0.9168
## Detection Rate : 0.8626
## Detection Prevalence : 0.8904
## Balanced Accuracy : 0.8038
##
## 'Positive' Class : no
##

library(pROC)
# Score each test row with the tree's estimated probability of "yes". Hard
# class labels give only one operating point, so a ROC/AUC built from them
# merely restates the confusion matrix.
# treeresponse() yields one probability vector per row; the "yes" entry is
# picked by its position among the levels of y.
# NOTE(review): assumes the probability vectors follow levels(stest$y) - confirm.
yes_pos <- match("yes", levels(stest$y))
des <- vapply(treeresponse(fit1, newdata = stest),
              function(p) p[yes_pos], numeric(1))
# Build the ROC object once, with class order and direction stated
# explicitly, then reuse it for the plot and the printed summary.
roc_fit1 <- roc(response = stest$y, predictor = des,
                levels = c("no", "yes"), direction = "<")
plot(roc_fit1)
roc_fit1

##
## Call:
## roc.default(response = stest$y, predictor = des)
##
## Data: des in 1990 controls (stest$y no) < 245 cases (stest$y yes).
## Area under the curve: 0.7375

You might also like