You are on page 1of 17

2/21/2016

KddcupAnalysis

In[1]:
importgraphlab
In[2]:
kdd=graphlab.SFrame('kddcup.csv')
[INFO]ThisnoncommerciallicenseofGraphLabCreateisassignedtobh
anusharma.a3@gmail.comandwillexpireonFebruary16,2017.Forcommer
ciallicensingoptions,visithttps://dato.com/buy/.(https://dato.com/
buy/.)
[INFO]Startserverat:ipc:///tmp/graphlab_server9792Serverbinar
y:C:\Users\dshar\Anaconda2\envs\datoenv\lib\sitepackages\graphlab\un
ity_server.exeServerlog:C:\Users\dshar\AppData\Local\Temp\graphlab
_server_1456062829.log.0
[INFO]GraphLabServerVersion:1.8.1
PROGRESS:FinishedparsingfileC:\Users\dshar\kddcup.csv
PROGRESS:Parsingcompleted.Parsed100linesin0.962981secs.

Inferredtypesfromfirstlineoffileas
column_type_hints=[long,str,str,str,long,long,long,long,long,long,long,
long,long,long,long,long,long,long,long,long,long,long,long,long,float,
float,float,float,float,float,float,long,long,float,float,float,float,f
loat,float,float,float,str]
Ifparsingfailsduetoincorrecttypes,youcancorrect
theinferredtypelistaboveandpassittoread_csvin
thecolumn_type_hintsargument

PROGRESS:Read345187lines.Linespersecond:285583
PROGRESS:FinishedparsingfileC:\Users\dshar\kddcup.csv
PROGRESS:Parsingcompleted.Parsed494020linesin1.48128secs.

http://localhost:8889/notebooks/Kddcup%20Analysis.ipynb

1/17

2/21/2016

KddcupAnalysis

In[8]:
graphlab.canvas.set_target('ipynb')
kdd['normal.'].show(view='Categorical')

Mostfrequentitemsfrom<SArray>
Value

Count

Percent

smurf.

280,790 56.838%

neptune.

107,201 21.7%

normal.

97,277

19.691%

back.

2,203

0.446%

satan.

1,589

0.322%

ipsweep.

1,247

0.252%

portsweep.

1,040

0.211%

warezclient.

1,020

0.206%

teardrop.

979

0.198%

pod.

264

0.053%

nmap.

231

0.047%

guess_passwd.

53

0.011%

buffer_overflow. 30

0.006%

land.

21

0.004%

warezmaster.

20

0.004%

imap.

12

0.002%

rootkit.

10

0.002%

loadmodule.

0.002%

ftp_write.

0.002%

multihop.

0.001%

phf.

8.097e
4%

perl.

6.073e
4%

spy.

4.048e
4%

http://localhost:8889/notebooks/Kddcup%20Analysis.ipynb

2/17

2/21/2016

KddcupAnalysis

In[9]:
kdd.head()
Out[9]:

http://localhost:8889/notebooks/Kddcup%20Analysis.ipynb

3/17

2/21/2016

KddcupAnalysis

tcp

http

SF

181

5450

0.1

0.2

0.3

0.4

0.5

tcp

http

SF

239

486

tcp

http

SF

235

1337

tcp

http

SF

219

1337

tcp

http

SF

217

2032

tcp

http

SF

217

2032

tcp

http

SF

212

1940

tcp

http

SF

159

4087

tcp

http

SF

210

151

tcp

http

SF

212

786

tcp

http

SF

210

624

0.15

0.16

8.1

0.00

0.00.1

0.00.2

0.00.3

1.00

0.0

0.0

0.0

0.0

1.0

0.0

0.0

0.0

0.0

1.0

0.0

0.0

0.0

0.0

1.0

0.0

0.0

0.0

0.0

1.0

0.0

0.0

0.0

0.0

1.0

0.0

0.0

0.0

0.0

1.0

0.0

0.0

0.0

0.0

1.0

0.0

0.0

0.0

0.0

1.0

0.0

0.0

0.0

0.0

1.0

18

18

0.0

0.0

0.0

0.0

1.0

0.00.7

0.00.8

0.00.9

0.00.10

...

0.0

0.0

0.0

0.0

...

0.0

0.0

0.0

0.0

...

0.0

0.0

0.0

0.0

...

0.0

0.0

0.0

0.0

...

0.0

0.0

0.0

0.0

...

0.04

0.0

0.0

0.0

...

0.04

0.0

0.0

0.0

...

http://localhost:8889/notebooks/Kddcup%20Analysis.ipynb

4/17

2/21/2016

KddcupAnalysis

0.04

0.0

0.0

0.0

...

0.05
In[10]:

0.0

0.0

0.0

...

kdd.show()

SF

181

str

dtype:

str

dtype:

66

num_unique(est.):

11

num_unique
(est.):

num_undefined:

5450
int
3,298

dtype:
num_unique
(est.):

num_undefined:

num_undefined:

min:

min:

SF

max:

6.934e+8

max:

S0

median:

REJ

mean:

frequentitems:

RSTR
RSTO
SH

std:

520
3,025.616
988,218.101

distributionofvalues:

median:
mean:
std:
distributionofvalues

S1
S2
RSTOS0
S3
OTH

In[41]:

d={'0':'duration','tcp':'protocol_type1','SF':'flag','181':'src_bytes','5450':
'1':'logged_in','0.6':'num_compromised','0.7':'root_shell','0.10':'num_file_creation
'0.12':'num_shells','0.13':'num_access_files','0.14':'num_outbound_cmds','0.15'
'0.00.1':'srv_serror_rate','0.00.2':'rerror_rate','0.00.3':'srv_rerror_rate','1.00'
'9.1':'dst_host_srv_count','1.00.1':'dst_host_same_srv_rate','0.00.6':'dst_host_diff_
'0.00.8':'dst_host_serror_rate','0.00.9':'dst_host_srv_serror_rate','normal.':'respon

http://localhost:8889/notebooks/Kddcup%20Analysis.ipynb

5/17

2/21/2016

KddcupAnalysis

In[15]:
d
Out[15]:
{'0':'duration',
'0.00':'serror_rate',
'0.00.1':'srv_serror_rate',
'0.00.10':'dst_host_rerror_rate',
'0.00.11':'dst_host_srv_rerror_rate',
'0.00.2':'rerror_rate',
'0.00.3':'srv_rerror_rate',
'0.00.4':'diff_srv_rate',
'0.00.5':'srv_diff_host_rate',
'0.00.6':'dst_host_diff_srv_rate',
'0.00.7':'dst_host_srv_diff_host_rate',
'0.00.8':'dst_host_serror_rate',
'0.00.9':'dst_host_srv_serror_rate',
'0.1':'land',
'0.10':'num_file_creations',
'0.11':'dst_host_same_src_port_rate',
'0.12':'num_shells',
'0.13':'num_access_files',
'0.14':'num_outbound_cmds',
'0.15':'is_host_login',
'0.16':'is_guest_login',
'0.2':'wrong_fragment',
'0.3':'urgent',
'0.4':'hot',
'0.5':'num_failed_logins',
'0.6':'num_compromised',
'0.7':'root_shell',
'0.8':'su_attempted',
'0.9':'num_root',
'1':'logged_in',
'1.00':'same_srv_rate',
'1.00.1':'dst_host_same_srv_rate',
'181':'src_bytes',
'5450':'dst_bytes',
'8':'count',
'8.1':'srv_count',
'9':'dst_host_count',
'9.1':'dst_host_srv_count',
'SF':'flag',
'http':'protocol_type2',
'normal.':'response',
'tcp':'protocol_type1'}

http://localhost:8889/notebooks/Kddcup%20Analysis.ipynb

6/17

2/21/2016

KddcupAnalysis

In[42]:
kdd.rename(d)
Out[42]:

http://localhost:8889/notebooks/Kddcup%20Analysis.ipynb

7/17

2/21/2016

KddcupAnalysis

0.0

1.0

0.0

0.0

0.0

1.0

0.0

0.0

0.0

1.0

0.0

0.0

0.0

1.0

0.0

0.0

0.0

1.0

0.0

1.0

0.0

1.0

0.0

0.0

0.0

1.0

0.0

0.0

0.0

1.0

0.0

0.0

0.0

1.0

0.0

0.0

dst_host_diff_srv_rate

dst_host_same_src_port_ra
te...

dst_host_srv_diff_host_ra
te...

0.0

0.05

0.0

0.0

0.03

0.0

0.0

0.03

0.0

0.0

0.02

0.0

0.0

0.02

0.0

0.0

1.0

0.04

0.0

0.09

0.04

0.0

0.12

0.04

0.0

0.12

0.05

0.0

0.06

0.05

dst_host_rerror_rate

...

0.0

...

0.0

...

0.0

...

0.0

...

0.0

...

0.0

...

0.0

...

0.0

...

0.0

...

http://localhost:8889/notebooks/Kddcup%20Analysis.ipynb

8/17

2/21/2016

KddcupAnalysis

0.0

...

In[43]:
kdd.show()

duration

protocol_type1

dtype:

int

num_unique(est.):

2,501

num_undefined:

min:

max:

58,329

median:
mean:
std:

0
47.979

dtype:

protocol_type
str

dtype:

num_unique(est.):

num_unique(est.):

num_undefined:

num_undefined:

frequentitems:

frequentitems:

icmp

ecr_i

tcp

private

udp

http

707.746

distributionofvalues:

smtp
other
domain_u
ftp_data
eco_i
ftp
finger
urp_i
telnet

In[44]:
train_data,test_data=kdd.random_split(.8,seed=0)

http://localhost:8889/notebooks/Kddcup%20Analysis.ipynb

9/17

2/21/2016

KddcupAnalysis

In[45]:
train_data.head()
Out[45]:

http://localhost:8889/notebooks/Kddcup%20Analysis.ipynb

10/17

2/21/2016

KddcupAnalysis

tcp

http

SF

235

1337

tcp

http

SF

219

1337

tcp

http

SF

217

2032

tcp

http

SF

217

2032

tcp

http

SF

212

1940

tcp

http

SF

159

4087

tcp

http

SF

210

151

tcp

http

SF

212

786

tcp

http

SF

210

624

logged_in

num_compromised

root_shell

su_attempted

num_root

num_outbound_cmds

is_host_login

is_guest_login

count

18

http://localhost:8889/notebooks/Kddcup%20Analysis.ipynb

srv_count

11/17

2/21/2016

KddcupAnalysis

18

Creatingmodel

http://localhost:8889/notebooks/Kddcup%20Analysis.ipynb

12/17

2/21/2016

KddcupAnalysis

In[48]:

kdd_model=graphlab.classifier.create(train_data,target='response',features=['protocol_typ

http://localhost:8889/notebooks/Kddcup%20Analysis.ipynb

13/17

2/21/2016

KddcupAnalysis

PROGRESS:Creatingavalidationsetfrom5percentoftrainingdata.Th
ismaytakeawhile.
Youcanset``validation_set=None``todisablevalidationtra
cking.
PROGRESS:Thefollowingmethodsareavailableforthistypeofproblem.
PROGRESS:BoostedTreesClassifier,RandomForestClassifier,LogisticClass
ifier
PROGRESS:Thereturnedmodelwillbechosenaccordingtovalidationacc
uracy.
PROGRESS:Boostedtreesclassifier:
PROGRESS:
PROGRESS:Numberofexamples:375394
PROGRESS:Numberofclasses:23
PROGRESS:Numberoffeaturecolumns:2
PROGRESS:Numberofunpackedfeatures:2
PROGRESS:++++
+
PROGRESS:|Iteration|ElapsedTime|Trainingaccuracy|Validationa
ccuracy|
PROGRESS:++++
+
PROGRESS:|1|2.961036|0.981233|0.981515
|
PROGRESS:|2|4.742356|0.982621|0.983081
|
PROGRESS:|3|6.585209|0.982621|0.983081
|
PROGRESS:|4|8.430268|0.982621|0.983081
|
PROGRESS:|5|10.256491|0.982621|0.983081
|
PROGRESS:|6|12.089232|0.982621|0.983081
|
PROGRESS:|7|13.987530|0.982621|0.983081
|
PROGRESS:|8|15.854709|0.982621|0.983081
|
PROGRESS:|9|17.797652|0.982696|0.983283
|
PROGRESS:|10|19.734425|0.982696|0.983283
|
PROGRESS:++++
+
PROGRESS:Randomforestclassifier:
PROGRESS:
PROGRESS:Numberofexamples:375394
PROGRESS:Numberofclasses:23
PROGRESS:Numberoffeaturecolumns:2
PROGRESS:Numberofunpackedfeatures:2
PROGRESS:++++
+
PROGRESS:|Iteration|ElapsedTime|Trainingaccuracy|Validationa
ccuracy|
PROGRESS:++++
+
PROGRESS:|1|2.013409|0.973793|0.974949
http://localhost:8889/notebooks/Kddcup%20Analysis.ipynb

14/17

2/21/2016

KddcupAnalysis

|
PROGRESS:|2|3.754968|0.974483|0.975556
|
PROGRESS:|3|5.468433|0.974765|0.97596
|
PROGRESS:|4|7.223957|0.98208|0.982727
|
PROGRESS:|5|8.930002|0.981795|0.982071
|
PROGRESS:|6|10.593517|0.981795|0.982071
|
PROGRESS:|7|12.395478|0.981808|0.982273
|
PROGRESS:|8|14.178238|0.981811|0.982273
|
PROGRESS:|9|15.930915|0.981808|0.982273
|
PROGRESS:|10|17.570304|0.981808|0.982273
|
PROGRESS:++++
+
PROGRESS:Logisticregression:
PROGRESS:
PROGRESS:Numberofexamples:375394
PROGRESS:Numberofclasses:23
PROGRESS:Numberoffeaturecolumns:2
PROGRESS:Numberofunpackedfeatures:2
PROGRESS:Numberofcoefficients:1496
PROGRESS:StartingLBFGS
PROGRESS:
PROGRESS:+++++
++
PROGRESS:|Iteration|Passes|Stepsize|ElapsedTime|Training
accuracy|Validationaccuracy|
PROGRESS:+++++
++
PROGRESS:|1|4|0.000013|0.922823|0.851716
|0.851010|
PROGRESS:|2|6|1.000000|1.834471|0.855115
|0.854343|
PROGRESS:|3|7|1.000000|2.363347|0.980221
|0.980909|
PROGRESS:|4|8|1.000000|2.924757|0.976286
|0.977071|
PROGRESS:|5|9|1.000000|3.594237|0.982440
|0.983030|
PROGRESS:|6|10|1.000000|4.106106|0.982131
|0.982929|
PROGRESS:|10|16|1.000000|6.497316|0.981185
|0.981869|
PROGRESS:+++++
++
PROGRESS:TERMINATED:Iterationlimitreached.
PROGRESS:Thismodelmaynotbeoptimal.Toimproveit,considerincrea
sing`max_iterations`.
PROGRESS:Modelselectionbasedonvalidationaccuracy:
PROGRESS:
http://localhost:8889/notebooks/Kddcup%20Analysis.ipynb

15/17

2/21/2016

KddcupAnalysis

PROGRESS:BoostedTreesClassifier:0.983282828283
PROGRESS:RandomForestClassifier:0.982272727273
PROGRESS:LogisticClassifier:0.981869
PROGRESS:
PROGRESS:SelectingBoostedTreesClassifierbasedonvalidationsetperf
ormance.
In[59]:
con=kdd_model.evaluate(test_data,metric='confusion_matrix')
In[66]:
printcon.viewvalues()
dict_values([Columns:

target_label
str

predicted_labelstr

count int
Rows:41
Data:
++++
|target_label|predicted_label|count|
++++
|back.|normal.|413|
|normal.|normal.|19083|
|warezclient.|satan.|1|
|smurf.|smurf.|56142|
|normal.|smurf.|77|
|guess_passwd.|neptune.|8|
|normal.|neptune.|76|
|portsweep.|neptune.|131|
|satan.|neptune.|34|
|portsweep.|satan.|42|
++++
[41rowsx3columns]
Note:OnlytheheadoftheSFrameisprinted.
Youcanuseprint_rows(num_rows=m,num_columns=n)toprintmorerowsan
dcolumns.])
In[67]:
print_rows(num_rows=41,num_columns=3).con

NameErrorTraceback(mostrecentcalll
ast)
<ipythoninput67c4940ab881ca>in<module>()
>1print_rows(num_rows=41,num_columns=3).con
NameError:name'print_rows'isnotdefined

http://localhost:8889/notebooks/Kddcup%20Analysis.ipynb

16/17

2/21/2016

KddcupAnalysis

In[74]:
graphlab.SFrame.print_rows(num_rows=41,num_columns=3)

TypeErrorTraceback(mostrecentcalll
ast)
<ipythoninput74b3574bd63f6f>in<module>()
>1graphlab.SFrame.print_rows(num_rows=41,num_columns=3)
TypeError:unboundmethodprint_rows()mustbecalledwithSFrameinsta
nceasfirstargument(gotnothinginstead)

In[75]:
con.show()

AttributeErrorTraceback(mostrecentcalll
ast)
<ipythoninput75770cccb153e1>in<module>()
>1con.show()
AttributeError:'dict'objecthasnoattribute'show'

In[]:

http://localhost:8889/notebooks/Kddcup%20Analysis.ipynb

17/17

You might also like