# You are on page 1 of 3

library(randomForest) # for bagging and random forests

####### Read Data #######

# Directory holding the Kaggle input files. The original paths were wrapped in
# the middle of the string literals, which put a newline inside each file name.
data_dir <- "~/Box Documents/Data/Kaggle/MarchMachine"

reg <- read.csv(file.path(data_dir, "regular_season_results.csv"))
teams <- read.csv(file.path(data_dir, "teams.csv"))
tourney <- read.csv(file.path(data_dir, "tourney_results.csv"))
tourneyS <- read.csv(file.path(data_dir, "sample_submission.csv"))

# Season "S" regular-season games, with the winner-minus-loser score margin
# appended. This copy is taken while `reg` still contains the season "S" rows.
regS <- reg[reg$season == "S", ]
regS <- data.frame(regS, scorediff = regS$wscore - regS$lscore)

# Every team id that appears as a winner or loser in the regular-season
# results (all seasons).
tourneyTeams <- unique(c(reg$wteam, reg$lteam))
###### Adjust Data ######
# Ignore the overtime column and add a score-differential column in the
# regular season and tourney data.
# NOTE(review): the columns are dropped by position (8 in `reg`, 7 in
# `tourney`); this assumes those positions hold the overtime column -- confirm
# against the CSV headers.
reg <- reg[, -8]

# Remove the season "S" rows, then rebuild the factor so that "S" is no longer
# one of its levels.
reg$season <- as.character(reg$season)
reg <- reg[reg$season != "S", ]
reg$season <- as.factor(reg$season)

# Winner-minus-loser margin for every remaining regular-season game.
reg <- data.frame(reg, scorediff = reg$wscore - reg$lscore)

tourney <- tourney[, -7]
######## Add Data for A-R Seasons ######

# Summarise one team's scoring over a set of games.
#
# games: data frame with `wteam`, `lteam`, `wscore` and `lscore` columns.
# team:  team id to look up in `wteam` / `lteam`.
# keep:  logical mask over the rows of `games` (or TRUE for all rows) that
#        restricts which games are considered.
#
# Returns a named numeric vector with the mean and median of the points the
# team scored, of the points scored against it ("VS"), and of its per-game
# margin (scored minus allowed). A team with no matching games gets NaN means
# and NA medians.
team_score_summary <- function(games, team, keep = TRUE) {
  won <- keep & games$wteam == team
  lost <- keep & games$lteam == team

  # Games the team won come first, then games it lost, in both vectors, so the
  # element-wise difference is the margin of each individual game.
  scored <- c(games$wscore[won], games$lscore[lost])
  allowed <- c(games$lscore[won], games$wscore[lost])
  margin <- scored - allowed

  c(
    Mean = mean(scored), Median = median(scored),
    MeanVS = mean(allowed), MedianVS = median(allowed),
    WLMarginMean = mean(margin), WLMarginMedian = median(margin)
  )
}

# Append the twelve per-team statistic columns to a table of matchups.
#
# matchups:     data frame with `wteam` and `lteam` columns (and `season` when
#               `match_season` is TRUE).
# games:        regular-season games used to compute the statistics.
# match_season: if TRUE, only games whose `season` equals the matchup's
#               `season` are used; if FALSE, every row of `games` is used.
#
# Returns `matchups` with the new columns appended in a fixed order: the four
# w* scoring columns, the four l* scoring columns, then the w* and l* margin
# columns. Works for any number of rows instead of a hard-coded row count.
add_team_stats <- function(matchups, games, match_season = FALSE) {
  stat_names <- c(
    "Mean", "Median", "MeanVS", "MedianVS", "WLMarginMean", "WLMarginMedian"
  )
  n <- nrow(matchups)
  w <- matrix(
    0,
    nrow = n, ncol = length(stat_names),
    dimnames = list(NULL, stat_names)
  )
  l <- w

  for (i in seq_len(n)) {
    # The season mask is computed once per matchup and shared by both teams.
    keep <- if (match_season) games$season == matchups$season[i] else TRUE
    w[i, ] <- team_score_summary(games, matchups$wteam[i], keep)[stat_names]
    l[i, ] <- team_score_summary(games, matchups$lteam[i], keep)[stat_names]
  }

  data.frame(
    matchups,
    wMean = w[, "Mean"], wMedian = w[, "Median"],
    wMeanVS = w[, "MeanVS"], wMedianVS = w[, "MedianVS"],
    lMean = l[, "Mean"], lMedian = l[, "Median"],
    lMeanVS = l[, "MeanVS"], lMedianVS = l[, "MedianVS"],
    wWLMarginMean = w[, "WLMarginMean"],
    wWLMarginMedian = w[, "WLMarginMedian"],
    lWLMarginMean = l[, "WLMarginMean"],
    lWLMarginMedian = l[, "WLMarginMedian"]
  )
}

# Tournament games: add the winner-minus-loser margin, then each team's
# regular-season statistics from the same season.
tourney <- data.frame(tourney, scorediff = tourney$wscore - tourney$lscore)
tourney <- add_team_stats(tourney, reg, match_season = TRUE)

######## Add Data for Season S #####
# The submission ids have the form "S_<team>_<team>"; the first team is stored
# as `wteam` and the second as `lteam`.
tourneyS$wteam <- as.numeric(
  sub("S_([[:graph:]]+)_([[:graph:]]+)", "\\1", tourneyS$id, perl = TRUE)
)
tourneyS$lteam <- as.numeric(
  sub("S_([[:graph:]]+)_([[:graph:]]+)", "\\2", tourneyS$id, perl = TRUE)
)

# `regS` holds only season "S" games, so no season filter is needed here.
tourneyS <- add_team_stats(tourneyS, regS)
######## Reverse data for A-R #####
# Drop the first six columns so that only `scorediff` and the team statistic
# columns remain as model inputs.
tourney <- tourney[, -c(1:6)]

# Every row is stored winner-first, so `scorediff` is never negative. Flip a
# random half of the rows (swap the w* and l* statistics and negate the
# margin) so the model sees both orientations.
set.seed(1)
# Same draw as sample(1:n, n / 2), but sized from the data instead of
# hard-coded row counts.
rev <- sample.int(nrow(tourney), nrow(tourney) %/% 2)

w_cols <- c(
  "wMean", "wMedian", "wMeanVS", "wMedianVS",
  "wWLMarginMean", "wWLMarginMedian"
)
l_cols <- c(
  "lMean", "lMedian", "lMeanVS", "lMedianVS",
  "lWLMarginMean", "lWLMarginMedian"
)

# The right-hand side is read in full before the assignment, so this swaps the
# two column groups for all selected rows at once.
tourney[rev, c(w_cols, l_cols)] <- tourney[rev, c(l_cols, w_cols)]
tourney[rev, "scorediff"] <- -1 * tourney[rev, "scorediff"]
###### Random Forest #####
# Regress the score margin on all remaining columns of `tourney`, then predict
# the margin for every season "S" matchup.
set.seed(1)
rf.tourney <- randomForest(
  scorediff ~ .,
  data = tourney, mtry = 4, importance = TRUE
)
yhat.rf <- predict(rf.tourney, newdata = tourneyS)

###### Make Predictions ####
# Scale of the predicted margins: root of the sum of squares (about zero, not
# about the mean) over n - 1, with n taken from the predictions rather than
# hard-coded.
sd <- sqrt(sum(yhat.rf^2) / (length(yhat.rf) - 1))

# Keep the first two columns and overwrite `pred` with the normal CDF of the
# standardised margin.
# NOTE(review): assumes the first two columns of the sample submission are
# `id` and `pred` -- confirm.
tourneyS <- tourneyS[, c(1, 2)]
tourneyS$pred <- pnorm(unname(yhat.rf) / sd)
tourneyS <- tourneyS[, c(1, 2)]

# The original path was wrapped inside the string literal, which put a newline
# in the output file name.
# NOTE(review): write.csv() also writes a row-name column by default; confirm
# whether the submission format needs row.names = FALSE.
write.csv(
  tourneyS,
  file = file.path(
    "~/Box Documents/Data/Kaggle/MarchMachine", "thirdsubadj.csv"
  )
)

# You might also like