# You are on page 1 of 5

# ---- Vectors ----
# Walk-through of creating, subsetting and summarising atomic vectors.
# Deliberately no rm(list = ls()) here: wiping the caller's workspace is a
# surprising side effect when this file is sourced. Restart R for a clean
# session instead.

myvar <- 10
print(myvar)

v1 <- c(1, 5, 5.5, 1e2)
v2 <- c(0.14, 0, -2)
v3 <- c(v1, v2)

# Subset: a single element
v3[2]
# Several elements at once, selected by an index vector
v3[c(2, 3)]
v3_sub <- v3[c(2, 3)]

# Arithmetic and math functions are vectorised
v2 + 2
sin(v1[2])
sin(v1)[2]

# v1 has 4 elements and v2 has 3, so trim v1 to v2's length before combining;
# otherwise `*` recycles with a warning and `%*%` fails as non-conformable.
v1_trimmed <- v1[seq_along(v2)]
v1_trimmed * v2
# Dot product - need to have same length
v1_trimmed %*% v2

length(v3)
class(v2)

v4 <- c("10", "45")
# Keep a character copy so summary() of a character vector can be shown below.
char_vector <- v4

# Arithmetic on character data is an error; try() reports it without
# aborting the rest of the script.
try(v4 / 10)
v4 <- as.numeric(v4)
class(v4)
v4 / 10

summary(v3)
summary(char_vector)

mean(v3)
var(v3)
quantile(v3)
median(v3)
sum(v3)
max(v3)
min(v3)

# v3 has no names yet, so this shows NULL
names(v3)
names(v2) <- c("Cat", "Dog", "Rat")
# ---- Matrices ----
# Building, binding, indexing and multiplying matrices.

# Integer sequences
1:6
seq(1, 100, by = 10)

# 3 x 2 matrix, filled column by column
ma <- matrix(1:6, nrow = 3, ncol = 2)
rbind(ma, c(100, 200))

# Extra 3 x 1 column so that `m` is 3 x 3: the indexing m[1, 1:3] and the
# product with the 3 x 4 matrix `m2` below both need three columns.
mb <- matrix(7:9, nrow = 3, ncol = 1)
m <- cbind(ma, mb)

m[1, 1:3]
m[1, ]
m[, 1:2]

# transpose
t(m)

# Main diagonal of an existing matrix
diag(ma)
# 3 by 3 identity matrix
diag(3)
# Diagonal matrix with the given entries on the diagonal
diag(c(1, 2, 3))

# 12 values in 3 rows gives a 3 x 4 matrix, conformable with the 3 x 3 `m`
m2 <- matrix(21:32, nrow = 3)
m3 <- m %*% m2
m3

# ---- Lists ----
# A list can hold elements of different types and shapes.
# Relies on `v1`, `v2` and `ma` created earlier in this script.

# Character vector derived from v2 so the list really mixes types
chv2 <- as.character(v2)
list(v1, chv2, ma)

my.list <- list(numeric = v1, character = chv2, matrix = ma)

# Single bracket keeps the list wrapper (a list of length 1)
my.list[1]
# Double bracket extracts the element itself
class(my.list[[1]])
# `$` extracts an element by name
my.list$matrix
# ---- Data Frames ----
# Inspecting and filtering the built-in mtcars data set.
data(mtcars)
class(mtcars)
# View() opens a spreadsheet-style viewer, which needs an interactive session
if (interactive()) {
  View(mtcars)
}
head(mtcars) # First 6 rows
summary(mtcars)
names(mtcars)
colnames(mtcars)

# Three equivalent ways to pull out the mpg column
mtcars$mpg
mtcars[, "mpg"]
mtcars[, 1]
# First three rows, all columns
mtcars[1:3, ]

# Quoted values are character strings, not logicals
y <- c("TRUE", "FALSE", "TRUE")
class(y)

# Comparisons are element-wise and return logical vectors / matrices.
# `v2` and `m` come from the vector and matrix sections of this script.
v2 > 0
m >= 5
mtcars$mpg > 20

# A logical vector used as a row index keeps the rows where it is TRUE
v <- mtcars$mpg > 20
mtcars[mtcars$mpg > 20, ]
# or
mtcars[v, ]
# Combine conditions element-wise with `&`
mtcars[mtcars$mpg > 20 & mtcars$am == 0, ]
# ---- Data.Table ----
# Workflow notes (editor shortcuts / menu entries, not R code):
#   Cmd+shift+m
#   More->Setwd()   # NOTE(review): presumably the IDE menu entry that sets the
#                   # working directory - confirm
#   Cmd+Enter to run a single line of code

# Run a separate script file if it is present on this machine.
script_path <- "~/Desktop/R/script.r"
if (file.exists(script_path)) {
  source(script_path)
}
y

library("ggplot2")
data(mtcars)
# Explicit print() so the plot is drawn even when this file is source()'d
print(ggplot(mtcars, aes(wt, mpg)) + geom_point())
# ---- Salaries: loading and data.table basics ----
# 2013 baseball archive
# http://dgrtwo.github.io/pages/lahman/Salaries.csv
salaries_url <- "http://dgrtwo.github.io/pages/lahman/Salaries.csv"

# Prefer a local copy in the working directory; otherwise read from the URL.
if (file.exists("Salaries.csv")) {
  salaries <- read.csv("Salaries.csv")
} else {
  salaries <- read.csv(salaries_url)
}
if (interactive()) {
  View(salaries)
}

# If a file has no header row and is space-separated, read it like this:
#   salaries <- read.csv(salaries_url, header = FALSE, sep = " ")
# Left as a comment because this file does have a header and commas; running it
# would replace `salaries` with a mis-parsed table lacking the column names
# used below.

# Install data.table only when it is missing
if (!requireNamespace("data.table", quietly = TRUE)) {
  install.packages("data.table")
}

# Put salaries into a data.table
library(data.table)
salaries <- as.data.table(salaries)
salaries # printing a long data.table shows only its first and last rows
salaries$salary
salaries[1, ] # first row
salaries[1:5, ]
# NOTE(review): older data.table releases evaluated a bare number in j as the
# literal value; newer releases return the first column - check your version
salaries[, 1]
salaries[, yearID]
salaries[, list(yearID, salary)]

# only years after 2000
salaries[yearID > 2000, ]
salaries[yearID == 2000, ]
salaries[lgID == "AL", ]
salaries[lgID == "AL" & yearID > 1990, ]
salaries[yearID < 1990 | yearID > 2010, ]

# order
salaries[order(salary), ]
salaries[order(yearID, salary), ]
salaries.filtered <- salaries[lgID == "AL" & yearID >= 1990, ]
salaries.filtered.sorted <- salaries.filtered[order(salary), ]
# ---- Summary - Group by ----
# Whole-table statistics
mean(salaries$salary)
max(salaries$salary)
median(salaries$salary)
mean(salaries[yearID == 2000, ]$salary)

# Grouped statistics: j is evaluated once per group defined by `by`.
# Unnamed result column:
summarized.year <- salaries[, mean(salary), by = "yearID"]
# Named result column:
summarized.year <- salaries[, list(average = mean(salary)), by = "yearID"]
# Several statistics at once:
summarized.year <- salaries[
  , list(average = mean(salary), Maximum = max(salary)),
  by = "yearID"
]
summarized.league <- salaries[
  , list(average = mean(salary), Maximum = max(salary)),
  by = "lgID"
]
summarized.team <- salaries[
  , list(average = mean(salary), Maximum = max(salary)),
  by = "teamID"
]

summarized.year[yearID > 2000, ]
summarized.team[order(average), ]

# Group by more than one column. Each summary gets its own name so the
# per-league table is still available for the coloured plot below.
summarized.year.lg <- salaries[
  , list(average = mean(salary), Maximum = max(salary)),
  by = c("yearID", "lgID")
]
summarized.year.team <- salaries[
  , list(average = mean(salary), Maximum = max(salary)),
  by = c("yearID", "teamID")
]

library(ggplot2)
ggplot(salaries, aes(yearID, salary)) + geom_point()
ggplot(summarized.year, aes(yearID, average)) + geom_line()
# Colouring by league needs the table that kept lgID as a grouping column
ggplot(summarized.year.lg, aes(yearID, average, color = lgID)) + geom_line()
# ---- Merging data ----
master <- read.csv("http://dgrtwo.github.io/pages/lahman/Master.csv")
master <- as.data.table(master)
salaries[playerID == "aardsda01", ]
merged.salaries <- merge(salaries, master, by = "playerID")

# Assign a new column with := (adds the column to the table by reference)
merged.salaries[, name := paste(nameFirst, nameLast)]

batting <- read.csv("http://dgrtwo.github.io/pages/lahman/Batting.csv")
batting <- as.data.table(batting)
# G - # of games
# H - # of Hits

# Batting and Salary need to be merged on 4 columns:
# year, team, league, player
batting_keys <- c("playerID", "teamID", "yearID", "lgID")
# Default merge keeps only rows present in both tables
merged.batting <- merge(batting, salaries, by = batting_keys)
# all.x = TRUE keeps all the rows of x (like a left outer join); this
# replaces the inner-join result above.
merged.batting <- merge(batting, salaries, by = batting_keys, all.x = TRUE)

merged.all <- merge(merged.batting, master, by = "playerID")
# Keep only rows with at least one at-bat
merged.all <- merged.all[AB > 0, ]

# total # of HR by each player
summarized.batters <- merged.all[, list(Total.HR = sum(HR)), by = "playerID"]
merged.all[, name := paste(nameFirst, nameLast)]
# Group by name as well so it is carried into the summary
summarized.batters <- merged.all[
  , list(Total.HR = sum(HR)),
  by = c("playerID", "name")
]
summarized.batters[order(Total.HR), ]
summarized.batters <- merged.all[
  , list(Total.HR = sum(HR), Total.R = sum(R), Total.H = sum(H)),
  by = c("playerID", "name")
]

# More hits means more runs
# See the correlation
ggplot(summarized.batters, aes(Total.H, Total.R)) + geom_point()

# Batting Avg = # of hits / # of times a player goes out to bat
summarized.batters <- merged.all[
  , list(
    Total.HR = sum(HR),
    Total.R = sum(R),
    Total.H = sum(H),
    BattingAverage = sum(H) / sum(AB)
  ),
  by = c("playerID", "name")
]
ggplot(summarized.batters, aes(BattingAverage)) + geom_histogram()

# You might also like