## Load the Toyota Corolla used-car data.
## NOTE(review): the original path literal was broken across two lines between
## "Data Mining in" and "R/DataSets", which put a line break inside the path.
## It is rejoined here on the assumption that the folder is named
## "Data Mining in R" -- confirm against the actual directory.
toyota <- read.csv(
  file.path(
    "D:/CIM LAVASA/2017-18/Subjects Taught-2017-18",
    "Data Mining in R",
    "DataSets",
    "ToyotaCorolla.csv"
  )
)

## Quick look at the data: first three rows, per-column summaries, and the
## distribution of the response variable Price.
toyota[1:3, ]
summary(toyota)
hist(toyota$Price)

## Next we create indicator variables for the categorical variable FuelType
## with its three nominal outcomes: CNG, Diesel, and Petrol.
## FuelType1 is 1 for CNG and FuelType2 is 1 for Diesel; any other fuel type
## gets 0 on both indicators.
toyota$FuelType1 <- as.numeric(toyota$FuelType == "CNG")
toyota$FuelType2 <- as.numeric(toyota$FuelType == "Diesel")

## Remove the variable FuelType now that it is encoded by the indicators.
## The column is dropped by name (the original used the position, toyota[-4])
## so the right column is removed whatever the column order of the CSV is.
auto <- toyota[, names(toyota) != "FuelType", drop = FALSE]

## Check whether the indicator coding worked, using the first three rows.
auto[1:3, ]

## Scatter plots of Price against each of the original predictors.
plot(Price ~ Age, data = auto)
plot(Price ~ KM, data = auto)
plot(Price ~ HP, data = auto)
plot(Price ~ MetColor, data = auto)
plot(Price ~ Automatic, data = auto)
plot(Price ~ CC, data = auto)
plot(Price ~ Doors, data = auto)
plot(Price ~ Weight, data = auto)
## regression on all data
## Fit Price on every other column of `auto` and print the coefficient table.
m1 <- lm(Price ~ ., data = auto)
summary(m1)
## Fixing the seed value for the random selection guarantees the same results
## in repeated runs.
set.seed(1)

n <- length(auto$Price)  # total number of observations
n1 <- 1000               # number of rows used for training
n2 <- n - n1             # number of rows left for evaluation

## The split below needs more rows than the training size; otherwise sample()
## fails or the evaluation set is empty. Stop early with a clear message.
if (n <= n1) {
  stop(
    "Need more than ", n1, " rows to form a train/test split; found ", n, ".",
    call. = FALSE
  )
}

## Draw n1 distinct row indices at random (see ?sample); the remaining rows
## form the evaluation set.
train <- sample(seq_len(n), n1)
## regression on training set
## Fit on the rows selected in `train`, then measure prediction error on the
## rows that were left out.
m1 <- lm(Price ~ ., data = auto[train, ])
summary(m1)

## `newdata` is spelled out in full; the original `newdat=` only worked through
## partial argument matching.
pred <- predict(m1, newdata = auto[-train, ])
obs <- auto$Price[-train]

## Prediction errors on the held-out rows.
## Note: `diff` shadows the name of base::diff(); the name is kept because it
## is the script's existing variable for the error vector.
diff <- obs - pred
percdiff <- abs(diff) / obs

me <- mean(diff)
## mean() over the held-out errors equals the original sum(...) / n2 and
## matches how the cross-validation sections compute the RMSE.
rmse <- sqrt(mean(diff^2))
mape <- 100 * mean(percdiff)

me    # mean error
rmse  # root mean square error
mape  # mean absolute percent error
## cross-validation (leave one out)
## For each row k: fit the full model on all rows except k, predict row k, and
## record the prediction error and the absolute relative error.
n <- length(auto$Price)

## Preallocate the result vectors. The original `dim(n)` evaluates to NULL, so
## the vectors were silently grown one element at a time inside the loop.
diff <- numeric(n)
percdiff <- numeric(n)

## seq_len() makes the loop a no-op for n == 0 (1:n would iterate over 1, 0).
## The left-out row is addressed directly by k, so the `train` index vector
## created for the hold-out split is no longer overwritten here.
for (k in seq_len(n)) {
  m1 <- lm(Price ~ ., data = auto[-k, ])
  pred <- predict(m1, newdata = auto[k, , drop = FALSE])
  obs <- auto$Price[k]
  diff[k] <- obs - pred
  percdiff[k] <- abs(diff[k]) / obs
}

me <- mean(diff)
rmse <- sqrt(mean(diff^2))
mape <- 100 * mean(percdiff)

me    # mean error
rmse  # root mean square error
mape  # mean absolute percent error
## cross-validation (leave one out): Model with just Age
## Same procedure as leave-one-out on the full model, but the regression uses
## Age as the only predictor of Price.
n <- length(auto$Price)

## Preallocate the result vectors. The original `dim(n)` evaluates to NULL, so
## the vectors were silently grown one element at a time inside the loop.
diff <- numeric(n)
percdiff <- numeric(n)

## seq_len() makes the loop a no-op for n == 0 (1:n would iterate over 1, 0).
## The left-out row is addressed directly by k, so no `train` index vector is
## created or overwritten here.
for (k in seq_len(n)) {
  m1 <- lm(Price ~ Age, data = auto[-k, ])
  pred <- predict(m1, newdata = auto[k, , drop = FALSE])
  obs <- auto$Price[k]
  diff[k] <- obs - pred
  percdiff[k] <- abs(diff[k]) / obs
}

me <- mean(diff)
rmse <- sqrt(mean(diff^2))
mape <- 100 * mean(percdiff)

me    # mean error
rmse  # root mean square error
mape  # mean absolute percent error