# Beruflich Dokumente (professional documents)
# Kultur Dokumente (culture documents)
# Report R-squared for the fitted CART model
rsq.rpart(m.cart)
# Difference between observed and CART-predicted rating values.
# NOTE(review): `diff` shadows base::diff(); rename if diff() is needed later.
diff <- dv$rating - m.val
# Write the scores of both models along with the observed data for comparison
output <- data.frame(list(m.val, p.rf, dv$rating))
colnames(output) <- c("PredictedCART", "PredictedRF", "Observed")
write.csv(output, "score.csv", row.names = FALSE)
# Pruning the tree based on the minimum of cross-validation error 'xerror'.
# This is commented out here, as there would be only the root node if the
# pruning is done in this example.
#pm<-prune(m,cp=m$cptable[which.min(m$cptable[,"xerror"]),"CP"])
# The ROC curve shows that a high TPR is reachable at a low cutoff
# probability, so build the confusion matrix with a 0.2 cutoff.
# NOTE(review): assumes fit.m[2] holds predicted probabilities -- confirm.
above_cut <- fit.m[2] > 0.2
predcton <- ifelse(above_cut, 1, 0)
print("At cutoff=0.2")
# Confusion matrix: observed class vs predicted class
print(table(dv$PersonalLoan, predcton))
# Lift charts: cumulative, then per-bucket
plotLift(predcton, dv$PersonalLoan)
plotLift(predcton, dv$PersonalLoan, cumulative = FALSE)
# With only one predictor we can plot the observed points and the predicted
# values together and inspect the relationship.
plot(PersonalLoan ~ Income, data = dt)
# Overlay the fitted values of the logistic regression model as blue points
lines(dt$Income, m.lr$fitted.values, type = "p", col = "blue")
# The ROC curve again shows a high TPR at a low cutoff probability, so draw
# the confusion matrix using a 0.15 cutoff on the predicted values m.vl.
is_positive <- m.vl > 0.15
predcton <- ifelse(is_positive, 1, 0)
print("At cutoff=0.15")
# Confusion matrix: observed class vs predicted class
print(table(dv$PersonalLoan, predcton))
# Lift charts: cumulative, then per-bucket
plotLift(predcton, dv$PersonalLoan)
plotLift(predcton, dv$PersonalLoan, cumulative = FALSE)
# Read the wine dataset, chosen interactively.
# file.choose() is portable; choose.files() exists only on Windows.
wine <- read.csv(file.choose())
# Inspect the column names of the loaded data
names(wine)
# k-NN classification of the iris data with k = 5.
library(class)
# NOTE(review): di is read here but not used below; trn/val are not defined
# in this snippet -- presumably created by an earlier partitioning step.
di<-read.table("iris.csv",header=T,sep=",")
# Column 5 is assumed to be the Species label; columns 1-4 the features.
pred<-knn(trn[,-5],val[,-5],trn[,5],k=5)
print("k=5")
# Confusion matrix: predicted vs actual labels on the validation set
print(table(pred,val[,5]))
# Duplicate of the preceding k-NN snippet (k = 5 on the iris data) --
# confirm the repetition is intentional.
library(class)
# NOTE(review): di is read but unused; trn/val are not defined in this
# snippet -- presumably created by an earlier partitioning step.
di<-read.table("iris.csv",header=T,sep=",")
pred<-knn(trn[,-5],val[,-5],trn[,5],k=5)
print("k=5")
print(table(pred,val[,5]))
# Open the iris dataset and inspect the first rows
di <- read.csv("iris.csv", header = TRUE)
head(di)
# caret provides createDataPartition() for stratified splitting
library(caret)
# Stratified 70/30 train/validation split on Species.
# NOTE(review): no set.seed() precedes this, so the split is not reproducible.
rec <- createDataPartition(y = di$Species, p = 0.7, list = FALSE)
dit <- di[rec, ]
div <- di[-rec, ]
# NOTE(review): re-reading iris.csv overwrites di with the same data --
# likely a leftover paste; confirm it is intentional.
di <- read.table("iris.csv", header = TRUE, sep = ",")
# Loadings plots for the wine PCA (wine.pc assumed to be a prcomp fit).
# NOTE(review): the original comment said "Dotplot PC1" but no PC1 dotplot
# is present in this snippet -- confirm whether one was lost.
library(lattice)
# Extract the loadings (rotation matrix) from the PCA fit.
# NOTE(review): `load` shadows base::load(); rename if load() is needed later.
load <- wine.pc$rotation
print(paste("The loadings are as follows: "))
print(load)
# Dotplot of PC2: variables ordered by their PC2 loading
ordered.load2 <- load[order(load[, 2]), 2]
dotplot(ordered.load2, main = "Loadings Plot of PC2",
        xlab = "Variable Loadings", col = "red", cex = 1.5)
# Biplot of the first two components (observation labels larger than
# variable labels)
biplot(wine.pc, cex = c(1, 0.7))
# Load the cereal dataframe, chosen interactively.
# file.choose() is portable; choose.files() exists only on Windows.
cer <- read.csv(file.choose(), header = TRUE)
# Remove the first column (Cereal) as it is just the name and may not be
# important for the analysis
cer <- cer[, -1]
# Read the regression test data and split into training/validation sets.
d<-read.csv("regtest.csv")
# NOTE(review): sam is not defined in this snippet -- presumably a vector of
# sampled row indices created earlier; confirm.
dt<-d[sam,]
dv<-d[-sam,]
# Develop the simple linear regression model (endurance ~ age) on the
# training set
m.lm<-lm(endurance~age, data=dt)
# Identify the data points whose Cook's distance exceeds 0.04 and label them
# on the (already drawn) Cook's distance plot.
tp <- seq_along(cd)   # index positions 1..length(cd); safe for empty cd
ip <- tp[cd > 0.04]   # indices of influential points
iv <- cd[ip]          # their Cook's distance values
# Label each influential point slightly below its plotted position
text(ip, iv - (max(cd) * 0.05), names(iv), col = "blue", cex = 0.7)
# Draw a scatterplot matrix of the first four columns of di
# (ggscatmat from the GGally package)
ggscatmat(di,columns=1:4)
# NOTE(review): duplicated call -- the same scatterplot matrix is drawn
# twice; confirm whether the repetition is intentional.
ggscatmat(di,columns=1:4)
# Load caret for dummyVars()-based dummy/one-hot encoding
library(caret)
# fullRank TRUE will create n-1 dummies, while FALSE will create n dummies
# NOTE(review): dmy (the dummyVars object) is not defined in this snippet --
# presumably created by an earlier dummyVars() call; confirm.
dnnew=data.frame(predict(dmy,newdata=di))
# Third copy of the k-NN snippet (k = 5 on the iris data) -- confirm the
# repetition is intentional.
library(class)
# NOTE(review): di is read but unused; trn/val are not defined in this
# snippet -- presumably created by an earlier partitioning step.
di<-read.table("iris.csv",header=T,sep=",")
pred<-knn(trn[,-5],val[,-5],trn[,5],k=5)
print("k=5")
print(table(pred,val[,5]))
# Read the wine dataset, chosen interactively (duplicate of the earlier read).
# file.choose() is portable; choose.files() exists only on Windows.
wine <- read.csv(file.choose())
# Inspect the column names of the loaded data
names(wine)