# Page 1 of 2 (page-viewer residue from the source document, not part of the script)

# Trying out PCA in its raw form

# Read the marks table from disk. The plots below label column 1 as "Phys"
# and column 2 as "Stat".
grd <- read.csv(file = "marks.csv")

# Show the column headers that came in with the file.
print(colnames(grd))

# Scatter plot of the raw marks (no centring, no scaling): column 1 on the
# x axis against column 2 on the y axis.
plot(
  x = grd[[1]],
  y = grd[[2]],
  col = "red",
  xlab = "Phys",
  ylab = "Stat",
  sub = "Raw, Unscaled"
)

# Centre every column on its own mean. apply() coerces the data frame to a
# matrix, so grd.center is a numeric matrix that can be used in the matrix
# product computed later in the script.
grd.center <- apply(grd, 2, function(column) column - mean(column))

# Re-plot the centred data. The subtitle literal is kept on a single source
# line so that no newline ends up inside the plotted text.
plot(
  grd.center[, 1], grd.center[, 2],
  col = "red", xlab = "Phys", ylab = "Stat", sub = "Mean Scaled"
)

# Covariance matrix of the centred columns.
cov.grd.center <- cov(grd.center)

print(cov.grd.center)

# Eigen-decomposition of the covariance matrix: $values holds the
# eigenvalues and $vectors holds the matching eigenvectors as columns.
egn.grd.center <- eigen(cov.grd.center)
print(egn.grd.center)

# Label the eigenvector matrix for easier reading: rows are the original
# variables, columns are the principal components.
rownames(egn.grd.center$vectors) <- c("Physics", "Stats")
colnames(egn.grd.center$vectors) <- c("PC-1", "PC-2")

# Sanity check: the sum of the eigenvalues should equal the sum of the
# variances of the data; the two printed numbers should agree.
print(paste0("The Sum of the eigenvalues :", sum(egn.grd.center$values)))
print(paste0(
  "The sum of the variance: ",
  var(grd.center[, 1]) + var(grd.center[, 2])
))

# Slopes (second coordinate over first coordinate) of the two eigenvector
# directions in the Phys/Stat plane.
pc1.slope <- egn.grd.center$vectors[2, 1] / egn.grd.center$vectors[1, 1]
pc2.slope <- egn.grd.center$vectors[2, 2] / egn.grd.center$vectors[1, 2]

# Centred data with both principal axes drawn through the origin. The
# subtitle literal is kept on one source line so it contains no newline.
plot(
  grd.center[, 1], grd.center[, 2],
  col = "red", xlab = "Phys", ylab = "Stat", sub = "Mean Scaled"
)
abline(0, pc1.slope, col = "blue")
abline(0, pc2.slope, col = "green")

# textxy() comes from the calibrate package.
library(calibrate)

# Annotate the plot with the coordinates of each eigenvector. The labels are
# formatted from the computed vectors (3 decimals) instead of being typed in
# by hand, so they always match the data that was actually loaded.
# NOTE(review): `cx` and `dcol` look like argument names from older calibrate
# releases -- confirm against the installed version.
textxy(
  12, 10,
  sprintf(
    "(%.3f,%.3f)",
    egn.grd.center$vectors[1, 1], egn.grd.center$vectors[2, 1]
  ),
  cx = 0.9, dcol = "red"
)
textxy(
  -12, 10,
  sprintf(
    "(%.3f,%.3f)",
    egn.grd.center$vectors[1, 2], egn.grd.center$vectors[2, 2]
  ),
  cx = 0.9, dcol = "green"
)

# Share of the total variance carried by each principal component, in
# percent (eigenvalue divided by the sum of all eigenvalues).
pc1.var <- egn.grd.center$values[1] / sum(egn.grd.center$values) * 100
pc2.var <- egn.grd.center$values[2] / sum(egn.grd.center$values) * 100

# Axis labels reused by the score plot and the biplot.
xlab <- paste0("pc 1: ", pc1.var, " % of variation")
ylab <- paste0("pc 2: ", pc2.var, " % of variation")

# Take the principal components, i.e. the eigenvectors, as a matrix (one
# component per column). These are also called the loadings.
loadings <- egn.grd.center$vectors

# Scores: project the centred observations onto the principal components
# via matrix multiplication.
score <- grd.center %*% loadings

# Standard deviation along each component (square root of its eigenvalue).
# This variable shares its name with stats::sd; calls of the form sd(...)
# still resolve to the function.
sd <- sqrt(egn.grd.center$values)

# Plot the scores, i.e. the observations expressed in principal-component
# coordinates.
plot(
  score,
  ylim = c(-10, 10),
  main = "Scores in terms of eigenvectors/PCs",
  xlab = xlab, ylab = ylab, col = "red"
)

# Reference axes through the origin. abline(a, b) draws y = a + b * x, so a
# large slope only approximates a vertical line; h = / v = draw the exact
# horizontal and vertical lines.
abline(h = 0, col = "blue")
abline(v = 0, col = "green")

# Correlation biplot: open an empty frame (type = "n") sized by the scores
# divided by the standard deviation of their component.
plot(
  score[, 1] / sd[1], score[, 2] / sd[2],
  main = "BiPlot", xlab = xlab, ylab = ylab, type = "n"
)

# Exact horizontal and vertical reference axes through the origin
# (abline(0, 90) would draw a line of slope 90, not a vertical line).
abline(h = 0, col = "blue")
abline(v = 0, col = "green")

# Divisor applied to the arrow coordinates below: a larger value gives
# shorter arrows. This variable shares its name with base::factor; calls of
# the form factor(...) still resolve to the function.
factor <- 5

# Plot the variables as vectors: one arrow per row of `loadings`, from the
# origin to (loading on PC 1 * sd[1], loading on PC 2 * sd[2]) / factor.
arrows(
  0, 0,
  loadings[, 1] * sd[1] / factor, loadings[, 2] * sd[2] / factor,
  length = 0.1, lwd = 2, angle = 20, col = "red"
)

# Variable names, placed 20 % beyond each arrow head so they do not overlap
# the arrows.
text(
  loadings[, 1] * sd[1] / factor * 1.2,
  loadings[, 2] * sd[2] / factor * 1.2,
  rownames(loadings),
  col = "red", cex = 1.2
)

# Number the observations so each one can be drawn as its row index.
# seq_len() is used because it yields an empty sequence for zero rows,
# whereas seq(1:n) does not.
rownames(score) <- seq_len(nrow(score))

# Write each observation's index at its standardised score position.
text(
  score[, 1] / sd[1], score[, 2] / sd[2],
  rownames(score),
  col = "blue", cex = 0.7
)

# You might also like (page-viewer residue from the source document, not part of the script)