
Libraries

1. textir
2. MARSS
3. tm
4. SnowballC
library(e1071)
library(textir)
library(tm)
library(VGAM)

=================================Lab 1========================================
@Take the text and construct an Excel sheet (frequency.csv): one column is
@the document number (ID), one is the word (Var1), and one is how many times
@that word appears in the document (Freq).
freqs.df= read.csv("frequency.csv", stringsAsFactors = FALSE)
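# A quick structure check -- the xtabs() call below assumes frequency.csv has
# columns named Var1 (the word), Freq (its count in a document) and ID (the
# document number):
str(freqs.df)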
@create a contingency table where the rows are documents (ID) and the columns are all possible words (Var1).
result <- xtabs(Freq ~ ID+Var1, data=freqs.df)
dim(result)
colnames(result)
#To be useful in the next step, you need to convert result into a matrix object
@convert every entry into numeric.
final.m <- apply(result, 2, as.numeric)
#limit the feature list to only those words that appear across the entire
#corpus with a mean relative frequency of at least some threshold.
@the apply() call over columns creates a boolean vector with one entry per column.
smaller.m <- final.m[,apply(final.m,2,mean)>=.25]
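# A minimal sketch of what the filter above does, on a made-up toy matrix:
toy <- matrix(c(1, 0, 0, 2, 0, 0), nrow = 2)
apply(toy, 2, mean) >= .25         # TRUE, TRUE, FALSE
toy[, apply(toy, 2, mean) >= .25]  # keeps only the first two columns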
# Create a distance object
dm <- dist(smaller.m)
# Perform a cluster analysis on the distance object
cluster <- hclust(dm)
plot(cluster)
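# The plot gives a dendrogram; if you also want hard cluster assignments,
# cutree() from base R cuts the tree into k groups (k = 3 is an arbitrary
# choice here, just for illustration):
groups <- cutree(cluster, k = 3)
table(groups)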
=================================Lab 2========================================
@take different sentences and predict the author.
@every row corresponds to a novel
@each column is a relative word frequency.
novels = read.csv("novels.csv",stringsAsFactors = FALSE)
@find the rows where the author is unknown
anon.v <- which(novels$author.v == "anonymous")
@training set, excluding the rows for which we don't know the author's name
train <- novels[-anon.v,2:ncol(novels)]
@class labels (the author names) for the training set
class.f <- novels[-anon.v,"author.v"]
@support vector machine
library(e1071)
model.svm <- svm(train, factor(class.f))

pred.svm <- predict(model.svm, train)


table(pred.svm, class.f)
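# A rough in-sample accuracy from the confusion table above (optimistic,
# since the model is evaluated on its own training data):
mean(pred.svm == class.f)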
testdata <- novels[anon.v,2:ncol(novels)]
final.result <- predict(model.svm, testdata)
as.data.frame(final.result)
library(textir)
@the we8there dataset comes with the textir package loaded above
data(we8there)
#Classification using vglm
colnames(we8thereCounts)
t = sample(1:nrow(we8thereCounts), 1000)
fgl = we8thereCounts[t, 1:100]
covars = fgl
# standardize each of the 100 word-count columns to mean 0 and sd 1
for (i in 1:100) {
  covars[, i] = (fgl[, i] - mean(fgl[, i])) / sd(fgl[, i])
}
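# Equivalent one-liner: base R's scale() also centers each column to mean 0
# and divides by its standard deviation (columns with zero variance produce
# NaN in both versions):
covars <- scale(as.matrix(fgl))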
dd=data.frame(cbind(type=as.numeric(we8thereRatings[t,5]) ,as.matrix(covars)))
library(VGAM)
gg <- vglm(type~.,multinomial,data=dd)
round(fitted(gg),2)
cbind(round(fitted(gg),2),as.numeric(we8thereRatings[t,5]))
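# To get hard class predictions instead of probabilities, take the most
# probable column per row -- this assumes the fitted columns are ordered by
# rating level 1 to 5:
pred.class <- apply(fitted(gg), 1, which.max)
table(pred.class, as.numeric(we8thereRatings[t, 5]))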
@Never used vglm for text analytics
#Using the mnlm function
trainlist = sample(1:nrow(we8thereCounts), floor(0.7 * nrow(we8thereCounts)))
trainset = we8thereCounts[trainlist,]
testset = we8thereCounts[-trainlist,]
trainres = factor(we8thereRatings[trainlist,5])
fit = mnlm(NULL,covars=trainset, counts=trainres)
testres = we8thereRatings[-trainlist,5]
predictedres = round(predict(fit, testset, type="response"),1)
cbind(predictedres,testres)
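# A rough error check, under the assumption that predictedres comes back as
# fitted ratings on the same 1-5 scale as testres:
mean(abs(as.numeric(predictedres) - as.numeric(testres)))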
=================================Lab 3========================================
#Preprocessing the text to construct a DocumentTermMatrix
d = read.csv("C:/cd/Data sets/9-text analytics/quotes.csv")
library(tm)
@create a corpus object from the data frame (newer tm versions expect the
@data frame to have doc_id and text columns)
ds <- DataframeSource(d)
myCorpus <- Corpus(ds)
# convert to lower case -- tolower is not a tm transformation, so with
# tm >= 0.6 it must be wrapped in content_transformer()
myCorpus <- tm_map(myCorpus, content_transformer(tolower))
# remove punctuation
myCorpus <- tm_map(myCorpus, removePunctuation)
# remove numbers
myCorpus <- tm_map(myCorpus, removeNumbers)
# (the PlainTextDocument re-wrapping step sometimes used here is unnecessary
# once content_transformer() is used)
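# Two more transformations that are commonly added at this stage: stopword
# removal and stemming (stemDocument() is what the SnowballC library listed
# at the top is for). Stopwords are also removed by the DocumentTermMatrix
# control list below, so this step is optional here.
myCorpus <- tm_map(myCorpus, removeWords, stopwords("english"))
myCorpus <- tm_map(myCorpus, stemDocument)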

# In the above code, tm_map() is an interface to apply transformations
# (mappings) to corpora. A list of available transformations can be obtained
# with getTransformations(), and the most commonly used ones are
# as.PlainTextDocument(), removeNumbers(), removePunctuation(),
# stemDocument() and stripWhitespace().
# (in newer tm versions, use wordLengths = c(2, Inf) in place of minWordLength)
myTdm <- DocumentTermMatrix(myCorpus,
                            control = list(weighting = weightTf,
                                           stopwords = TRUE,
                                           minWordLength = 2))
temp = as.matrix(myTdm)
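# Quick inspections of the resulting document-term matrix: terms appearing
# at least 5 times in the corpus, and a small corner of the matrix (adjust
# the indices to the size of your corpus):
findFreqTerms(myTdm, lowfreq = 5)
inspect(myTdm[1:3, 1:5])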
