Professional documents
Culture documents
Required packages:
1. textir
2. MARSS
3. tm
4. SnowballC
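# Load the libraries used in these labs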
library(e1071)
library(textir)
library(tm)
library(VGAM)
=================================Lab 1========================================
# From the tokenized texts, construct an Excel sheet (frequency.csv) with 4
# columns: a row index, the document number (ID), the word (Var1), and the
# frequency (Freq), i.e. how many times that word appears in that document.
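# A minimal sketch of how frequency.csv could be built with tm (assumptions:
# the raw texts sit as .txt files in a "corpus/" folder, and documents are
# identified by file name rather than by number):
library(tm)
library(SnowballC)
docs <- VCorpus(DirSource("corpus/"))
docs <- tm_map(docs, content_transformer(tolower))
docs <- tm_map(docs, removePunctuation)
docs <- tm_map(docs, stemDocument)            # stemming via SnowballC
dtm <- as.matrix(DocumentTermMatrix(docs))
dtm <- dtm / rowSums(dtm) * 100               # counts -> relative frequencies (%)
freqs.long <- as.data.frame(as.table(dtm))    # one row per (document, word) pair
names(freqs.long) <- c("ID", "Var1", "Freq")
write.csv(freqs.long, "frequency.csv")        # the row index becomes the 4th column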
freqs.df <- read.csv("frequency.csv", stringsAsFactors = FALSE)
# Create a cross-tabulation where the rows are the documents and the columns
# are all possible words.
result <- xtabs(Freq ~ ID+Var1, data=freqs.df)
dim(result)       # documents x unique words
colnames(result)  # the word (feature) names
# To be useful in the next step, convert result into a matrix object by
# coercing every entry to numeric.
final.m <- apply(result, 2, as.numeric)
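# Note: apply() returns a plain matrix and drops the row (document) names;
# restoring them keeps the dendrogram labeled by document (a small fix, not
# in the original lab):
rownames(final.m) <- rownames(result)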
# Limit the feature list to only those words that appear across the entire
# corpus with a mean relative frequency above some threshold.
# apply(final.m, 2, mean) >= .25 creates a boolean vector, one entry per
# column; TRUE keeps the word, FALSE drops it.
smaller.m <- final.m[,apply(final.m,2,mean)>=.25]
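# Quick sanity check: how many words survive the threshold?
dim(smaller.m)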
# Create a distance object (Euclidean distance, dist's default)
dm <- dist(smaller.m)
# Perform a hierarchical cluster analysis on the distance object
cluster <- hclust(dm)
plot(cluster)
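# Optional follow-up (a sketch; k = 2 is an arbitrary choice): cut the
# dendrogram into a fixed number of document groups.
groups <- cutree(cluster, k = 2)
groups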
=================================Lab 2========================================
# Take text samples from different novels and predict the author of the
# anonymous ones. Every row corresponds to a novel; each column after the
# author.v column is the relative frequency of a word.
novels <- read.csv("novels.csv", stringsAsFactors = FALSE)
# Find the rows whose author is unknown (labeled "anonymous")
anon.v <- which(novels$author.v == "anonymous")
# Training set: the word-frequency columns of every row with a known author
train <- novels[-anon.v, 2:ncol(novels)]
# Class labels (the known author names) for the training rows
class.f <- novels[-anon.v, "author.v"]
# Train a support vector machine on the labeled rows
library(e1071)
model.svm <- svm(train, factor(class.f))
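# A sketch of the prediction step this lab builds toward (assuming the
# anonymous rows share the same column layout as the training rows):
test <- novels[anon.v, 2:ncol(novels)]
predict(model.svm, test)   # predicted author for each anonymous text
# Optional: inspect the fit on the training data
table(predict(model.svm, train), class.f)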