Beruflich Dokumente
Kultur Dokumente
else
ListFw[i]=FALSE
}
}
return(ListFw)
}
# 4: count of the number of lines in the body of the email message
#
# x: list of messages. Each message is read via `$body`; the body is
#    assumed to hold one entry per line -- confirm against the code that
#    builds the messages.
# Returns an unnamed list with one integer per message (0 when the body is
# empty or absent), so callers can keep applying unlist() to the result.
numLinesInBody <- function(x) {
  # Read from the argument instead of the global `trainMessages` so the
  # function works for any message list; lapply() also handles an empty
  # `x`, which 1:length(x) does not.
  unname(lapply(x, function(msg) length(msg$body)))
}
# 5: whether any punctuation appears in the subject
#
# x: list of messages. Each message is read via `$header["Subject"]`; the
#    header is assumed to be a named character vector -- confirm against
#    the code that builds the messages.
# Returns an unnamed list with one logical per message; FALSE when the
# message has no header or no Subject entry.
isPunc <- function(x) {
  # Use the argument rather than the global `trainMessages`.
  unname(lapply(x, function(msg) {
    subject <- msg$header["Subject"]
    # Indexing a header that lacks "Subject" yields NA rather than a
    # zero-length result, so both cases are treated as "no subject".
    if (length(subject) == 0 || all(is.na(subject))) {
      return(FALSE)
    }
    grepl("[[:punct:]]", paste(subject, collapse = ""))
  }))
}
# 6: whether the subject of the message alternates from capital to
# lowercase more than once
#
# x: list of messages. Each message is read via `$header["Subject"]`; the
#    header is assumed to be a named character vector -- confirm against
#    the code that builds the messages.
# Returns an unnamed list with one logical per message: TRUE when the
# subject contains at least two consecutive "uppercase letter followed by
# lowercase letter" pairs (e.g. "HeLl"); FALSE when there is no subject.
AlternateCap <- function(x) {
  # Use the argument rather than the global `trainMessages`.
  unname(lapply(x, function(msg) {
    subject <- msg$header["Subject"]
    # A header without "Subject" indexes to NA, not to a zero-length value.
    if (length(subject) == 0 || all(is.na(subject))) {
      return(FALSE)
    }
    grepl("([A-Z][a-z]){2,}", paste(subject, collapse = ""))
  }))
}
# 7: count of the number of exclamation marks in the subject
#
# x: list of messages. Each message is read via `$header["Subject"]`; the
#    header is assumed to be a named character vector -- confirm against
#    the code that builds the messages.
# Returns an unnamed list with one integer count per message (0 when the
# message has no subject).
subjectExclamationCount <- function(x) {
  # Use the argument rather than the global `trainMessages`.
  unname(lapply(x, function(msg) {
    subject <- msg$header["Subject"]
    # A header without "Subject" indexes to NA, not to a zero-length value.
    if (length(subject) == 0 || all(is.na(subject))) {
      return(0L)
    }
    hits <- gregexpr("!", paste(subject, collapse = ""), fixed = TRUE)[[1]]
    # gregexpr() reports "no match" as a single -1, so count only real
    # (positive) match positions.
    sum(hits > 0)
  }))
}
# 8: count of the number of question marks in the subject
#
# x: list of messages. Each message is read via `$header["Subject"]`; the
#    header is assumed to be a named character vector -- confirm against
#    the code that builds the messages.
# Returns an unnamed list with one integer count per message (0 when the
# message has no subject).
subjectQuestionCount <- function(x) {
  # Use the argument rather than the global `trainMessages`.
  unname(lapply(x, function(msg) {
    subject <- msg$header["Subject"]
    # A header without "Subject" indexes to NA, not to a zero-length value.
    if (length(subject) == 0 || all(is.na(subject))) {
      return(0L)
    }
    # Match a literal "?". The previous pattern "//?" is the regex
    # "slash, optional slash" and therefore counted slashes instead.
    hits <- gregexpr("?", paste(subject, collapse = ""), fixed = TRUE)[[1]]
    # gregexpr() reports "no match" as a single -1.
    sum(hits > 0)
  }))
}
# 9: whether the subject of the mail is in capital letters
#
# x: list of messages. Each message is read via `$header["Subject"]`; the
#    header is assumed to be a named character vector -- confirm against
#    the code that builds the messages.
# Returns an unnamed list with one logical per message: TRUE when the
# subject contains no lowercase letter a-z, FALSE otherwise or when the
# message has no subject.
isYelling <- function(x) {
  # Use the argument rather than the global `trainMessages`.
  unname(lapply(x, function(msg) {
    subject <- msg$header["Subject"]
    # A header without "Subject" indexes to NA. Pasting that NA would give
    # the string "NA", which has no lowercase letters and would wrongly be
    # reported as yelling, so treat it as "no subject".
    if (length(subject) == 0 || all(is.na(subject))) {
      return(FALSE)
    }
    !grepl("[a-z]", paste(subject, collapse = ""))
  }))
}
# 10: whether there are no blanks in the subject
#
# x: list of messages. Each message is read via `$header["Subject"]`; the
#    header is assumed to be a named character vector -- confirm against
#    the code that builds the messages.
# Returns an unnamed list with one logical per message: TRUE when the
# subject contains no space or tab, FALSE otherwise or when the message
# has no subject.
SubjectBlanks <- function(x) {
  # Use the argument rather than the global `trainMessages`.
  unname(lapply(x, function(msg) {
    subject <- msg$header["Subject"]
    # A header without "Subject" indexes to NA; pasting it would give the
    # blank-free string "NA", so treat it as "no subject".
    if (length(subject) == 0 || all(is.na(subject))) {
      return(FALSE)
    }
    # The previous character class contained a line break instead of a
    # blank, so ordinary spaces were never detected.
    !grepl("[[:blank:]]", paste(subject, collapse = ""))
  }))
}
# 11: whether the header states that the message is a multipart, i.e. with
# attachments.
#
# x: list of messages. Each message is read via `$header["Content-Type"]`;
#    the header is assumed to be a named character vector -- confirm
#    against the code that builds the messages.
# Returns an unnamed list with one logical per message: TRUE when the
# Content-Type entry mentions "multipart" (case-insensitive), FALSE
# otherwise or when there is no Content-Type entry.
multipartText <- function(x) {
  # Use the argument rather than the global `trainMessages`.
  unname(lapply(x, function(msg) {
    # Look the field up on the named header *before* any paste(): pasting
    # first drops the names, after which the lookup can only yield NA.
    content_type <- msg$header["Content-Type"]
    if (length(content_type) == 0 || all(is.na(content_type))) {
      return(FALSE)
    }
    # NOTE(review): the earlier pattern was "multipart/text", which is not
    # one of the usual multipart subtypes (mixed, alternative, related,
    # ...); matching on the top-level type follows the description above.
    # Confirm this is the intended feature definition.
    grepl("multipart", paste(content_type, collapse = ""), ignore.case = TRUE)
  }))
}
# 12: whether the subject contains one of the spam phrases
#
# x: list of messages. Each message is read via `$header["Subject"]`; the
#    header is assumed to be a named character vector -- confirm against
#    the code that builds the messages.
# Returns an unnamed list with one logical per message; FALSE when the
# message has no subject. Matching is case-sensitive.
subjectSpamWords <- function(x) {
  # The alternation is assembled from a vector so no stray whitespace or
  # line break can end up inside the pattern (the earlier literal contained
  # line breaks, which made "risk", "prescription", "money back" and
  # "credit card" unmatchable).
  spam_pattern <- paste(
    c(
      "viagra", "pounds", "free", "weight", "guarantee", "millions",
      "dollars", "credit", "risk", "prescription", "generic", "drug",
      "money back", "credit card"
    ),
    collapse = "|"
  )
  # Use the argument rather than the global `trainMessages`.
  unname(lapply(x, function(msg) {
    subject <- msg$header["Subject"]
    # A header without "Subject" indexes to NA, not to a zero-length value.
    if (length(subject) == 0 || all(is.na(subject))) {
      return(FALSE)
    }
    grepl(spam_pattern, paste(subject, collapse = ""))
  }))
}
# 13: whether the message body contains a form of the introduction
# "Dear ..."
#
# x: list of messages. Each message is read via `$body`; the body is
#    assumed to hold one entry per line -- confirm against the code that
#    builds the messages.
# Returns an unnamed list with one logical per message: TRUE when any body
# entry contains "Dear", "DEAR" or "dear"; FALSE for an empty or absent
# body.
isDear <- function(x) {
  # Use the argument rather than the global `trainMessages`.
  unname(lapply(x, function(msg) {
    body <- msg$body
    if (length(body) == 0) {
      return(FALSE)
    }
    # Test each entry on its own: gluing entries together without a
    # separator can fabricate a match across a line boundary.
    any(grepl("Dear|DEAR|dear", body))
  }))
}
# 14: whether the string "vacation" occurs in the body of the message
#
# x: list of messages. Each message is read via `$body`; the body is
#    assumed to hold one entry per line -- confirm against the code that
#    builds the messages.
# Returns an unnamed list with one logical per message: TRUE when any body
# entry contains "Vacation", "VACATION" or "vacation"; FALSE for an empty
# or absent body.
isVacation <- function(x) {
  # Use the argument rather than the global `trainMessages`.
  unname(lapply(x, function(msg) {
    body <- msg$body
    if (length(body) == 0) {
      return(FALSE)
    }
    # Test each entry on its own: gluing entries together without a
    # separator can fabricate a match across a line boundary.
    any(grepl("Vacation|VACATION|vacation", body))
  }))
}
# 15: whether there is a url link in the body of the message
#
# x: list of messages. Each message is read via `$body`; the body is
#    assumed to hold one entry per line -- confirm against the code that
#    builds the messages.
# Returns an unnamed list with one logical per message: TRUE when any body
# entry contains "http://" or "https://" followed by a dotted host name
# ending in at least two letters; FALSE for an empty or absent body.
isLink <- function(x) {
  # The earlier pattern had a line break in the middle and an unclosed
  # [[:alpha:] class, so it effectively required a literal "]" after the
  # URL and could not match ordinary links.
  url_pattern <- "https?://[a-zA-Z0-9.-]+[.][[:alpha:]]{2,}"
  # Use the argument rather than the global `trainMessages`.
  unname(lapply(x, function(msg) {
    body <- msg$body
    if (length(body) == 0) {
      return(FALSE)
    }
    # Test each entry on its own so a URL at the end of one line is not
    # fused with the start of the next.
    any(grepl(url_pattern, body))
  }))
}
return(Capsub)
}
# 18: whether the body text includes a line containing the word "wrote:"
#
# x: list of messages. Each message is read via `$body`; the body is
#    assumed to hold one entry per line -- confirm against the code that
#    builds the messages.
# Returns an unnamed list with one logical per message: TRUE when any body
# entry contains the literal text "wrote:"; FALSE for an empty or absent
# body.
isWrote <- function(x) {
  # Use the argument rather than the global `trainMessages`.
  unname(lapply(x, function(msg) {
    body <- msg$body
    if (length(body) == 0) {
      return(FALSE)
    }
    # The earlier pattern required a line break right after "wrote:", which
    # a body joined without separators can never contain. Match the literal
    # text on each entry instead.
    any(grepl("wrote:", body, fixed = TRUE))
  }))
}
# 19: the number of letters and digits in the body of the email message
#
# x: list of messages. Each message is read via `$body`; the body is
#    assumed to be a character vector -- confirm against the code that
#    builds the messages.
# Returns an unnamed list with one integer per message: how many
# characters in A-Z, a-z or 0-9 the concatenated body contains (0 for an
# empty or absent body).
bodyCharacterCount <- function(x) {
  # Use the argument rather than the global `trainMessages`.
  unname(lapply(x, function(msg) {
    body <- msg$body
    if (length(body) == 0) {
      return(0L)
    }
    hits <- gregexpr("[A-Za-z0-9]", paste(body, collapse = ""))[[1]]
    # gregexpr() reports "no match" as a single -1, so count only real
    # (positive) match positions.
    sum(hits > 0)
  }))
}
# 20: how many html-tag-like matches exist in the header of the message
#
# x: list of messages. Each message is read via `$header`; the header is
#    assumed to be a character vector -- confirm against the code that
#    builds the messages.
# Returns an unnamed list with one integer per message: the number of
# non-overlapping matches of "<[^>/][^.]*>" in the header entries pasted
# together without a separator (0 for an empty or absent header).
# Because the header is concatenated first, this counts matches, not
# header lines.
TagExists <- function(x) {
  # Use the argument rather than the global `trainMessages`.
  unname(lapply(x, function(msg) {
    header <- msg$header
    if (length(header) == 0) {
      return(0L)
    }
    hits <- gregexpr("<[^>/][^.]*>", paste(header, collapse = ""))[[1]]
    # gregexpr() reports "no match" as a single -1, so count only real
    # (positive) match positions.
    sum(hits > 0)
  }))
}
# Assemble the feature table: every feature function is applied to
# `trainMessages`, its list result is flattened with unlist(), and the
# resulting vector becomes one column of `df`.
# NOTE(review): the column named `priority` is filled from
# bodyCharacterCount(); the name is kept as-is because code outside this
# section may refer to it -- confirm and rename if nothing depends on it.
df <- data.frame(
  isSpam = unlist(isSpam(trainMessages)),
  isRe = unlist(isRe(trainMessages)),
  isFw = unlist(isFw(trainMessages)),
  numLinesInBody = unlist(numLinesInBody(trainMessages)),
  isPunc = unlist(isPunc(trainMessages)),
  AlternateCap = unlist(AlternateCap(trainMessages)),
  SubjectExclamationCount = unlist(subjectExclamationCount(trainMessages)),
  subjectQuestionCount = unlist(subjectQuestionCount(trainMessages)),
  isYelling = unlist(isYelling(trainMessages)),
  pSubjectblanks = unlist(SubjectBlanks(trainMessages)),
  multipartText = unlist(multipartText(trainMessages)),
  subjectSpamWords = unlist(subjectSpamWords(trainMessages)),
  isDear = unlist(isDear(trainMessages)),
  isVacation = unlist(isVacation(trainMessages)),
  isLink = unlist(isLink(trainMessages)),
  numDollarSign = unlist(numDollarSigns(trainMessages)),
  SubjectCapital = unlist(SubjectCapital(trainMessages)),
  priority = unlist(bodyCharacterCount(trainMessages)),
  isWrote = unlist(isWrote(trainMessages)),
  TagExists = unlist(TagExists(trainMessages))
)
library(ggplot2)

# Show isRe for isSpam: bar chart of message counts per isSpam value,
# filled by isRe.
ggplot(df, aes(isSpam, fill = isRe)) +
  geom_bar()

# Show the number of spam and non-spam messages.
# Tabulate the data-frame column; a bare `isSpam` is not guaranteed to
# resolve to that column outside of `df`.
table(isSpam = df$isSpam)
# Output recorded from an earlier run:
## isSpam
## FALSE  TRUE
##  4864  1677

# Compare isDear with isSpam
ggplot(df, aes(isSpam, fill = isDear)) +
  geom_bar()

# Compare isYelling with isSpam
# The `+` must end the line: a line that starts with `+` is parsed as a
# separate statement, so the bar layer would never be added to the plot.
ggplot(df, aes(isSpam, fill = isYelling)) +
  geom_bar()