Sie sind auf Seite 1 von 11

Spam

Email Classification Austin Kinion


#1#use 'spam' from directory to classify T or F on each email#
# Flag each message as spam when its name contains the literal text "spam".
#
# x: named collection of messages (presumably the names are file paths whose
#    directory identifies spam folders -- confirm against the loader).
# Returns a logical vector with one entry per name; logical(0) when `x` has
# no names.
isSpam <- function(x) {
  # Use the argument rather than a global so the function works on any
  # message set. fixed = TRUE treats "spam" as a plain substring, not a regex.
  grepl("spam", names(x), fixed = TRUE)
}
#2#if the string Re: appears in the subject
# For each message, report whether the Subject header contains "Re:".
#
# x: list of messages; each message is read via `$header`, which is indexed
#    by the name "Subject".
# Returns an unnamed list of logical scalars, one per message. Messages with
# no header or no Subject field yield FALSE.
isRe <- function(x) {
  lapply(unname(x), function(msg) {
    subject <- unlist(msg$header["Subject"], use.names = FALSE)
    # Indexing a named vector by an absent name gives NA, not length zero,
    # so both cases have to be treated as "no subject".
    if (length(subject) == 0 || all(is.na(subject))) {
      return(FALSE)
    }
    grepl("Re:", paste(subject, collapse = ""))
  })
}
#3#if the string Fw: appears in the subject
# For each message, report whether the Subject header contains "Fw:".
#
# x: list of messages; each message is read via `$header`, which is indexed
#    by the name "Subject".
# Returns an unnamed list of logical scalars, one per message. Messages with
# no header or no Subject field yield FALSE.
isFw <- function(x) {
  lapply(unname(x), function(msg) {
    subject <- unlist(msg$header["Subject"], use.names = FALSE)
    # An absent Subject shows up as NA (named vector) or NULL (no header).
    if (length(subject) == 0 || all(is.na(subject))) {
      return(FALSE)
    }
    grepl("Fw:", paste(subject, collapse = ""))
  })
}
#4#count of the number of lines in the body of the email message
# Count the elements of each message's `$body` (one element per body line,
# assuming the body is stored as a character vector of lines -- confirm
# against the loader).
#
# x: list of messages.
# Returns an unnamed list of integer counts, one per message; a missing or
# empty body counts as 0.
numLinesInBody <- function(x) {
  # length(NULL) and length(character(0)) are both 0, so no special case is
  # needed for messages without a body.
  lapply(unname(x), function(msg) length(msg$body))
}
#5#if any punctuation appears in the subject
# For each message, report whether the Subject header contains any
# punctuation character (POSIX class [[:punct:]]).
#
# x: list of messages; each message is read via `$header`, which is indexed
#    by the name "Subject".
# Returns an unnamed list of logical scalars, one per message. Messages with
# no header or no Subject field yield FALSE.
isPunc <- function(x) {
  lapply(unname(x), function(msg) {
    subject <- unlist(msg$header["Subject"], use.names = FALSE)
    # An absent Subject shows up as NA (named vector) or NULL (no header).
    if (length(subject) == 0 || all(is.na(subject))) {
      return(FALSE)
    }
    grepl("[[:punct:]]", paste(subject, collapse = ""))
  })
}
#6#If the Subject of the message alternates from Capital to lowercase more than once
# For each message, report whether the Subject header contains at least two
# consecutive uppercase-then-lowercase letter pairs (e.g. "MaKe").
#
# x: list of messages; each message is read via `$header`, which is indexed
#    by the name "Subject".
# Returns an unnamed list of logical scalars, one per message. Messages with
# no header or no Subject field yield FALSE.
AlternateCap <- function(x) {
  lapply(unname(x), function(msg) {
    subject <- unlist(msg$header["Subject"], use.names = FALSE)
    # An absent Subject shows up as NA (named vector) or NULL (no header).
    if (length(subject) == 0 || all(is.na(subject))) {
      return(FALSE)
    }
    grepl("([A-Z][a-z]){2,}", paste(subject, collapse = ""))
  })
}
#7#count of the number of exclamation marks in the subject
# Count the exclamation marks in each message's Subject header.
#
# x: list of messages; each message is read via `$header`, which is indexed
#    by the name "Subject".
# Returns an unnamed list of integer counts, one per message. Messages with
# no header or no Subject field yield 0.
subjectExclamationCount <- function(x) {
  lapply(unname(x), function(msg) {
    subject <- unlist(msg$header["Subject"], use.names = FALSE)
    # An absent Subject shows up as NA (named vector) or NULL (no header).
    if (length(subject) == 0 || all(is.na(subject))) {
      return(0L)
    }
    hits <- gregexpr("!", paste(subject, collapse = ""), fixed = TRUE)[[1]]
    # gregexpr reports "no match" as a single -1, so count only real positions.
    sum(hits > 0)
  })
}
#8#count of the number of question marks in the subject
# Count the question marks in each message's Subject header.
#
# x: list of messages; each message is read via `$header`, which is indexed
#    by the name "Subject".
# Returns an unnamed list of integer counts, one per message. Messages with
# no header or no Subject field yield 0.
subjectQuestionCount <- function(x) {
  lapply(unname(x), function(msg) {
    subject <- unlist(msg$header["Subject"], use.names = FALSE)
    # An absent Subject shows up as NA (named vector) or NULL (no header).
    if (length(subject) == 0 || all(is.na(subject))) {
      return(0L)
    }
    # Match "?" literally. The earlier regex "//?" meant "a slash optionally
    # followed by a slash" and therefore counted slashes, not question marks.
    hits <- gregexpr("?", paste(subject, collapse = ""), fixed = TRUE)[[1]]
    # gregexpr reports "no match" as a single -1, so count only real positions.
    sum(hits > 0)
  })
}
#9#whether the Subject of the mail is in capital letters
# For each message, report whether the Subject header is written entirely in
# capital letters: it has at least one uppercase letter and no lowercase one.
#
# x: list of messages; each message is read via `$header`, which is indexed
#    by the name "Subject".
# Returns an unnamed list of logical scalars, one per message. Messages with
# no header or no Subject field yield FALSE.
isYelling <- function(x) {
  lapply(unname(x), function(msg) {
    subject <- unlist(msg$header["Subject"], use.names = FALSE)
    # Without this NA check a missing Subject is pasted into the string "NA",
    # which has no lowercase letters and would be reported as yelling.
    if (length(subject) == 0 || all(is.na(subject))) {
      return(FALSE)
    }
    subject <- paste(subject, collapse = "")
    # Require an uppercase letter so that empty or digits-only subjects are
    # not classified as yelling.
    grepl("[A-Z]", subject) && !grepl("[a-z]", subject)
  })
}
#10#if there are no blanks in the subject
# For each message, report whether the Subject header contains no space
# character at all.
#
# x: list of messages; each message is read via `$header`, which is indexed
#    by the name "Subject".
# Returns an unnamed list of logical scalars, one per message. Messages with
# no header or no Subject field yield FALSE.
SubjectBlanks <- function(x) {
  lapply(unname(x), function(msg) {
    subject <- unlist(msg$header["Subject"], use.names = FALSE)
    # Without this NA check a missing Subject is pasted into the string "NA",
    # which has no blanks and would be reported as TRUE.
    if (length(subject) == 0 || all(is.na(subject))) {
      return(FALSE)
    }
    !grepl("[ ]", paste(subject, collapse = ""))
  })
}
#11#whether the header states that the message is a multipart, i.e. with attachments.
# For each message, report whether the Content-Type header declares a
# multipart message.
#
# x: list of messages; each message is read via `$header`, which is indexed
#    by the name "Content-Type".
# Returns an unnamed list of logical scalars, one per message. Messages with
# no header or no Content-Type field yield FALSE.
multipartText <- function(x) {
  lapply(unname(x), function(msg) {
    # Read the field before any pasting: collapsing the header first drops
    # its names, after which a lookup by "Content-Type" can never succeed.
    content_type <- unlist(msg$header["Content-Type"], use.names = FALSE)
    if (length(content_type) == 0 || all(is.na(content_type))) {
      return(FALSE)
    }
    # Match any multipart subtype (mixed, alternative, ...); "multipart/text"
    # is not a registered subtype, so matching it alone finds nothing. MIME
    # type names are case-insensitive.
    any(grepl("multipart/", content_type, ignore.case = TRUE))
  })
}
#12#whether the subject contains one of spam phrases
# For each message, report whether the Subject header contains one of a
# fixed set of spam-associated words or phrases (case-sensitive).
#
# x: list of messages; each message is read via `$header`, which is indexed
#    by the name "Subject".
# Returns an unnamed list of logical scalars, one per message. Messages with
# no header or no Subject field yield FALSE.
subjectSpamWords <- function(x) {
  # Build the alternation from a vector so no line break can leak into the
  # pattern; a literal wrapped across lines embeds newlines that stop the
  # affected words from ever matching.
  spam_words <- c(
    "viagra", "pounds", "free", "weight", "guarantee", "millions",
    "dollars", "credit", "risk", "prescription", "generic", "drug",
    "money back", "credit card"
  )
  spam_pattern <- paste(spam_words, collapse = "|")

  lapply(unname(x), function(msg) {
    subject <- unlist(msg$header["Subject"], use.names = FALSE)
    # An absent Subject shows up as NA (named vector) or NULL (no header).
    if (length(subject) == 0 || all(is.na(subject))) {
      return(FALSE)
    }
    grepl(spam_pattern, paste(subject, collapse = ""))
  })
}
#13#whether the message body contains a form of the introduction Dear ...
# For each message, report whether the body contains "Dear", "DEAR" or
# "dear".
#
# x: list of messages; each message's text is read from `$body`.
# Returns an unnamed list of logical scalars, one per message. Messages with
# a missing or empty body yield FALSE.
isDear <- function(x) {
  lapply(unname(x), function(msg) {
    if (length(msg$body) == 0) {
      return(FALSE)
    }
    # Body elements are joined with no separator before searching.
    grepl("Dear|DEAR|dear", paste(msg$body, collapse = ""))
  })
}
#14#If the string "vacation" occurs in the body of the message
# For each message, report whether the body contains "Vacation", "VACATION"
# or "vacation".
#
# x: list of messages; each message's text is read from `$body`.
# Returns an unnamed list of logical scalars, one per message. Messages with
# a missing or empty body yield FALSE.
isVacation <- function(x) {
  lapply(unname(x), function(msg) {
    if (length(msg$body) == 0) {
      return(FALSE)
    }
    # Body elements are joined with no separator before searching.
    grepl("Vacation|VACATION|vacation", paste(msg$body, collapse = ""))
  })
}
##15#If there is a url link in the body of the message
# For each message, report whether any body element contains an http(s) URL
# of the form scheme://host.tld, where the top-level domain (2-6 letters) is
# followed by "/", a blank, or the end of that element.
#
# x: list of messages; each message's text is read from `$body`.
# Returns an unnamed list of logical scalars, one per message. Messages with
# a missing or empty body yield FALSE.
isLink <- function(x) {
  # Single-line pattern with a properly closed [[:alpha:]] class. A pattern
  # literal wrapped across lines embeds a newline, and an unclosed class
  # swallows the quantifier, so neither form can match a real URL.
  url_pattern <- "https?://[a-zA-Z0-9.-]+[.][[:alpha:]]{2,6}([/[:blank:]]|$)"

  lapply(unname(x), function(msg) {
    if (length(msg$body) == 0) {
      return(FALSE)
    }
    # Search element by element so a URL that ends one element is not glued
    # to the text that starts the next.
    any(grepl(url_pattern, msg$body))
  })
}

##16#the number of dollar signs in the body of the message


# Count the dollar signs in each message's body.
#
# x: list of messages; each message's text is read from `$body`.
# Returns an unnamed list of integer counts, one per message. Messages with
# a missing or empty body yield 0.
numDollarSigns <- function(x) {
  lapply(unname(x), function(msg) {
    if (length(msg$body) == 0) {
      return(0L)
    }
    # fixed = TRUE matches "$" literally instead of as the end-of-string
    # anchor.
    hits <- gregexpr("$", paste(msg$body, collapse = ""), fixed = TRUE)[[1]]
    # gregexpr reports "no match" as a single -1, so count only real positions.
    sum(hits > 0)
  })
}
#17#Number of Capital letters in Subject of message
# Count the capital letters (A-Z) in each message's Subject header.
#
# x: list of messages; each message is read via `$header`, which is indexed
#    by the name "Subject".
# Returns an unnamed list of integer counts, one per message. Messages with
# no header or no Subject field yield 0.
SubjectCapital <- function(x) {
  lapply(unname(x), function(msg) {
    subject <- unlist(msg$header["Subject"], use.names = FALSE)
    # Without this NA check a missing Subject is pasted into the string "NA"
    # and counted as two capital letters.
    if (length(subject) == 0 || all(is.na(subject))) {
      return(0L)
    }
    hits <- gregexpr("[A-Z]", paste(subject, collapse = ""))[[1]]
    # gregexpr reports "no match" as a single -1, so count only real positions.
    sum(hits > 0)
  })
}
#18#whether the body text includes a line indicating the word wrote:
# For each message, report whether any body element contains the text
# "wrote:".
#
# x: list of messages; each message's text is read from `$body`.
# Returns an unnamed list of logical scalars, one per message. Messages with
# a missing or empty body yield FALSE.
isWrote <- function(x) {
  lapply(unname(x), function(msg) {
    if (length(msg$body) == 0) {
      return(FALSE)
    }
    # Match "wrote:" without requiring a trailing space and search each
    # element separately: when the marker ends an element there is no space
    # after it, so demanding one would miss that case.
    any(grepl("wrote:", msg$body, fixed = TRUE))
  })
}
#19#the number of letters and numbers in the body of the email message
# Count the ASCII letters and digits in each message's body.
#
# x: list of messages; each message's text is read from `$body`.
# Returns an unnamed list of integer counts, one per message. Messages with
# a missing or empty body yield 0.
bodyCharacterCount <- function(x) {
  lapply(unname(x), function(msg) {
    if (length(msg$body) == 0) {
      return(0L)
    }
    hits <- gregexpr("[A-Za-z0-9]", paste(msg$body, collapse = ""))[[1]]
    # gregexpr reports "no match" as a single -1, so count only real positions.
    sum(hits > 0)
  })
}
#20#How many lines that html tags exist in the header of the message
# Count tag-like spans in each message's header: matches of the regular
# expression "<[^>/][^.]*>" against all header values joined into one string
# with no separator.
#
# x: list of messages; each message's fields are read from `$header`.
# Returns an unnamed list of integer counts, one per message. Messages with
# a missing or empty header yield 0.
TagExists <- function(x) {
  lapply(unname(x), function(msg) {
    if (length(msg$header) == 0) {
      return(0L)
    }
    hits <- gregexpr("<[^>/][^.]*>", paste(msg$header, collapse = ""))[[1]]
    # gregexpr reports "no match" as a single -1, so count only real positions.
    sum(hits > 0)
  })
}
# Build the feature table: one row per message in `trainMessages`, one column
# per feature extractor. Columns are named where they are created so a label
# cannot drift away from the function that produced it.
df <- data.frame(
  isSpam = unlist(isSpam(trainMessages)),
  isRe = unlist(isRe(trainMessages)),
  isFw = unlist(isFw(trainMessages)),
  numLinesInBody = unlist(numLinesInBody(trainMessages)),
  isPunc = unlist(isPunc(trainMessages)),
  AlternateCap = unlist(AlternateCap(trainMessages)),
  SubjectExclamationCount = unlist(subjectExclamationCount(trainMessages)),
  subjectQuestionCount = unlist(subjectQuestionCount(trainMessages)),
  isYelling = unlist(isYelling(trainMessages)),
  pSubjectblanks = unlist(SubjectBlanks(trainMessages)),
  multipartText = unlist(multipartText(trainMessages)),
  subjectSpamWords = unlist(subjectSpamWords(trainMessages)),
  isDear = unlist(isDear(trainMessages)),
  isVacation = unlist(isVacation(trainMessages)),
  isLink = unlist(isLink(trainMessages)),
  numDollarSign = unlist(numDollarSigns(trainMessages)),
  SubjectCapital = unlist(SubjectCapital(trainMessages)),
  # Named after its contents; the positional label for this column used to
  # be "priority" even though it holds the body character count.
  bodyCharacterCount = unlist(bodyCharacterCount(trainMessages)),
  isWrote = unlist(isWrote(trainMessages)),
  TagExists = unlist(TagExists(trainMessages))
)
library(ggplot2)
# Show isRe for isSpam
ggplot(df, aes(isSpam, fill = isRe)) + geom_bar()
# Show number of spam and non-spam messages. A bare `isSpam` refers to the
# function, which table() cannot tabulate, so tabulate the data-frame column.
table(isSpam = df$isSpam)
## isSpam
## FALSE TRUE
## 4864 1677
# Compare isDear with isSpam
ggplot(df, aes(isSpam, fill = isDear)) + geom_bar()
# Compare isYelling with isSpam
ggplot(df, aes(isSpam, fill = isYelling)) + geom_bar()

Das könnte Ihnen auch gefallen