机器学习——垃圾邮件过滤（R语言）

100 阅读 0 评论 66 点赞

我是靠谱客的博主虚幻发夹，最近开发中收集的这篇文章主要介绍机器学习——垃圾邮件过滤（R语言），觉得挺不错的，现在分享给大家，希望可以做个参考。

概述

以下是根据《机器学习：实用案例解析》一书中的案例写的，感觉结果并没有达到预期的状态，欢迎大家指出我其中理解错的地方，哈哈哈。
 
 原始数据：SpamAssassin语料库，
http://spamassassin.apache.org/publiccorpus/

spam：垃圾邮件

easy ham：易识别的正常邮件

hard ham：不易识别的正常邮件

文本分类算法：朴素贝叶斯分类器（假设所有词之间频次统计相互独立）

a）明显更可能在垃圾邮件中出现的词；

b）明显更可能在非垃圾邮件中出现的词。

分类结果：

a）假设电子邮件是垃圾邮件，看到其具体内容的概率；

b）假设电子邮件不是垃圾邮件，看到相同内容的概率。

预测：通过邮件正文内容本身预测邮件的类别

思路：

a）原始邮件预处理：去除头部，保留邮件正文；

b）构建特征词类别知识库，通过从邮件正文中抽取特征词来构建邮件分类器的特征集。

语料库构造：量化特征词频率——构造一个词项-文档矩阵（TDM），即N*M的矩阵，[i，j]表示词项i在文档j中出现的次数

c）构建分类器：

构建一个数据框来保存所有特征项在垃圾邮件中的条件概率；
能在已知观测特征的前提下计算出邮件是垃圾的概率；
根据有多少邮件包含这个特征词项来定义一封邮件是垃圾邮件的条件概率；

一、首先处理spam垃圾邮件：

#加载程序包

library(tm)

library(ggplot2)

# Corpus folder locations, relative to the working directory.
# spam and easy_ham supply the training messages, hard_ham is the first test
# set, and the "_2" folders are classified in the final evaluation.

spam.path <- file.path("data", "spam")

spam2.path <- file.path("data", "spam_2")

easyham.path <- file.path("data", "easy_ham")

easyham2.path <- file.path("data", "easy_ham_2")

hardham.path <- file.path("data", "hard_ham")

hardham2.path <- file.path("data", "hard_ham_2")

# 读取文件正文内容

# 每份邮件包含头部和正文两个部分一般由第一个空行分割（协议规定）

#打开每个文件，找到空行，并将空行之后的文本返回一个字符串向量，就是空行之后的所有文本拼接之后字符串。

# Read one raw e-mail file and return its body as a single string.
#
# A message is a header and a body separated by the first empty line; every
# line after that separator is joined with newline characters.
#
# Args:
#   path: path of the message file to read.
#
# Returns:
#   A length-one character vector holding the body, or "" when the file has no
#   empty separator line or nothing follows it.
get.msg <- function(path)
{
  # "rt" opens the file in text mode; the bytes are decoded as latin1.
  con <- file(path, open = "rt", encoding = "latin1")
  # Release the connection even when reading fails.
  on.exit(close(con), add = TRUE)

  # One vector element per line of the file.
  text <- readLines(con)

  # Index of the header/body separator (NA when the file has no empty line).
  first.blank <- which(text == "")[1]

  # Without a separator, or with the separator on the last line, there is no
  # body. Returning "" keeps an error object from being pasted into the text.
  if (is.na(first.blank) || first.blank >= length(text))
  {
    return("")
  }

  body <- text[seq(first.blank + 1, length(text), 1)]

  # Join with a real newline so that words on adjacent lines stay separate.
  paste(body, collapse = "\n")
}

#保存邮件内容，使得向量的每个元素就是一封邮件内容。

#dir 获得路径下所有文件名列表

# File names found in the spam training folder.
spam.docs <- dir(spam.path)

# "cmds" holds the shell commands shipped with the corpus, not a message.
spam.docs <- spam.docs[spam.docs != "cmds"]

# Read the body of every message. file.path() inserts the separator between
# the folder and the file name; pasting them with sep = "" produced paths such
# as "data/spam0001..." that do not exist.
all.spam <- vapply(spam.docs,
                   function(p) get.msg(file.path(spam.path, p)),
                   character(1))

# Each element is named after the file it was read from.
head(all.spam)

#构建一个文本资料库

#定义get.tdm函数：输入邮件文本向量；输出TDM；

#tm包提供若干方法用于构建语料库；

# Build a term-document matrix (TDM) from a vector of message bodies.
#
# Args:
#   doc.vec: character vector with one message body per element.
#
# Returns:
#   The object produced by tm's TermDocumentMatrix() for those messages.
get.tdm <- function(doc.vec)
{
  # VectorSource() turns each vector element into one document of the corpus.
  message.corpus <- Corpus(VectorSource(doc.vec))

  # Pre-processing options handed to tm:
  #   stopwords         - drop tm's English stop words (list them with stopwords())
  #   removePunctuation - strip punctuation
  #   removeNumbers     - strip digits
  #   minDocFreq        - meant to keep only terms that occur more than once
  # NOTE(review): confirm that the installed tm version still honours
  # `minDocFreq`; verify against the tm documentation.
  tdm.options <- list(stopwords = TRUE,
                      removePunctuation = TRUE,
                      removeNumbers = TRUE,
                      minDocFreq = 2)

  TermDocumentMatrix(message.corpus, tdm.options)
}

# Term-document matrix of the spam training messages.
spam.tdm <- get.tdm(all.spam)

# Plain R matrix: one row per term, one column per message.
spam.matrix <- as.matrix(spam.tdm)

# Total number of times each term occurs across all spam messages.
spam.counts <- rowSums(spam.matrix)

# One row per term. The columns are created directly with their final types;
# binding them with cbind() first would turn the counts into strings.
spam.df <- data.frame(term = names(spam.counts),
                      frequency = as.numeric(spam.counts),
                      stringsAsFactors = FALSE)

# Share of spam messages that contain each term at least once. The comparison
# and rowSums() work on the whole matrix, so no 1:nrow() loop is needed.
spam.occurrence <- unname(rowSums(spam.matrix > 0)) / ncol(spam.matrix)

# Share of all term occurrences in the spam corpus that belong to each term.
# Not used for classification, but useful when comparing individual terms.
spam.density <- spam.df$frequency / sum(spam.df$frequency)

spam.df <- transform(spam.df,
                     density = spam.density,
                     occurrence = spam.occurrence)

# Terms found in the largest share of spam messages.
head(spam.df[order(-spam.df$occurrence), ])

二、构建正常邮件的训练数据

与垃圾邮件一样，路径使用data/easy_ham

选取和垃圾邮件一样的数量500封。

# 易识别的正常邮件

# File names in the easy-ham training folder, without the "cmds" script.
easyham.docs <- dir(easyham.path)
easyham.docs <- easyham.docs[easyham.docs != "cmds"]

# Train on as many ham messages as there are spam messages. head() returns
# fewer names when the folder is smaller, where 1:length(spam.docs) indexing
# would produce NA file names.
all.easyham <- vapply(head(easyham.docs, length(spam.docs)),
                      function(p) get.msg(file.path(easyham.path, p)),
                      character(1))

# Term-document matrix and its plain-matrix form (terms x messages).
easyham.tdm <- get.tdm(all.easyham)
easyham.matrix <- as.matrix(easyham.tdm)

# Total number of times each term occurs across the ham training messages.
easyham.counts <- rowSums(easyham.matrix)

# One row per term, with the counts kept numeric from the start.
easyham.df <- data.frame(term = names(easyham.counts),
                         frequency = as.numeric(easyham.counts),
                         stringsAsFactors = FALSE)

# Share of ham messages that contain each term at least once.
easyham.occurrence <- unname(rowSums(easyham.matrix > 0)) / ncol(easyham.matrix)

# Share of all term occurrences in the ham corpus that belong to each term.
easyham.density <- easyham.df$frequency / sum(easyham.df$frequency)

easyham.df <- transform(easyham.df,
                        density = easyham.density,
                        occurrence = easyham.occurrence)

# Terms found in the largest share of ham messages.
head(easyham.df[order(-easyham.df$occurrence), ])

词 频次 概率 文档占比

分析：对于文档来说出现频率高的词，如email，please，其出现的频次并不是最多；

如果用频次来作为垃圾邮件的训练数据，会把某些包含table的垃圾邮件权重调得太高，因为并不是所有的垃圾邮件都是table方式生成的；

===所以根据有多少邮件包含这个特征词来定义一封邮件是垃圾邮件的概率，即occurrence；

分析：对比垃圾邮件和正常邮件可以发现，正常邮件的特征词占比与词频比较稀疏平均；

===如果一封邮件中包含有一两个与垃圾邮件非常相关的词，那么就需要很多非垃圾词才能将其分类为正常邮件。

三、定义分类器并用不易识别的正常邮件进行测试

思考：对于新邮件中那些在训练集中的词与不在训练集中的词，该如何处理？

思路：计算各个词在训练集中的概率乘积；

对于已在的，直接计算条件概率；

对于未在的，根据某些分布给它们赋予一个随机概率；或者用自然语言处理（NLP）估计一个词项在上下文中的“垃圾倾向”。

===给这些没出现在数据集中的词赋予一个特别小的概率0.0001%；

处理：每个类别的先验概率都默认50%（假设是垃圾邮件和正常邮件的可能性相同）；

对新的邮件抽取内容，统计词项。

# 定义一个分类器函数

# get.msg 抽取出邮件正文

# get.tdm 转换成TDM

# rowSums 计算特征词项的频率

# Score one message against one class of training data.
#
# Args:
#   path: file of the message to score.
#   training.df: data frame with a `term` column and an `occurrence` column.
#   prior: prior probability of the class.
#   c: probability given to every message term that is absent from training.df.
#
# Returns:
#   prior * product of `occurrence` over the message terms found in
#   training.df * c ^ (number of message terms not found there).
#
# NOTE: the result is a product of many values below 1. With the default
# c = 1e-6, 54 or more unmatched terms already drive c ^ n to exactly 0 in
# double precision, so scores of long messages can all collapse to 0.
classify.email <- function(path, training.df, prior = 0.5, c = 1e-6)
{
  # Per-term totals of the single message; the names are the terms.
  term.totals <- rowSums(as.matrix(get.tdm(get.msg(path))))

  # Message terms that also occur in the training data.
  known.terms <- intersect(names(term.totals), training.df$term)
  known.probs <- training.df$occurrence[match(known.terms, training.df$term)]

  # With no known term prod() of an empty vector is 1, which leaves
  # prior * c ^ (number of terms): no separate branch is needed for that case.
  unknown.count <- length(term.totals) - length(known.terms)
  prior * prod(known.probs) * c ^ unknown.count
}

# 用不易识别的正常邮件进行测试

# 获取文件路径，sapply封装对垃圾邮件和正常邮件的测试

# Hard-ham test files, without the "cmds" script.
hardham.docs <- setdiff(dir(hardham.path), "cmds")

# Score of every hard-ham message against the spam training data ...
hardham.spamtest <- sapply(hardham.docs,
                           function(doc) classify.email(path = file.path(hardham.path, doc),
                                                        training.df = spam.df))

# ... and against the ham training data.
hardham.hamtest <- sapply(hardham.docs,
                          function(doc) classify.email(path = file.path(hardham.path, doc),
                                                       training.df = easyham.df))

# TRUE where the spam score is the larger of the two.
hardham.res <- hardham.spamtest > hardham.hamtest

# Counts of messages labelled spam (TRUE) and ham (FALSE).
summary(hardham.res)

四、用所有邮件类型测试分类器

# 定义一个函数用于一次性完成对所有邮件概率的比较

# 根据spam.df和easyham.df判定是否为垃圾邮件

# Classify one message as spam or ham using spam.df and easyham.df.
#
# The decision is made on log scores. The raw score is a product of many
# values below 1 and underflows to 0 for long messages; comparing two zeros
# would label every such message as ham.
#
# Args:
#   path: file of the message to classify.
#   spam.prior: prior probability of spam.
#   ham.prior: prior probability of ham.
#   unseen.prob: probability given to a message term missing from a training set.
#
# Returns:
#   Numeric vector of length 3: the spam score, the ham score (both on the
#   probability scale, so they may print as 0 for long messages) and 1 when
#   the message is classified as spam, 0 otherwise.
spam.classifier <- function(path, spam.prior = 0.5, ham.prior = 0.5,
                            unseen.prob = 1e-6)
{
  # Terms of the message, extracted once and scored against both classes.
  msg.terms <- rownames(as.matrix(get.tdm(get.msg(path))))

  # log(prior) + sum of log(occurrence) over known terms
  #            + (number of unknown terms) * log(unseen.prob)
  log.score <- function(training.df, prior)
  {
    known <- intersect(msg.terms, training.df$term)
    known.probs <- training.df$occurrence[match(known, training.df$term)]
    log(prior) + sum(log(known.probs)) +
      (length(msg.terms) - length(known)) * log(unseen.prob)
  }

  log.spam <- log.score(spam.df, spam.prior)
  log.ham <- log.score(easyham.df, ham.prior)

  c(exp(log.spam), exp(log.ham), as.numeric(log.spam > log.ham))
}

# 获取所有的邮件列表

# File names of the three evaluation sets, without the "cmds" script.
easyham2.docs <- setdiff(dir(easyham2.path), "cmds")
hardham2.docs <- setdiff(dir(hardham2.path), "cmds")
spam2.docs <- setdiff(dir(spam2.path), "cmds")

# Classify every message of each set. file.path() is vectorised, so the full
# paths are built in one call and handed to spam.classifier one at a time.

# Easy ham
easyham2.class <- suppressWarnings(
  lapply(file.path(easyham2.path, easyham2.docs), spam.classifier))

# Hard ham
hardham2.class <- suppressWarnings(
  lapply(file.path(hardham2.path, hardham2.docs), spam.classifier))

# Spam
spam2.class <- suppressWarnings(
  lapply(file.path(spam2.path, spam2.docs), spam.classifier))

# 创建数据框存放结果

# Stack the per-message results of each set into one matrix per set
# (columns: spam score, ham score, class flag).
easyham2.matrix <- do.call(rbind, easyham2.class)
hardham2.matrix <- do.call(rbind, hardham2.class)
spam2.matrix <- do.call(rbind, spam2.class)

# Append the true message type as a fourth column. This makes the matrices
# character matrices, so the numeric columns are converted back below.
easyham2.final <- cbind(easyham2.matrix, "EASYHAM")
hardham2.final <- cbind(hardham2.matrix, "HARDHAM")
spam2.final <- cbind(spam2.matrix, "SPAM")

# All three sets in a single data frame.
class.matrix <- rbind(easyham2.final, hardham2.final, spam2.final)
class.df <- as.data.frame(class.matrix, stringsAsFactors = FALSE)
names(class.df) <- c("Pr.SPAM", "Pr.HAM", "Class", "Type")

# Restore the column types: numeric scores, logical class, factor type.
score.cols <- c("Pr.SPAM", "Pr.HAM")
class.df[score.cols] <- lapply(class.df[score.cols], as.numeric)
rm(score.cols)
class.df$Class <- as.logical(as.numeric(class.df$Class))
class.df$Type <- factor(class.df$Type)

# First rows of the result.
head(class.df)

# 统计识别率

# Shares of FALSE and TRUE values in a logical vector.
#
# Args:
#   bool.vector: logical vector of classification flags.
#
# Returns:
#   Numeric vector of length 2: the share of FALSE values, then the share of
#   TRUE values. NA entries count towards the total but towards neither share.
get.results <- function(bool.vector)
{
  total <- length(bool.vector)
  false.count <- sum(bool.vector == FALSE, na.rm = TRUE)
  true.count <- sum(bool.vector == TRUE, na.rm = TRUE)
  c(false.count, true.count) / total
}

# Share of messages labelled ham / spam within each true message type.
easyham2.col <- get.results(class.df$Class[which(class.df$Type == "EASYHAM")])
hardham2.col <- get.results(class.df$Class[which(class.df$Type == "HARDHAM")])
spam2.col <- get.results(class.df$Class[which(class.df$Type == "SPAM")])

# One row per message type, one column per predicted label.
class.res <- rbind(easyham2.col, hardham2.col, spam2.col)
colnames(class.res) <- c("NOT SPAM", "SPAM")

print(class.res)

邮件类型 3045	正常邮件	垃圾邮件
易识别的正常邮件 1400	1386 （99%）	14（1%）
不易识别的正常邮件 248	240 （96.8%）	8 （3.2%）
垃圾邮件 1397	651 （46.6%）	746 （53.4%）

分析：可以看出对于正常邮件，分类效果还不错，但是对于垃圾邮件，效果并不是很理想；

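主要问题是许多垃圾邮件被判为正常邮件，
在无法确定的情况下，程序判为正常邮件的概率大，说明垃圾邮件的训练数据不足，与垃圾邮件相关的特征没有被纳入。
另一个重要原因是数值下溢：classify.email 返回的是许多小于1的数的连乘积，当邮件中未出现在训练集里的词项达到54个及以上时，仅 c^n（c = 1e-6）这一项就已低于双精度浮点数可表示的最小值而变为0；此时 pr.spam 与 pr.ham 同为0，pr.spam > pr.ham 为 FALSE，邮件一律被判为正常邮件。改为比较对数概率（各项取 log 后求和）即可避免这个问题。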

五、改进

正常情况下垃圾邮件和正常邮件的比例为2：8，首先尝试改进先验概率；

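效果并不明显，即使先验概率改为1，也就是当无法判断的时候都判断为垃圾邮件，对垃圾邮件的分类效果依然不好，是否可能正常邮件训练集中的特征有问题，存在一些垃圾邮件和正常邮件中都会出现的，而原先的训练中没被垃圾邮件收入，而被正常邮件收录了的？？？（这是个人运行后存在的问题，还有待解决。）需要注意：classify.email 对垃圾邮件和正常邮件两次打分时使用的是同一个 prior 默认值，相同的先验在比较 pr.spam 与 pr.ham 时会被约掉，因此无论把它改成多少，分类结果都不会变化；要让先验起作用，必须给两个类别分别传入不同的值（例如垃圾邮件0.2、正常邮件0.8）。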

最后附上完整的R代码

# Load the packages the script depends on, installing any that are missing.
# The script does not clear the workspace itself: start it in a fresh R
# session when a clean environment is wanted, so that objects belonging to
# the caller are never deleted.
if (!requireNamespace("tm", quietly = TRUE)) {
  install.packages("tm")
}
library(tm)
if (!requireNamespace("ggplot2", quietly = TRUE)) {
  install.packages("ggplot2")
}
library(ggplot2)
# Corpus folder locations, relative to the working directory.
# spam and easy_ham supply the training messages, hard_ham is the first test
# set, and the "_2" folders are classified in the final evaluation.
spam.path <- file.path("data", "spam")
spam2.path <- file.path("data", "spam_2")
easyham.path <- file.path("data", "easy_ham")
easyham2.path <- file.path("data", "easy_ham_2")
hardham.path <- file.path("data", "hard_ham")
hardham2.path <- file.path("data", "hard_ham_2")
# 首先处理垃圾邮件
# 定义函数，用于读取邮件正文内容，以第一个空行作为开始标志
# Read one raw e-mail file and return its body as a single string.
#
# The body is everything after the first empty line of the file, joined with
# newline characters.
#
# Args:
#   path: path of the message file to read.
#
# Returns:
#   A length-one character vector holding the body, or "" when the file has no
#   empty separator line or nothing follows it.
get.msg <- function(path)
{
  con <- file(path, open = "rt", encoding = "latin1")
  # Release the connection even when reading fails.
  on.exit(close(con), add = TRUE)
  text <- readLines(con)

  # Index of the header/body separator (NA when the file has no empty line).
  first.blank <- which(text == "")[1]

  # No separator, or nothing after it: return "" so that no error object ends
  # up pasted into the message text.
  if (is.na(first.blank) || first.blank >= length(text))
  {
    return("")
  }

  body <- text[seq(first.blank + 1, length(text), 1)]
  # A real newline keeps the words of adjacent lines apart.
  paste(body, collapse = "\n")
}
# 构建一个文本资料库
# Build a term-document matrix (TDM) from a vector of message bodies.
#
# Args:
#   doc.vec: character vector with one message body per element.
#
# Returns:
#   The object produced by tm's TermDocumentMatrix() for those messages.
get.tdm <- function(doc.vec)
{
  # Options passed to tm: drop stop words, punctuation and digits.
  # NOTE(review): confirm that the installed tm version still honours
  # `minDocFreq`; verify against the tm documentation.
  tdm.options <- list(stopwords = TRUE,
                      removePunctuation = TRUE,
                      removeNumbers = TRUE,
                      minDocFreq = 2)

  # Each element of doc.vec becomes one document of the corpus.
  TermDocumentMatrix(Corpus(VectorSource(doc.vec)), tdm.options)
}
# 保存邮件内容
# Spam training files; "cmds" is the script shipped with the corpus, not a message.
spam.docs <- dir(spam.path)
spam.docs <- spam.docs[spam.docs != "cmds"]

# Body of every spam message, named after its file.
all.spam <- vapply(spam.docs,
                   function(p) get.msg(file.path(spam.path, p)),
                   character(1))

# Term-document matrix and its plain-matrix form (terms x messages).
spam.tdm <- get.tdm(all.spam)
spam.matrix <- as.matrix(spam.tdm)

# Total number of times each term occurs across all spam messages.
spam.counts <- rowSums(spam.matrix)

# One row per term; the columns get their final types directly instead of
# being forced through a character matrix by cbind().
spam.df <- data.frame(term = names(spam.counts),
                      frequency = as.numeric(spam.counts),
                      stringsAsFactors = FALSE)

# Share of spam messages containing each term, computed on the whole matrix
# rather than with a 1:nrow() loop.
spam.occurrence <- unname(rowSums(spam.matrix > 0)) / ncol(spam.matrix)

# Share of all term occurrences in the corpus that belong to each term.
spam.density <- spam.df$frequency / sum(spam.df$frequency)

spam.df <- transform(spam.df,
                     density = spam.density,
                     occurrence = spam.occurrence)

# Top terms by share of messages, then by raw count.
head(spam.df[order(-spam.df$occurrence), ])
head(spam.df[order(-spam.df$frequency), ])
# 易识别的正常邮件
# Easy-ham training files, without the "cmds" script.
easyham.docs <- dir(easyham.path)
easyham.docs <- easyham.docs[easyham.docs != "cmds"]

# Train on as many ham messages as there are spam messages. head() returns
# fewer names when the folder is smaller, where 1:length(spam.docs) indexing
# would produce NA file names.
all.easyham <- vapply(head(easyham.docs, length(spam.docs)),
                      function(p) get.msg(file.path(easyham.path, p)),
                      character(1))

# Term-document matrix and its plain-matrix form (terms x messages).
easyham.tdm <- get.tdm(all.easyham)
easyham.matrix <- as.matrix(easyham.tdm)

# Total number of times each term occurs across the ham training messages.
easyham.counts <- rowSums(easyham.matrix)

# One row per term, with the counts kept numeric from the start.
easyham.df <- data.frame(term = names(easyham.counts),
                         frequency = as.numeric(easyham.counts),
                         stringsAsFactors = FALSE)

# Share of ham messages containing each term.
easyham.occurrence <- unname(rowSums(easyham.matrix > 0)) / ncol(easyham.matrix)

# Share of all term occurrences in the corpus that belong to each term.
easyham.density <- easyham.df$frequency / sum(easyham.df$frequency)

easyham.df <- transform(easyham.df,
                        density = easyham.density,
                        occurrence = easyham.occurrence)

# Top terms by share of messages, then by raw count.
head(easyham.df[order(-easyham.df$occurrence), ])
head(easyham.df[order(-easyham.df$frequency), ])
# 定义一个分类器函数
# 寻找邮件特征词与训练集的交集
# Score one message against one class of training data.
#
# Args:
#   path: file of the message to score.
#   training.df: data frame with a `term` column and an `occurrence` column.
#   prior: prior probability of the class.
#   c: probability given to every message term that is absent from training.df.
#
# Returns:
#   prior * product of `occurrence` over the message terms found in
#   training.df * c ^ (number of message terms not found there).
#
# NOTE: the result is a product of many values below 1. With the default
# c = 1e-6, 54 or more unmatched terms already drive c ^ n to exactly 0 in
# double precision, so scores of long messages can all collapse to 0.
classify.email <- function(path, training.df, prior = 0.8, c = 1e-6)
{
  # Per-term totals of the single message; the names are the terms.
  term.totals <- rowSums(as.matrix(get.tdm(get.msg(path))))

  # Message terms that also occur in the training data.
  known.terms <- intersect(names(term.totals), training.df$term)
  known.probs <- training.df$occurrence[match(known.terms, training.df$term)]

  # prod() of an empty vector is 1, so a message without any known term
  # yields prior * c ^ (number of terms) without a separate branch.
  unknown.count <- length(term.totals) - length(known.terms)
  prior * prod(known.probs) * c ^ unknown.count
}
# 用不易识别的正常邮件进行测试
# Hard-ham test files, without the "cmds" script.
hardham.docs <- setdiff(dir(hardham.path), "cmds")

# Score of every hard-ham message against the spam training data ...
hardham.spamtest <- sapply(hardham.docs,
                           function(doc) classify.email(path = file.path(hardham.path, doc),
                                                        training.df = spam.df))

# ... and against the ham training data.
hardham.hamtest <- sapply(hardham.docs,
                          function(doc) classify.email(path = file.path(hardham.path, doc),
                                                       training.df = easyham.df))

# TRUE where the spam score is the larger of the two.
hardham.res <- hardham.spamtest > hardham.hamtest
summary(hardham.res)
# 定义一个函数用于一次性完成对所有邮件概率的比较
# Classify one message as spam or ham using spam.df and easyham.df.
#
# Two things differ from a plain comparison of raw scores:
#   * Each class gets its own prior (spam 0.2, ham 0.8, the 2:8 ratio). A
#     single prior shared by both classes cancels out of the comparison and
#     cannot change any decision.
#   * The decision is made on log scores. The raw score is a product of many
#     values below 1 and underflows to 0 for long messages; comparing two
#     zeros would label every such message as ham.
#
# Args:
#   path: file of the message to classify.
#   spam.prior: prior probability of spam.
#   ham.prior: prior probability of ham.
#   unseen.prob: probability given to a message term missing from a training set.
#
# Returns:
#   Numeric vector of length 3: the spam score, the ham score (both on the
#   probability scale, so they may print as 0 for long messages) and 1 when
#   the message is classified as spam, 0 otherwise.
spam.classifier <- function(path, spam.prior = 0.2, ham.prior = 0.8,
                            unseen.prob = 1e-6)
{
  # Terms of the message, extracted once and scored against both classes.
  msg.terms <- rownames(as.matrix(get.tdm(get.msg(path))))

  # log(prior) + sum of log(occurrence) over known terms
  #            + (number of unknown terms) * log(unseen.prob)
  log.score <- function(training.df, prior)
  {
    known <- intersect(msg.terms, training.df$term)
    known.probs <- training.df$occurrence[match(known, training.df$term)]
    log(prior) + sum(log(known.probs)) +
      (length(msg.terms) - length(known)) * log(unseen.prob)
  }

  log.spam <- log.score(spam.df, spam.prior)
  log.ham <- log.score(easyham.df, ham.prior)

  c(exp(log.spam), exp(log.ham), as.numeric(log.spam > log.ham))
}
# 获取所有的邮件列表
# File names of the three evaluation sets, without the "cmds" script.
easyham2.docs <- setdiff(dir(easyham2.path), "cmds")
hardham2.docs <- setdiff(dir(hardham2.path), "cmds")
spam2.docs <- setdiff(dir(spam2.path), "cmds")

# Classify every message of each set; the full paths are built in one
# vectorised file.path() call and handed to spam.classifier one at a time.
# Easy ham
easyham2.class <- suppressWarnings(
  lapply(file.path(easyham2.path, easyham2.docs), spam.classifier))
# Hard ham
hardham2.class <- suppressWarnings(
  lapply(file.path(hardham2.path, hardham2.docs), spam.classifier))
# Spam
spam2.class <- suppressWarnings(
  lapply(file.path(spam2.path, spam2.docs), spam.classifier))
# 创建数据框存放结果
# Stack the per-message results of each set into one matrix per set
# (columns: spam score, ham score, class flag).
easyham2.matrix <- do.call(rbind, easyham2.class)
hardham2.matrix <- do.call(rbind, hardham2.class)
spam2.matrix <- do.call(rbind, spam2.class)

# Append the true message type as a fourth column. This makes the matrices
# character matrices, so the numeric columns are converted back below.
easyham2.final <- cbind(easyham2.matrix, "EASYHAM")
hardham2.final <- cbind(hardham2.matrix, "HARDHAM")
spam2.final <- cbind(spam2.matrix, "SPAM")

# All three sets in a single data frame.
class.matrix <- rbind(easyham2.final, hardham2.final, spam2.final)
class.df <- as.data.frame(class.matrix, stringsAsFactors = FALSE)
names(class.df) <- c("Pr.SPAM", "Pr.HAM", "Class", "Type")

# Restore the column types: numeric scores, logical class, factor type.
score.cols <- c("Pr.SPAM", "Pr.HAM")
class.df[score.cols] <- lapply(class.df[score.cols], as.numeric)
rm(score.cols)
class.df$Class <- as.logical(as.numeric(class.df$Class))
class.df$Type <- factor(class.df$Type)

# First rows of the result.
head(class.df)
# Create final plot of results
# Scatter plot of log ham score (x) against log spam score (y), one point per
# classified message, with the point shape showing the true message type.
class.plot <- ggplot(class.df, aes(x = log(Pr.HAM), y = log(Pr.SPAM))) +
  geom_point(aes(shape = Type, alpha = 0.5)) +
  # Line y = x: points above it have the larger spam score. geom_abline() is
  # used because stat_abline() is not available in current ggplot2 releases.
  geom_abline(intercept = 0, slope = 1) +
  scale_shape_manual(values = c("EASYHAM" = 1,
                                "HARDHAM" = 2,
                                "SPAM" = 3),
                     name = "Email Type") +
  # alpha is mapped only to make the points translucent; hide its legend.
  scale_alpha(guide = "none") +
  xlab("log[Pr(HAM)]") +
  ylab("log[Pr(SPAM)]") +
  theme_bw() +
  theme(axis.text.x = element_blank(), axis.text.y = element_blank())

# Make sure the output folder exists before writing the PDF into it.
dir.create("images", showWarnings = FALSE)
ggsave(plot = class.plot,
       filename = file.path("images", "03_final_classification.pdf"),
       height = 10,
       width = 10)
# Shares of FALSE and TRUE values in a logical vector.
#
# Args:
#   bool.vector: logical vector of classification flags.
#
# Returns:
#   Numeric vector of length 2: the share of FALSE values, then the share of
#   TRUE values. NA entries count towards the total but towards neither share.
get.results <- function(bool.vector)
{
  total <- length(bool.vector)
  false.count <- sum(bool.vector == FALSE, na.rm = TRUE)
  true.count <- sum(bool.vector == TRUE, na.rm = TRUE)
  c(false.count, true.count) / total
}
# Collect the results in a table with one row per true message type (3 rows)
# and one column per predicted label (2 columns).
easyham2.col <- get.results(class.df$Class[which(class.df$Type == "EASYHAM")])
hardham2.col <- get.results(class.df$Class[which(class.df$Type == "HARDHAM")])
spam2.col <- get.results(class.df$Class[which(class.df$Type == "SPAM")])
class.res <- rbind(easyham2.col, hardham2.col, spam2.col)
colnames(class.res) <- c("NOT SPAM", "SPAM")
print(class.res)