Python项目杂记完整教程目录(更有python、go、pytorch、tensorflow、爬虫、人工智能教学等着你):https://www.cnblogs.com/nickchen121/p/10825705.html
朴素贝叶斯(垃圾邮件分类)
邮件训练集下载地址
邮件训练集可以加我微信获取:chenyoudea
模块导入
import re
import os
from jieba import cut
from itertools import chain
from collections import Counter
import numpy as np
from sklearn.naive_bayes import MultinomialNB
文本预处理
# Characters stripped from every line before tokenisation; compiled once so the
# pattern is not looked up again for each line of each e-mail.
_INVALID_CHARS = re.compile(r'[.【】0-9、——。,!~\*]')


def get_words(filename):
    """Read a UTF-8 text file and return its tokens longer than one character.

    Each line is stripped, cleaned of the characters matched by
    ``_INVALID_CHARS``, segmented with ``jieba.cut`` and filtered so that only
    tokens with ``len(token) > 1`` remain.

    Args:
        filename: Path of the text file to read.

    Returns:
        A list with the kept tokens of the whole file, in reading order.
    """
    words = []
    with open(filename, 'r', encoding='utf-8') as fr:
        for line in fr:
            # Remove the unwanted characters before segmenting the line.
            cleaned = _INVALID_CHARS.sub('', line.strip())
            # Segment with jieba and drop single-character tokens.
            words.extend(token for token in cut(cleaned) if len(token) > 1)
    return words
遍历邮件
# Token lists of the training e-mails, one inner list per file. Filled by
# get_top_words() and read by the module-level code that builds the vectors.
all_words = []


def get_top_words(top_num):
    """Load the training e-mails and return the most frequent words.

    Reads ``邮件_files/0.txt`` .. ``邮件_files/150.txt`` with ``get_words`` and
    stores the resulting token lists in the module-level ``all_words``.
    The list is refilled in place on every call, so calling this function
    more than once no longer duplicates the corpus or inflates the counts.

    Args:
        top_num: Number of most common words to return.

    Returns:
        A list with at most ``top_num`` words, most frequent first.
    """
    filename_list = ['邮件_files/{}.txt'.format(i) for i in range(151)]
    # Slice assignment keeps the same list object that other code refers to.
    all_words[:] = [get_words(filename) for filename in filename_list]
    # Flatten the per-file token lists and count every word across the corpus.
    freq = Counter(chain.from_iterable(all_words))
    return [word for word, _ in freq.most_common(top_num)]
top_words = get_top_words(100)

# Build one count vector per training e-mail: position i holds how many times
# top_words[i] occurs in that e-mail.
vector = []
for words in all_words:
    # `words` is the token list of a single e-mail, for example:
    #   ['国际', 'SCI', '期刊', '材料', '结构力学', '工程', '杂志', '国际', 'SCI',
    #    '期刊', '先进', '材料科学', '材料', '工程', '杂志', ..., '打扰', '请谅解']
    # Count the tokens once instead of rescanning the list for every top word.
    counts = Counter(words)
    # `word_map` has one entry per top word (a missing word counts as 0), e.g.:
    #   [0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, ..., 10, 0, ...]
    word_map = [counts[word] for word in top_words]
    vector.append(word_map)
训练模型
# One row per training e-mail, one column per top word.
vector = np.array(vector)
# The first 127 files (0-126.txt) are spam and get label 1; the remaining 24
# files (127-150.txt, given 151 training files numbered from 0) are normal
# mail and get label 0.
spam_total, normal_total = 127, 24
labels = np.array([1] * spam_total + [0] * normal_total)
# fit() returns the estimator itself, so the trained model can be bound directly.
model = MultinomialNB().fit(vector, labels)
测试模型
def predict(filename):
    """Classify a single e-mail file with the trained model.

    The file is tokenised with ``get_words`` and turned into a count vector
    over ``top_words``, which is then passed to ``model.predict``.

    Args:
        filename: Path of the e-mail text file to classify.

    Returns:
        ``'**垃圾邮件**'`` when the predicted label is 1, otherwise ``'普通邮件'``.
    """
    # Count the tokens once rather than scanning the list for every top word.
    word_counts = Counter(get_words(filename))
    current_vector = np.array([word_counts[word] for word in top_words])
    # The model expects a 2-D array, so reshape to a single row.
    result = model.predict(current_vector.reshape(1, -1))
    # Exactly one sample was passed in; compare its label explicitly instead
    # of relying on the truthiness of a one-element array.
    return '**垃圾邮件**' if result[0] == 1 else '普通邮件'
# Classify the e-mails 151.txt .. 155.txt and print one result line per file.
for mail_id in range(151, 156):
    print('{}.txt分类情况:{}'.format(
        mail_id, predict('邮件_files/{}.txt'.format(mail_id))))