利用大量邮件先验数据,使用朴素贝叶斯分类算法来自动识别垃圾邮件。
Python 实现:
#过滤垃圾邮件
def textParse(bigString):
    """Tokenize raw text into lower-cased words using a regular expression.

    The text is split on runs of non-word characters. Tokens of two
    characters or fewer are dropped and the remaining tokens are lower-cased.

    Args:
        bigString: The raw text to tokenize.

    Returns:
        A list of lower-cased tokens, each longer than two characters. An
        empty or punctuation-only input yields an empty list.
    """
    import re

    # Split on \W+ rather than \W*: a pattern that can match the empty string
    # makes re.split() on Python 3.7+ cut between every single character, so
    # no token would ever pass the length filter below.
    listOfTokens = re.split(r'\W+', bigString)
    return [tok.lower() for tok in listOfTokens if len(tok) > 2]
def spamTest():
    """Run one random hold-out evaluation of the naive Bayes spam filter.

    Reads ``email/spam/1.txt`` .. ``email/spam/25.txt`` (labelled 1) and
    ``email/ham/1.txt`` .. ``email/ham/25.txt`` (labelled 0), tokenizes each
    file with ``textParse`` and builds a vocabulary with ``createVocabList``.
    Ten document indices are drawn at random to form the test set; the
    remaining documents are vectorized with ``setOfWords2Vec`` and passed to
    ``trainNB0``. Every held-out document is then classified with
    ``classifyNB`` and the fraction of misclassified documents is printed.

    Returns:
        None. The error rate is only printed.

    Raises:
        IOError / OSError: Propagated from ``open`` if an email file cannot
            be read.
    """
    docList = []
    classList = []
    # (path template, class label) pairs; spam is read before ham for each i.
    sources = (('email/spam/%d.txt', 1), ('email/ham/%d.txt', 0))
    for i in range(1, 26):  # load and parse the text files
        for pathTemplate, label in sources:
            # The with-block closes each file promptly instead of leaking
            # the handle until garbage collection.
            with open(pathTemplate % i) as emailFile:
                wordList = textParse(emailFile.read())
            docList.append(wordList)
            classList.append(label)
    vocabList = createVocabList(docList)

    # range() is immutable on Python 3, so materialize a real list that
    # indices can be removed from.
    trainingSet = list(range(len(docList)))
    testSet = []
    for _ in range(10):  # randomly build the hold-out set
        randIndex = int(random.uniform(0, len(trainingSet)))
        # Move the chosen document index from the training set to the test
        # set so that no document is used for both.
        testSet.append(trainingSet.pop(randIndex))

    trainMat = [setOfWords2Vec(vocabList, docList[docIndex])
                for docIndex in trainingSet]
    trainClasses = [classList[docIndex] for docIndex in trainingSet]
    p0V, p1V, pSpam = trainNB0(array(trainMat), array(trainClasses))

    errorCount = 0
    for docIndex in testSet:  # classify the held-out documents
        wordVector = setOfWords2Vec(vocabList, docList[docIndex])
        if classifyNB(array(wordVector), p0V, p1V, pSpam) != classList[docIndex]:
            errorCount += 1

    errorRate = float(errorCount) / len(testSet)
    # A single pre-formatted argument prints identically whether print is a
    # statement (Python 2) or a function (Python 3); the two spaces reproduce
    # the separator the original two-item print statement emitted.
    print('the error rate is:  %s' % errorRate)