朴素贝叶斯概念
例子:邮件分类问题:
N = (12/17)*(5/11)*(3/11)
S = (5/17)*(2/7)*(1/7)
print(N)
print(S)
# N>S 我们可以判断这是一封正常邮件
常见问题1
因为图2中 吗出现的次数是0 那么这封邮件就会被误判为正常邮件
解决如果遇到样本里面有0的情况,可以通过添加alpha进行解决。
alpha=1 统一增加1 确保不会出现无0的情况
N = (12/17)*((1/15)**4)*(3/15)
S = (5/17)*((5/11)**4)*(1/11)
print(N)
print(S)
print(S > N)
贝叶斯代码
数据切分
def split_our_data_by_class(dataset):# 按类别拆分数据 传入参数数据
splited_data = dict() # 按字典储存
for i in range(len(dataset)):
vector = dataset[i]
class_value = vector[-1]
if (class_value not in splited_data):
splited_data[class_value] =list()
splited_data[class_value].append(vector)
return splited_data
# create our dummy data
dataset = [ [0.8,2.3,0],
[2.1,1.6,0],
[2.0,3.6,0],
[3.1,2.5,0],
[3.8,4.7,0],
[6.1,4.4,1],
[8.6,0.3,1],
[7.9,5.3,1],
[9.1,2.5,1],
[6.8,2.7,1]]
splited = split_our_data_by_class(dataset)
print(splited)
# 逐行打印
for label in splited:
print(label)
for row in splited[label]:
print(row)
calculate mean and standard deviation(n-1):计算平均值和标准差(n-1)
from math import sqrt
def calculate_the_mean(a_list_of_num): # 平均值函数
mean = sum(a_list_of_num)/float(len(a_list_of_num)) # 求总和除以数据个数
return mean
def calculate_the_standard_deviation(a_list_of_num): # 计算标准差
the_mean = calculate_the_mean(a_list_of_num) # 调用平均值函数计算平均值
the_variance = sum([(x-the_mean)**2 for x in a_list_of_num])/ float(len(a_list_of_num)-1)# {每个值减去平均值的平方除以数据的(个数-1)}结果相加的总和
std = sqrt(the_variance) # 开方
return std
def describe_our_data(dataset):# 调用函数
description = [(calculate_the_mean(column),
calculate_the_standard_deviation(column),
len(column)) for column in zip(*dataset)]
del(description[-1]) # 删除最后一列的分类标签
return description
describe_our_data(dataset)
def describe_our_data_by_class(dataset):# 综合
splited_data = split_our_data_by_class(dataset)
data_description = dict()# 创建一个空字典来存储
for class_value,rows in splited_data.items():# 通过迭代填入空字典
data_description[class_value] = describe_our_data(rows)
return data_description
description = describe_our_data_by_class(dataset)
for label in description:
print(label)
for row in description[label]:
print(row)
description = describe_our_data_by_class(dataset)
type(description)
print(description)
print("---------")
print(description[0])
print("---------")
print(description[0][0])
print("--------")
print(description[0][0][2])