import jieba
from sklearn.feature_extraction.text import CountVectorizer  # count word occurrences (English example first)
'''
# Build the corpus [English]
content = ['This is the first document.', 'This is the second second document.', 'And the third one.', 'Is this the first document? i x y']
# Build the vectorizer instance
con_vet = CountVectorizer()
# Fit on the corpus and extract token counts
x = con_vet.fit_transform(content)
print(x)  # (doc index, token index) count: how many times the token appears in that document; sparse matrix
print(x.toarray())  # convert the sparse matrix to a dense array
# Get the extracted vocabulary
names = con_vet.get_feature_names_out()  # use get_feature_names() on older scikit-learn versions
print(names)  # the extracted tokens
'''
# Build the corpus [Chinese]
content = ["今天阳光真好","我要去看北京*","逛完*之后我要去王府井","吃烤蝎子与烤蜈蚣","晚上去后海游个泳"]
content_list = []
for tmp in content:
    # Segment with jieba's accurate mode (cut_all=False, which is the default)
    res = jieba.cut(tmp, cut_all=False)
    res_str = ','.join(res)
    content_list.append(res_str)
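# Added sketch (not in the original): inspect the segmented sentences.
# Each entry is now a comma-separated token string; the exact split depends
# on jieba's dictionary, e.g. roughly "今天,阳光,真好" for the first sentence.
print(content_list)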
# Build the vectorizer instance; stop_words drops uninformative tokens
con_vet = CountVectorizer(stop_words=['我要','之后'])
# Fit on the segmented sentences and extract token counts
x = con_vet.fit_transform(content_list)
print(x)  # (doc index, token index) count: how many times the token appears in that document; sparse matrix
print(x.toarray())  # convert the sparse matrix to a dense array
# Get the extracted vocabulary
names = con_vet.get_feature_names_out()  # use get_feature_names() on older scikit-learn versions
print(names)  # the extracted tokens
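# Added sketch (not in the original): pair each extracted token with its total
# count across all documents, assuming `names` and `x` from the code above.
for word, count in zip(names, x.toarray().sum(axis=0)):
    print(word, count)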
# TF-IDF: weight tokens by importance instead of raw counts
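# Note (added, not in the original): TfidfVectorizer scores a token by its term
# frequency in a document, scaled down by how many documents contain it, so tokens
# that appear everywhere get low weights; rows are L2-normalized by default.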
from sklearn.feature_extraction.text import TfidfVectorizer
# Build the corpus [English]
content = ['This is the first document.', 'This is the second second document.', 'And the third one.', 'Is this the first document? i x y']
# Build the vectorizer instance
# min_df=1: keep only tokens that appear in at least one document
# stop_words: stop words, i.e. uninformative tokens to drop
tf_vet = TfidfVectorizer(stop_words=['is','and'])
# Fit on the corpus and compute TF-IDF weights
x = tf_vet.fit_transform(content)
print(x)  # (doc index, token index) weight: how important the token is in that document; sparse matrix
print(x.toarray())  # convert the sparse matrix to a dense array
# Get the extracted vocabulary
names = tf_vet.get_feature_names_out()  # use get_feature_names() on older scikit-learn versions
print(names)  # the extracted tokens
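# Added sketch (not in the original): show each remaining token's TF-IDF weight
# in the first document, assuming `names` and `x` from the code above.
first_doc = x.toarray()[0]
for word, weight in zip(names, first_doc):
    if weight > 0:
        print(f"{word}: {weight:.3f}")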