参考:http://blog.csdn.net/tingsking18/article/details/4033645
Python 内部的字符串以 unicode 表示。
decode 函数用来将其他编码(例如 gbk、utf-8)的字节串解码为 unicode。
encode 函数将 unicode 编码为指定编码类型的字节串,例如 gbk、utf-8。
# -*- coding: utf-8 -*-
"""
Created on Wed Jan 15 15:20:59 2014

@author: hp

Download a web page, keep only its Chinese characters, segment them into
words with jieba and print how often each word occurs.

The script was written for Python 2; the syntax used below is chosen so that
it also parses on Python 3.
"""
import re
import sys
import time
from collections import Counter
from contextlib import closing

try:
    import urllib2
except ImportError:  # Python 3 moved urlopen into urllib.request
    import urllib.request as urllib2

url = "http://blog.sina.com.cn/s/blog_608e1afd0102e5ym.html"

# One or more consecutive characters in the range U+4E00..U+9FA5.
_CHINESE_RUN = re.compile(u"[\u4e00-\u9fa5]+")


def geturl(url):
    """Return all Chinese characters found at *url* as one string.

    The response body is decoded strictly as UTF-8, so a page served in
    another encoding raises ``UnicodeDecodeError``.  Everything outside the
    range U+4E00..U+9FA5 (markup, punctuation, Latin text, digits) is
    dropped and the remaining runs are concatenated in page order.

    :param url: address of the page to download.
    :return: text string made only of Chinese characters (empty if none).
    """
    # closing() guarantees the response is released even if read() fails;
    # it is used because the Python 2 response object is not a context
    # manager by itself.
    with closing(urllib2.urlopen(url)) as response:
        raw = response.read()
    html = raw.decode("utf-8")
    return u"".join(_CHINESE_RUN.findall(html))


def separate_word(s):
    """Segment *s* with ``jieba.cut(s, cut_all=False)`` and return the tokens.

    The tokens are returned exactly as jieba yields them.  The previous
    implementation joined them with "/ " and re-split the result by hand,
    which lost the last token and left a leading space on every token
    after the first.

    :param s: text to segment.
    :return: list of tokens in their original order.
    """
    # Imported lazily so that geturl() and count_word() stay usable on a
    # machine where jieba is not installed.
    import jieba

    return list(jieba.cut(s, cut_all=False))


def count_word(word_list, encoding="gbk"):
    """Count how many times each word occurs in *word_list*.

    :param word_list: iterable of text strings.
    :param encoding: codec used to encode the dictionary keys.  The default
        ``"gbk"`` keeps the historical return value (GBK byte strings as
        keys).  Pass ``None`` to keep the words unencoded.
    :return: plain ``dict`` mapping each distinct word to its count.
    :raises UnicodeEncodeError: if a word cannot be represented in
        *encoding*.
    """
    counts = Counter(word_list)
    if encoding is None:
        return dict(counts)
    return {w.encode(encoding): num for w, num in counts.items()}


def main():
    """Fetch ``url``, count its words and print ``word ---> count`` lines."""
    content = geturl(url)
    words = separate_word(content)
    result = count_word(words)

    # The keys are already GBK bytes, so they are written out as-is;
    # encoding them a second time is what used to make the script crash.
    # Python 3 only accepts raw bytes on the binary layer of stdout.
    out = getattr(sys.stdout, "buffer", sys.stdout)
    for key, num in result.items():
        out.write(key + b" ---> " + str(num).encode("ascii") + b"\n")
    out.flush()

    # NOTE(review): presumably keeps a console window open long enough to
    # read the output -- confirm before removing.
    time.sleep(10)


if __name__ == "__main__":
    main()