1. Word frequency count:
Count the words in a single article:
import sys
import re

def countFile(filename, words):
    # Analyze word frequencies in the file filename; results are recorded in the dict words
    try:
        f = open(filename, "r", encoding="gbk")
        # GBK is assumed here; depending on the actual file, pass
        # encoding="utf-8" instead
    except Exception as e:
        print(e)
        return 0
    txt = f.read()  # read the whole file into the string txt
    f.close()
    splitChars = set([])  # set of separator characters
    # Collect every non-letter character in the file to use as a separator
    for c in txt:
        if not (c >= 'a' and c <= 'z' or c >= 'A' and c <= 'Z'):
            splitChars.add(c)
    splitStr = ""  # regular expression for re.split
    # The pattern ends up looking like ",|:| |-"; the strings between the '|' are the separators
    for c in splitChars:
        if c in {'.', '?', '!', '"', "'", '(', ')', '|', '*', '+', '$', '\\', '[', ']', '^', '{', '}'}:
            # These characters are special in a regex, so they must be prefixed with "\"
            splitStr += "\\" + c + "|"  # in a Python string literal, \\ is a single \
        else:
            splitStr += c + "|"
    splitStr += " "  # the final '|' must be followed by something; a duplicate space is harmless
    lst = re.split(splitStr, txt)  # lst is the list of words after splitting
    for x in lst:
        if x == "":  # two adjacent separators produce an empty string; skip it
            continue
        lx = x.lower()
        if lx in words:
            words[lx] += 1  # already in the dict: increment the word's count
        else:
            words[lx] = 1  # not in the dict yet: add it with count 1
    return 1

result = {}  # result dict, e.g. { 'a':2, 'about':3, ... }
if countFile(sys.argv[1], result) == 0:  # argv[1] is the source file; results go into result
    exit()
lst = list(result.items())
lst.sort()  # sort words in dictionary order
f = open(sys.argv[2], "w", encoding="gbk")  # argv[2] is the result file; "w" means write
for x in lst:
    f.write("%s\t%d\n" % (x[0], x[1]))
f.close()
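For reference, a hypothetical invocation would be python wordcount.py article.txt counts.txt (the script and file names are made up for illustration). The same counting idea can also be written more compactly with the standard library; a minimal sketch, assuming we only care about runs of ASCII letters, exactly as above:

import re
from collections import Counter

def count_words(txt):
    # Extract letter runs directly, which avoids building a separator
    # pattern out of every non-letter character in the text
    return Counter(w.lower() for w in re.findall(r"[a-zA-Z]+", txt))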
2. Word frequency count, upgraded:
This version reads the text of several files and counts across all of them, which means we have to work with the contents of a folder. The code follows directly below.
import sys
import re
import os

def countFile(filename, words):
    # Analyze word frequencies in the file filename; results are recorded in the dict words
    try:
        f = open(filename, "r", encoding="gbk")
        # GBK is assumed; pass encoding="utf-8" if the file is UTF-8
    except Exception as e:
        print(e)
        return 0
    txt = f.read()  # read the whole file into the string txt
    f.close()
    splitChars = set([])  # set of separator characters
    # Collect every non-letter character in the file to use as a separator
    for c in txt:
        if not (c >= 'a' and c <= 'z' or c >= 'A' and c <= 'Z'):
            splitChars.add(c)
    splitStr = ""  # regular expression for re.split
    # The pattern ends up looking like ",|:| |-"; the strings between the '|' are the separators
    for c in splitChars:
        if c in {'.', '?', '!', '"', "'", '(', ')', '|', '*', '+', '$', '\\', '[', ']', '^', '{', '}'}:
            # These characters are special in a regex, so they must be prefixed with "\"
            splitStr += "\\" + c + "|"  # in a Python string literal, \\ is a single \
        else:
            splitStr += c + "|"
    splitStr += " "  # the final '|' must be followed by something; a duplicate space is harmless
    lst = re.split(splitStr, txt)  # lst is the list of words after splitting
    for x in lst:
        if x == "":  # two adjacent separators produce an empty string; skip it
            continue
        lx = x.lower()
        if lx in words:
            words[lx] += 1  # already in the dict: increment the word's count
        else:
            words[lx] = 1  # not in the dict yet: add it with count 1
    return 1

result = {}  # result dict
lst = os.listdir()  # list all files and folders in the current directory
for x in lst:
    if os.path.isfile(x):  # x is a file
        if x.lower().endswith(".txt") and x.lower().startswith("a"):
            # x starts with 'a' and ends with '.txt'
            countFile(x, result)
lst = list(result.items())
lst.sort()  # sort words in dictionary order
f = open(sys.argv[1], "w", encoding="gbk")  # argv[1] is the result file; "w" means write
for x in lst:
    f.write("%s\t%d\n" % (x[0], x[1]))
f.close()
The only real difference from the first program is the step that scans the folder.
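As a side note, that file-selection step could also be expressed with the standard glob module; a minimal sketch (glob pattern matching is case-sensitive on most systems, so unlike the lower() checks above it would miss names like A1.TXT):

import glob
import os

# Select files in the current directory starting with 'a' and ending in '.txt'
for path in glob.glob("a*.txt"):
    if os.path.isfile(path):
        countFile(path, result)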
3. Word frequency count, upgraded again:
This version filters out common words before counting: a filter set is built from cet4words.txt (a CET-4 word list), and any word found in that set is skipped.
import sys
import re

def makeFilterSet():
    cet4words = set([])
    f = open("cet4words.txt", "r", encoding="gbk")
    lines = f.readlines()
    f.close()
    for line in lines:
        line = line.strip()
        if line == "":
            continue
        if line[0] == "$":
            cet4words.add(line[1:])  # add the CET-4 word (without the '$') to the set
    return cet4words

def makeSplitStr(txt):
    splitChars = set([])
    # Collect every non-letter character in the text to use as a separator
    for c in txt:
        if not (c >= 'a' and c <= 'z' or c >= 'A' and c <= 'Z'):
            splitChars.add(c)
    splitStr = ""
    # Build the separator pattern for re.split
    for c in splitChars:
        if c in ['.', '?', '!', '"', "'", '(', ')', '|', '*', '+', '$', '\\', '[', ']', '^', '{', '}']:
            splitStr += "\\" + c + "|"  # regex special characters must be escaped
        else:
            splitStr += c + "|"
    splitStr += " "
    return splitStr

def countFile(filename, filterdict):  # count word frequencies, skipping words in the set filterdict
    words = {}
    try:
        f = open(filename, "r", encoding="gbk")
    except Exception as e:
        print(e)
        return {}  # return an empty dict so the caller's result check works
    txt = f.read()
    f.close()
    splitStr = makeSplitStr(txt)
    lst = re.split(splitStr, txt)
    for x in lst:
        lx = x.lower()
        if lx == "" or lx in filterdict:  # skip empty strings and filtered words
            continue
        words[lx] = words.get(lx, 0) + 1
    return words

result = countFile(sys.argv[1], makeFilterSet())
if result != {}:
    lst = list(result.items())
    lst.sort()
    f = open(sys.argv[2], "w", encoding="gbk")
    for x in lst:
        f.write("%s\t%d\n" % (x[0], x[1]))
    f.close()
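Judging from how makeFilterSet parses the file (only lines beginning with '$' are kept, with the '$' stripped), cet4words.txt would look something like this made-up fragment:

$abandon
$ability
$aboard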
4. Word frequency count, upgraded yet again:
This version merges inflected forms: each word is looked up in a dictionary built from word_varys.txt and, if found there, replaced by its base form before counting.
import sys
import re

def makeVaryWordsDict():
    vary_words = {}  # entries map inflected form -> base form, e.g. {'acts':'act', 'acting':'act', 'boys':'boy', ...}
    f = open("word_varys.txt", "r", encoding="gbk")
    lines = f.readlines()
    f.close()
    L = len(lines)
    for i in range(0, L, 2):  # every two lines hold a word's base form and its inflected forms
        word = lines[i].strip()  # base form
        varys = lines[i + 1].strip().split("|")  # inflected forms
        for w in varys:
            vary_words[w] = word  # record that the base form of w is word
    return vary_words

def makeSplitStr(txt):
    splitChars = set([])
    # Collect every non-letter character in the text to use as a separator
    for c in txt:
        if not (c >= 'a' and c <= 'z' or c >= 'A' and c <= 'Z'):
            splitChars.add(c)
    splitStr = ""
    # Build the separator pattern for re.split
    for c in splitChars:
        if c in ['.', '?', '!', '"', "'", '(', ')', '|', '*', '+', '$', '\\', '[', ']', '^', '{', '}']:
            splitStr += "\\" + c + "|"  # regex special characters must be escaped
        else:
            splitStr += c + "|"
    splitStr += " "
    return splitStr

def countFile(filename, vary_word_dict):
    # Analyze the file filename and return a dict as the result;
    # look up each word's base form in vary_word_dict
    try:
        f = open(filename, "r", encoding="gbk")
    except Exception as e:
        print(e)
        return None
    txt = f.read()
    f.close()
    splitStr = makeSplitStr(txt)
    words = {}
    lst = re.split(splitStr, txt)
    for x in lst:
        lx = x.lower()
        if lx == "":
            continue
        if lx in vary_word_dict:  # if a base form is found, count the base form instead
            lx = vary_word_dict[lx]
        # The if statement above could be replaced by: lx = vary_word_dict.get(lx, lx)
        words[lx] = words.get(lx, 0) + 1
    return words

result = countFile(sys.argv[1], makeVaryWordsDict())
if result != None and result != {}:
    lst = list(result.items())
    lst.sort()
    f = open(sys.argv[2], "w", encoding="gbk")
    for x in lst:
        f.write("%s\t%d\n" % (x[0], x[1]))
    f.close()
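Given how makeVaryWordsDict reads its input (a base form on one line, its inflected forms joined by '|' on the next), word_varys.txt would look something like this made-up fragment:

act
acts|acted|acting
boy
boys

With such a file in place, running the script as python wordcount4.py article.txt counts.txt (file names hypothetical) would count 'acts', 'acted', and 'acting' all under 'act'.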