```python
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
from collections import Counter, OrderedDict
import re

import jieba

# Load stop words, one per line
# stopwords = {}.fromkeys(['的', '包括', '等', '是'])
stopwords = {}.fromkeys(line.strip() for line in open("stopwords.txt", encoding="utf-8"))

# Input file path
bill_path = r'article_nohtml.txt'
# Output file path
bill_result_path = r'result.txt'

# Read the whole article
with open(bill_path, 'r', encoding='utf-8') as fr:
    all_the_text = fr.read()

# Strip quotes, commas, and periods
all_the_text = re.sub(r'"|,|\.', '', all_the_text)

# Segment the text into words
data = jieba.cut(all_the_text)

# Count how often each word appears
data = dict(Counter(data))

# Sort the dictionary by frequency, descending
def sort_by_count(d):
    return OrderedDict(sorted(d.items(), key=lambda t: -t[1]))

data = sort_by_count(data)

# Write the results, skipping stop words
with open(bill_result_path, 'w', encoding='utf-8') as fw:
    for k, v in data.items():
        if k not in stopwords:
            fw.write('%s:%d\n' % (k, v))
```
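For reference, the same pipeline can be written more compactly by filtering stop words before counting and letting `Counter.most_common` handle the sorting. This is a minimal sketch under the same file-name assumptions as above (`stopwords.txt`, `article_nohtml.txt`, `result.txt`), not the version used for the run below:

```python
from collections import Counter

import jieba

# Assumed file names, matching the script above
with open("stopwords.txt", encoding="utf-8") as f:
    stopwords = set(line.strip() for line in f)

with open("article_nohtml.txt", encoding="utf-8") as f:
    text = f.read()

# Segment with jieba, dropping stop words and whitespace-only tokens up front
words = [w for w in jieba.cut(text) if w.strip() and w not in stopwords]

# most_common() returns (word, count) pairs sorted by count, descending
with open("result.txt", "w", encoding="utf-8") as out:
    for word, count in Counter(words).most_common():
        out.write("%s:%d\n" % (word, count))
```

Either way, each line of `result.txt` is a `word:count` pair, with the most frequent words first.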
[Figure: screenshot of the run results]