Author: 漂泊盼安定 | Source: Internet | 2023-10-11 16:40
GitHub repository: https://github.com/fxsjy/jieba
Example 1: segment a txt file with jieba, count how many times each token appears, and write the result to result.txt.
Reference: http://www.cnblogs.com/chenbjin/p/3843800.html
import jieba
import sys

reload(sys)
sys.setdefaultencoding('utf8')

def fenci(argv):
    # Read the whole input file; the path is passed as the first command-line argument
    filename = argv[1]
    f = open(filename, 'r+')
    file_list = f.read()
    f.close()

    # Segment the text with jieba in full mode (cut_all=True)
    seg_list = jieba.cut(file_list, cut_all=True)

    # Count how often each token occurs, skipping empty tokens and newlines
    tf = {}
    for seg in seg_list:
        seg = ''.join(seg.split())
        if seg != '' and seg != "\n" and seg != "\n\n":
            if seg in tf:
                tf[seg] += 1
            else:
                tf[seg] = 1

    # Write "token count" pairs to result.txt
    f = open("result.txt", "w+")
    for item in tf:
        f.write(item + " " + str(tf[item]) + "\n")
    f.close()

if __name__ == '__main__':
    fenci(sys.argv)
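The script is run as, for example, python fenci.py input.txt (fenci.py and input.txt are placeholder names; the file path becomes argv[1] above). Note that both examples in this post call jieba.cut with cut_all=True, which is jieba's full mode: it emits every dictionary word it can find, including overlapping ones. Leaving cut_all at its default of False gives accurate mode, which is usually what you want for counting or TF-IDF. A minimal sketch of the difference, using the sample sentence quoted in Example 2's comments:

# -*- coding: utf-8 -*-
# Minimal sketch: compare jieba's full mode with the default accurate mode.
# The sample sentence is only an illustration, not part of the original scripts.
import jieba

sentence = u"我来到北京清华大学"

full_mode = jieba.cut(sentence, cut_all=True)       # full mode: all dictionary words, may overlap
accurate_mode = jieba.cut(sentence, cut_all=False)  # accurate mode (the default)

print("Full mode: " + "/".join(full_mode))
print("Accurate mode: " + "/".join(accurate_mode))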
Example 2: http://www.cnblogs.com/chenbjin/p/3851165.html
Segment 100 documents with jieba, then compute TF-IDF over them with scikit-learn; the results are quite good.
import os
import jieba
import jieba.posseg as pseg
import sys
import string
from sklearn import feature_extraction
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
reload(sys)
sys.setdefaultencoding('utf8')
# Get the list of files (the target directory holds the 100 documents)
def getFilelist(argv):
    path = argv[1]
    filelist = []
    files = os.listdir(path)
    for f in files:
        # Skip hidden files such as .DS_Store
        if not f.startswith('.'):
            filelist.append(f)
    return filelist, path

# Segment one document with jieba
def fenci(argv, path):
    # Directory that holds the segmentation results
    sFilePath = './segfile'
    if not os.path.exists(sFilePath):
        os.mkdir(sFilePath)

    # Read the document
    filename = argv
    f = open(path + filename, 'r+')
    file_list = f.read()
    f.close()

    # Segment the document in full mode (cut_all=True)
    seg_list = jieba.cut(file_list, cut_all=True)

    # Drop whitespace and newline tokens
    result = []
    for seg in seg_list:
        seg = ''.join(seg.split())
        if seg != '' and seg != "\n" and seg != "\n\n":
            result.append(seg)

    # Join the tokens with spaces and save the result locally. For example,
    # "我来到北京清华大学" is written out as "我 来到 北京 清华大学".
    f = open(sFilePath + "/" + filename + "-seg.txt", "w+")
    f.write(' '.join(result))
    f.close()

# Read the 100 segmented documents and compute TF-IDF
def Tfidf(filelist):
    path = './segfile/'
    corpus = []  # segmentation results of the 100 documents
    for ff in filelist:
        # Read the "-seg.txt" file written by fenci() for this document
        fname = path + ff + "-seg.txt"
        f = open(fname, 'r+')
        content = f.read()
        f.close()
        corpus.append(content)

    vectorizer = CountVectorizer()
    transformer = TfidfTransformer()
    tfidf = transformer.fit_transform(vectorizer.fit_transform(corpus))
    word = vectorizer.get_feature_names()  # the vocabulary shared by all documents
    weight = tfidf.toarray()               # the corresponding TF-IDF matrix

    sFilePath = './tfidffile'
    if not os.path.exists(sFilePath):
        os.mkdir(sFilePath)

    # Write each document's TF-IDF weights into the tfidffile directory
    for i in range(len(weight)):
        print u"--------Writing all the tf-idf in the", i, u" file into ", sFilePath + '/' + string.zfill(i, 5) + '.txt', "--------"
        f = open(sFilePath + '/' + string.zfill(i, 5) + '.txt', 'w+')
        for j in range(len(word)):
            f.write(word[j] + " " + str(weight[i][j]) + "\n")
        f.close()

if __name__ == "__main__":
    (allfile, path) = getFilelist(sys.argv)
    for ff in allfile:
        print "Using jieba on " + ff
        fenci(ff, path)
    Tfidf(allfile)
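For reference, the CountVectorizer + TfidfTransformer pair used above can be collapsed into a single TfidfVectorizer. By default scikit-learn computes a smoothed idf, ln((1 + N) / (1 + df(t))) + 1, and L2-normalizes each document row, so the weights differ slightly from the textbook log(N/df) definition. A minimal sketch, assuming the segmented files produced by fenci() already sit under ./segfile/:

# Minimal sketch: same TF-IDF computation with TfidfVectorizer, which combines
# CountVectorizer and TfidfTransformer in one step. Assumes ./segfile/ holds the
# space-separated segmentation results written by fenci().
import os
from sklearn.feature_extraction.text import TfidfVectorizer

path = './segfile/'
corpus = []
for ff in sorted(os.listdir(path)):
    if ff.startswith('.'):
        continue
    f = open(path + ff, 'r+')
    corpus.append(f.read())
    f.close()

vectorizer = TfidfVectorizer()
tfidf = vectorizer.fit_transform(corpus)   # sparse matrix: documents x vocabulary
word = vectorizer.get_feature_names()      # the vocabulary, as in the code above
weight = tfidf.toarray()                   # dense TF-IDF matrix, row i = document i

Note that newer scikit-learn releases replace get_feature_names() with get_feature_names_out(); the rest of the code is unchanged.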