# 作者:海边遗忘的时光_958 | 来源:互联网 | 2023-09-03 15:37
# https://github.com/facebookresearch/fastText python版本 https://github.com/salestock/fastText.py
"""
https://github.com/facebookresearch/fastText
python版本
https://github.com/salestock/fastText.py
这个是非官方的版本 现在已经不再使用了
官方提供了Python版本
https://github.com/facebookresearch/fastText/tree/master/python
现在用的都是官方的版本
"""
import jieba
import pandas as pd
import random

# Numeric label assigned to each news category.
cate_dic = {'technology': 1, 'car': 2, 'entertainment': 3, 'military': 4, 'sports': 5}


def _load_news(csv_path):
    """Read one news CSV as UTF-8 and drop every row that has a missing value."""
    return pd.read_csv(csv_path, encoding='utf-8').dropna()


# One DataFrame per category; each is expected to expose a `content` column
# (accessed below).
df_technology = _load_news("./data/technology_news.csv")
df_car = _load_news("./data/car_news.csv")
df_entertainment = _load_news("./data/entertainment_news.csv")
df_military = _load_news("./data/military_news.csv")
df_sports = _load_news("./data/sports_news.csv")

# Up to 20000 documents per category. Technology and car skip the first 1000
# rows; the other three categories start from the first row.
technology = df_technology.content.values.tolist()[1000:21000]
car = df_car.content.values.tolist()[1000:21000]
entertainment = df_entertainment.content.values.tolist()[:20000]
military = df_military.content.values.tolist()[:20000]
sports = df_sports.content.values.tolist()[:20000]

# Stopword list: one entry per line. quoting=3 is csv.QUOTE_NONE, so quote
# characters inside the file are treated as ordinary text.
stopwords = pd.read_csv(
    "data/stopwords.txt",
    index_col=False,
    quoting=3,
    sep="\t",
    names=['stopword'],
    encoding='utf-8',
)
stopwords = stopwords['stopword'].values
"""
fasttext的无监督的词向量训练
https://github.com/facebookresearch/fastText/tree/master/python
"""
import fasttext

# Numeric label assigned to each news category.
cate_dic = {'technology': 1, 'car': 2, 'entertainment': 3, 'military': 4, 'sports': 5}


def preprocess_text_unsupervised(content_lines, sentences, category):
    """Segment raw texts and append them to ``sentences`` as space-joined tokens.

    Each text is segmented with ``jieba.lcut``; tokens of length 1 or less and
    tokens found in the module-level ``stopwords`` collection are discarded.
    The surviving tokens are joined with single spaces and appended to
    ``sentences``.

    Args:
        content_lines: Iterable of raw texts to segment.
        sentences: List that is extended in place, one string per text.
        category: Accepted for signature compatibility; not used by the body.

    Returns:
        None. Results are delivered by mutating ``sentences``.
    """
    # Build the lookup set once per call: membership tests against the raw
    # stopword array would rescan the whole array for every token.
    stopword_set = set(stopwords)
    for line in content_lines:
        try:
            tokens = [
                tok for tok in jieba.lcut(line)
                if len(tok) > 1 and tok not in stopword_set
            ]
            sentences.append(" ".join(tokens))
        except Exception:
            # Best-effort processing: report the offending input and move on
            # instead of aborting the whole corpus build.
            print(line)
            continue
# Build the unsupervised training corpus: every category's texts are
# preprocessed into the same `sentences` list.
sentences = []
preprocess_text_unsupervised(technology, sentences, cate_dic['technology'])
preprocess_text_unsupervised(car, sentences, cate_dic['car'])
preprocess_text_unsupervised(entertainment, sentences, cate_dic['entertainment'])
preprocess_text_unsupervised(military, sentences, cate_dic['military'])
preprocess_text_unsupervised(sports, sentences, cate_dic['sports'])

print("writing data to fasttext unsupervised learning format...")
# One UTF-8 encoded sentence per line. The `with` block guarantees the file is
# flushed and closed before anything later in the script reads it back.
with open('unsupervised_train_data.txt', 'wb') as out:
    for sentence in sentences:
        out.write(sentence.encode('utf8') + b"\n")
print("done!")

import fasttext

# Train unsupervised word vectors on the corpus file written above.
# Skipgram model:
skmodel = fasttext.train_unsupervised('unsupervised_train_data.txt', model='skipgram')

# or, cbow model:
cbowmodel = fasttext.train_unsupervised('unsupervised_train_data.txt', model='cbow')
# gensim训练词向量
import gensim

# Comparison baseline: train word2vec with gensim on the same sentences.

# Word2Vec expects each sentence as a list of tokens. The entries of
# `sentences` are space-joined strings, and passing a string directly would
# make gensim treat every single character as a token, so split them first.
tokenized_sentences = [sentence.split() for sentence in sentences]

# gensim 4.0 renamed the embedding-dimension keyword from `size` to
# `vector_size`; pick whichever the installed version understands.
_dim_kwarg = "vector_size" if int(gensim.__version__.split(".")[0]) >= 4 else "size"

model = gensim.models.Word2Vec(
    tokenized_sentences,
    window=5,
    min_count=5,
    workers=4,
    **{_dim_kwarg: 100},
)
model.save("gensim_word2vec.model")