import os
import re

import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
import nltk.data
from nltk.corpus import stopwords
from gensim.models.word2vec import Word2Vec
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix


def load_dataset(name, nrows=None):
    """Load one of the IMDB review TSV files from ../data.

    Args:
        name: one of "unlabeled_train", "labeled_train", "test".
        nrows: optional row limit forwarded to pandas.read_csv.

    Returns:
        The loaded DataFrame.

    Raises:
        ValueError: if `name` is not a known dataset key.
    """
    datasets = {
        "unlabeled_train": "unlabeledTrainData.tsv",
        "labeled_train": "labeledTrainData.tsv",
        "test": "testData.tsv",
    }
    if name not in datasets:
        raise ValueError(name)
    data_file = os.path.join("..", "data", datasets[name])
    df = pd.read_csv(data_file, sep="\t", escapechar="\\", nrows=nrows)
    print("number of reviews:{}".format(len(df)))
    return df


# Read the unlabeled data (50000 reviews).
df = load_dataset("unlabeled_train")
print(df.head())

# --- Preprocessing ---
stopword = set(stopwords.words("english"))  # set gives O(1) membership tests


def clean_text(text, remove_stopwords=False):
    """Strip HTML, keep letters only, lowercase, and split into words."""
    text = BeautifulSoup(text, "html.parser").get_text()
    text = re.sub(r"[^a-zA-Z]", " ", text)
    words = text.lower().split()
    if remove_stopwords:
        words = [w for w in words if w not in stopword]
    return words


# BUG FIX: the original called word_tokenize, which splits a review into
# WORDS, not sentences — but Word2Vec expects a list of sentences, each a
# list of words. Use NLTK's punkt sentence tokenizer instead.
tokenizer = nltk.data.load("tokenizers/punkt/english.pickle")


def split_sentences(review):
    """Split a raw review into cleaned word-lists, one per sentence."""
    raw_sentences = tokenizer.tokenize(review.strip())
    return [clean_text(s) for s in raw_sentences if s]


# Preprocess / tokenize the whole DataFrame.
df["clean_review"] = df.review.apply(clean_text)
# BUG FIX: the empty-list start value belongs to sum(), not to apply() —
# the original `apply(split_sentences,[])` passed it as apply's args.
sentences = sum(df.review.apply(split_sentences), [])

# --- Train the word2vec embedding with gensim ---
num_features = 300    # embedding dimensionality
min_word_count = 40   # ignore words rarer than this
num_workers = 4       # BUG FIX: was misspelled `num_workes`
context = 10          # BUG FIX: was misspelled `cOntext`, so the later
                      # `window=context` raised NameError
downsampling = 1e-3   # sub-sampling rate for frequent words

model = Word2Vec(
    sentences,
    workers=num_workers,
    size=num_features,
    min_count=min_word_count,
    window=context,
    sample=downsampling,
)
model.init_sims(replace=True)
model.save(os.path.join("..", "models", "model_name"))

# Sanity-check the learned vectors.
print(model.doesnt_match("man woman child kitchen".split()))  # -> kitchen
print(model.most_similar("man"))

df = load_dataset("labeled_train")


def to_review_vector(review):
    """Represent a review as the mean of its (stopword-free) word vectors."""
    words = clean_text(review, remove_stopwords=True)
    # BUG FIX: np.array(<generator>) builds a 0-d object array; a list
    # comprehension is required to get an (n_words, num_features) matrix
    # whose mean over axis 0 is the review embedding.
    array = np.array([model[w] for w in words if w in model])
    return pd.Series(array.mean(axis=0))


train_data_feature = df.review.apply(to_review_vector)
print(train_data_feature.head())
# --- Train a random-forest classifier on the averaged word vectors ---
forest = RandomForestClassifier(n_estimators=100, random_state=42)
forest.fit(train_data_feature, df["sentiment"])  # fit returns self; unused `clf` removed
# BUG FIX: confusion_matrix takes (y_true, y_pred) as two arguments; the
# original passed one malformed indexing expression
# df["sentiment", forest.predict(...)], which raises KeyError.
print(confusion_matrix(df["sentiment"], forest.predict(train_data_feature)))

# Free the training data before loading the test set to keep memory down.
del df
del train_data_feature

# --- Predict on the test set and save the submission ---
df = load_dataset("test")
test_data = df.review.apply(to_review_vector)
predict = forest.predict(test_data)
output = pd.DataFrame({"id": df["id"], "sentiment": predict})
# BUG FIX: the trailing comment promised a CSV file but nothing was
# written; persist the predictions.
output.to_csv(os.path.join("..", "data", "Word2Vec_model.csv"), index=False)
2、使用word2vec做情感分析
import os
import re

import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
import nltk.data
from nltk.corpus import stopwords
from gensim.models.word2vec import Word2Vec
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix


def load_dataset(name, nrows=None):
    """Load one of the IMDB review TSV files from ../data.

    Args:
        name: one of "unlabeled_train", "labeled_train", "test".
        nrows: optional row limit forwarded to pandas.read_csv.

    Returns:
        The loaded DataFrame.

    Raises:
        ValueError: if `name` is not a known dataset key.
    """
    datasets = {
        "unlabeled_train": "unlabeledTrainData.tsv",
        "labeled_train": "labeledTrainData.tsv",
        "test": "testData.tsv",
    }
    if name not in datasets:
        raise ValueError(name)
    data_file = os.path.join("..", "data", datasets[name])
    df = pd.read_csv(data_file, sep="\t", escapechar="\\", nrows=nrows)
    print("number of reviews:{}".format(len(df)))
    return df


# Read the unlabeled data (50000 reviews).
df = load_dataset("unlabeled_train")
print(df.head())

# --- Preprocessing ---
stopword = set(stopwords.words("english"))  # set gives O(1) membership tests


def clean_text(text, remove_stopwords=False):
    """Strip HTML, keep letters only, lowercase, and split into words."""
    text = BeautifulSoup(text, "html.parser").get_text()
    text = re.sub(r"[^a-zA-Z]", " ", text)
    words = text.lower().split()
    if remove_stopwords:
        words = [w for w in words if w not in stopword]
    return words


# BUG FIX: the original called word_tokenize, which splits a review into
# WORDS, not sentences — but Word2Vec expects a list of sentences, each a
# list of words. Use NLTK's punkt sentence tokenizer instead.
tokenizer = nltk.data.load("tokenizers/punkt/english.pickle")


def split_sentences(review):
    """Split a raw review into cleaned word-lists, one per sentence."""
    raw_sentences = tokenizer.tokenize(review.strip())
    return [clean_text(s) for s in raw_sentences if s]


# Preprocess / tokenize the whole DataFrame.
df["clean_review"] = df.review.apply(clean_text)
# BUG FIX: the empty-list start value belongs to sum(), not to apply() —
# the original `apply(split_sentences,[])` passed it as apply's args.
sentences = sum(df.review.apply(split_sentences), [])

# --- Train the word2vec embedding with gensim ---
num_features = 300    # embedding dimensionality
min_word_count = 40   # ignore words rarer than this
num_workers = 4       # BUG FIX: was misspelled `num_workes`
context = 10          # BUG FIX: was misspelled `cOntext`, so the later
                      # `window=context` raised NameError
downsampling = 1e-3   # sub-sampling rate for frequent words

model = Word2Vec(
    sentences,
    workers=num_workers,
    size=num_features,
    min_count=min_word_count,
    window=context,
    sample=downsampling,
)
model.init_sims(replace=True)
model.save(os.path.join("..", "models", "model_name"))

# Sanity-check the learned vectors.
print(model.doesnt_match("man woman child kitchen".split()))  # -> kitchen
print(model.most_similar("man"))

df = load_dataset("labeled_train")


def to_review_vector(review):
    """Represent a review as the mean of its (stopword-free) word vectors."""
    words = clean_text(review, remove_stopwords=True)
    # BUG FIX: np.array(<generator>) builds a 0-d object array; a list
    # comprehension is required to get an (n_words, num_features) matrix
    # whose mean over axis 0 is the review embedding.
    array = np.array([model[w] for w in words if w in model])
    return pd.Series(array.mean(axis=0))


train_data_feature = df.review.apply(to_review_vector)
print(train_data_feature.head())
# --- Train a random-forest classifier on the averaged word vectors ---
forest = RandomForestClassifier(n_estimators=100, random_state=42)
forest.fit(train_data_feature, df["sentiment"])  # fit returns self; unused `clf` removed
# BUG FIX: confusion_matrix takes (y_true, y_pred) as two arguments; the
# original passed one malformed indexing expression
# df["sentiment", forest.predict(...)], which raises KeyError.
print(confusion_matrix(df["sentiment"], forest.predict(train_data_feature)))

# Free the training data before loading the test set to keep memory down.
del df
del train_data_feature

# --- Predict on the test set and save the submission ---
df = load_dataset("test")
test_data = df.review.apply(to_review_vector)
predict = forest.predict(test_data)
output = pd.DataFrame({"id": df["id"], "sentiment": predict})
# BUG FIX: the trailing comment promised a CSV file but nothing was
# written; persist the predictions.
output.to_csv(os.path.join("..", "data", "Word2Vec_model.csv"), index=False)