Identify key patterns, relationships, measures, and parameters in the provided Amazon product review data.
The data is provided in TSV format.
The reviews are quantified with an LDA model and then combined with other data for further mining. This post focuses on the LDA part.
LDA (Latent Dirichlet Allocation) is a generative topic model for documents, also described as a three-layer Bayesian probabilistic model with word, topic, and document layers. "Generative" means we assume each word in a document is produced by first choosing a topic with some probability and then choosing a word from that topic with some probability.
Application
The bag-of-words approach is used: each review is reduced to word counts, and word order is ignored.
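To make the bag-of-words idea concrete, here is a minimal sketch (the two toy reviews are invented for illustration, not taken from the dataset) of how gensim turns token lists into (word id, count) pairs, the representation the LDA training code later in this post consumes:

from gensim import corpora

# two toy tokenized reviews (illustrative only)
texts = [["great", "microwave", "easy", "easy"],
         ["microwave", "stop", "working"]]

dictionary = corpora.Dictionary(texts)           # maps each word to an integer id
corpus = [dictionary.doc2bow(t) for t in texts]  # each review -> [(id, count), ...]
print(corpus[0])  # e.g. [(0, 2), (1, 1), (2, 1)]; word order is ignored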
Global variables at the top of the script, to make the later code easier to follow:
import re
import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from gensim import corpora, models

TOPIC_NUM = 1  # number of topics
lmtzr = WordNetLemmatizer()

m_files = [r"..\data\microwave.tsv",
           r"..\data\microwave_lda_1rmv_cols.tsv",
           r"..\data\microwave_lda_2dup_revs.tsv",
           r"..\data\microwave_lda_3rmv_invds.tsv",
           r"..\data\microwave_lda_4pos_revs.txt",
           r"..\data\microwave_lda_5neg_revs.txt",
           r"..\data\microwave_lda_6pos_rev_words.txt",  # preprocessed text
           r"..\data\microwave_lda_7neg_rev_words.txt",
           r"..\data\microwave_lda_8pos_topic.tsv",
           r"..\data\microwave_lda_9neg_topic.tsv",
           r"..\data\microwave_lda_10pos_topic_words.txt",
           r"..\data\microwave_lda_11neg_topic_words.txt",
           r"..\data\microwave_lda_12rev_words.tsv",
           r"..\data\microwave_lda_13rev_score.tsv"]

# NLTK English stop word set
stop_words = set(stopwords.words('english'))
stop_words = [word for word in stop_words if word not in ['not']]  # keep 'not', since negation matters for sentiment
# print(stop_words)
# custom (domain-specific) stop words
m_stop_words = ['would', 'br', 'microwave', 'use', 'get', 'old', 'new', 'look', 'work', 'could', 'oven',
                'purchase', 'take', 'make', 'buy', 'go', 'come', 'say', 'not', 'bought', 'even', 'ge',
                'also', 'ca', 'dry']
# POS tags that carry sentiment information
m_tags = ['MD', 'UH', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'RP', 'RB', 'RBR', 'RBS', 'JJ', 'JJR', 'JJS']
# regex: replace special symbols (double quotes, single quotes, periods, commas, etc.) with a space
pat_letter = re.compile(r'[^a-zA-Z \']+')
# patterns for expanding common English contractions
pat_is = re.compile("(it|he|she|that|this|there|here)('s)", re.I)
pat_s = re.compile("(?<=[a-zA-Z])'s")      # possessive 's after a letter
pat_s2 = re.compile("(?<=s)'s?")
pat_not = re.compile("(?<=[a-zA-Z])n't")   # contraction of not
pat_would = re.compile("(?<=[a-zA-Z])'d")  # contraction of would
pat_will = re.compile("(?<=[a-zA-Z])'ll")  # contraction of will
pat_am = re.compile("(?<=[Ii])'m")         # contraction of am
pat_are = re.compile("(?<=[a-zA-Z])'re")   # contraction of are
pat_ve = re.compile("(?<=[a-zA-Z])'ve")    # contraction of have
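The extract_words function shown later calls replace_abbreviations, which is not included in this excerpt. A minimal sketch of what it plausibly does with the patterns above (strip non-letter symbols, lowercase, then expand the contractions) could look like this:

def replace_abbreviations(text):
    # keep only letters, spaces and apostrophes, then lowercase the text
    text = pat_letter.sub(' ', text).strip().lower()
    # expand common contractions using the precompiled patterns
    text = pat_is.sub(r"\1 is", text)     # it's -> it is
    text = pat_s.sub("", text)            # drop possessive 's
    text = pat_s2.sub("", text)
    text = pat_not.sub(" not", text)      # don't -> do not
    text = pat_would.sub(" would", text)  # he'd -> he would
    text = pat_will.sub(" will", text)    # she'll -> she will
    text = pat_am.sub(" am", text)        # i'm -> i am
    text = pat_are.sub(" are", text)      # they're -> they are
    text = pat_ve.sub(" have", text)      # we've -> we have
    text = text.replace("'", ' ')         # drop any remaining apostrophes
    return text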
Next, look at the driver functions called at the end of the script, to see the overall order of the steps:
# LDA training: obtain the topic words
def lda_step1():
    remove_cols()    # drop redundant columns        file[0] -> file[1]
    get_dup_revs()   # collect duplicated reviews    file[1] -> file[2]

def lda_step2():  # requires first inspecting the duplicated reviews found in step 1
    invd_list = [1, 2]         # row numbers of the invalid reviews
    remvove_invds(*invd_list)  # drop the invalid reviews               file[1] -> file[3], uses file[2]
    get_pos_neg_revs()         # split into negative/positive reviews   file[3] -> file[4, 5]

def lda_step3():  # LDA training
    write_selected_words()  # preprocess the text (lemmatize, filter POS, remove stop words, ...)  file[4] -> file[6], file[5] -> file[7]
    get_topic_words()       # file[6] -> file[8] -> file[10], file[7] -> file[9] -> file[11]

# lda_step1()
# lda_step2()
lda_step3()
# drop redundant columns
def remove_cols():
    data = pd.read_csv(m_files[0], sep='\t', encoding='utf-8')
    # drop the columns that are not needed for the analysis
    data = data.drop(['marketplace', 'product_category', 'product_parent', 'product_title'], axis=1)
    data.to_csv(m_files[1], sep='\t', encoding='utf-8')
# collect duplicated reviews
def get_dup_revs():
    m_df = pd.read_csv(m_files[1], index_col=0, sep='\t', encoding='utf-8')
    data_review = m_df['review_body']  # the review text column
    # count how many times each distinct review appears; the review text becomes the index
    dup_df = pd.DataFrame(data_review.value_counts())
    m_review = dup_df.index.values.tolist()        # the distinct review texts
    m_num = dup_df['review_body'].values.tolist()  # their occurrence counts
    # build a new DataFrame of (review, count)
    m_review_num = pd.DataFrame([m_review, m_num])
    m_review_num = pd.DataFrame(m_review_num.values.T)  # transpose
    m_review_num.columns = ['review_body', 'num']
    # keep only the reviews that occur more than once
    m_review_num = m_review_num[m_review_num['num'] > 1]
    m_review_num.to_csv(m_files[2], sep='\t', index=False, header=True, encoding='utf-8')
    # print(m_review_num)
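As a side note, the same duplicate table can be built more directly with value_counts plus reset_index; this is an equivalent alternative sketch, not the code used in the project:

def get_dup_revs_alt():
    m_df = pd.read_csv(m_files[1], index_col=0, sep='\t', encoding='utf-8')
    counts = m_df['review_body'].value_counts().reset_index()
    counts.columns = ['review_body', 'num']  # review text and its occurrence count
    counts = counts[counts['num'] > 1]       # keep only duplicated reviews
    counts.to_csv(m_files[2], sep='\t', index=False, header=True, encoding='utf-8')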
Result:
2. Reviews duplicated a very large number of times are likely automatic system reviews.
The first one may be a malicious review:
I received a Danby Microwave for Christmas 2011. Less than 4 months later it stop working I called the Danby 800# and was told what to do. I did this and have not heard anything back. I have attempted numerous times with no success on getting my refund. Loss to my family of $85.00
I will never buy another Danby product or recommend one.
The second one is a review flagged as invalid by the system.
The other duplicated reviews look fairly normal.
3. Remove the reviews judged to carry no useful information.
# remove invalid reviews
def remvove_invds(*invd_list):  # arguments: row numbers of the invalid reviews in the "duplicated reviews" file
    # print("remvove_invds", invd_list)
    m_df = pd.read_csv(m_files[1], sep='\t', encoding='utf-8')
    m_invds = pd.read_csv(m_files[2], sep='\t', encoding='utf-8')
    # print("m_invds", m_invds)
    m_invds = m_invds[m_invds.index.isin(invd_list)]
    m_invd_revs = m_invds['review_body'].values.tolist()
    # print("m_invd_revs:", m_invd_revs)
    # keep only the reviews that are not in the invalid list
    m_df = m_df[~m_df.review_body.isin(m_invd_revs)]
    m_df.to_csv(m_files[3], sep='\t', index=False, header=True, encoding='utf-8')
Extract the 1- and 2-star reviews as the negative corpus and the 4- and 5-star reviews as the positive corpus.
# extract the 1/2-star and 4/5-star reviews
def get_pos_neg_revs():
    m_df = pd.read_csv(m_files[3], sep='\t', encoding='utf-8')
    m_neg_df = m_df[m_df.star_rating.isin([1, 2])]
    m_pos_df = m_df[m_df.star_rating.isin([4, 5])]
    m_neg_revs = m_neg_df['review_body']
    m_pos_revs = m_pos_df['review_body']
    m_neg_revs.to_csv(m_files[5], sep='\t', index=False, header=True, encoding='utf-8')
    m_pos_revs.to_csv(m_files[4], sep='\t', index=False, header=True, encoding='utf-8')
An English verb can appear in many inflected forms; lemmatization reduces each form back to its base form.
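This step lives in words_normalize, which extract_words below relies on but which is not shown in this excerpt. A minimal sketch, assuming it simply lemmatizes each word with the lmtzr lemmatizer from the header using its POS tag:

from nltk.corpus import wordnet

def get_wordnet_pos(treebank_tag):
    # map a Penn Treebank tag to the POS constant the WordNet lemmatizer expects
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    if treebank_tag.startswith('V'):
        return wordnet.VERB
    if treebank_tag.startswith('R'):
        return wordnet.ADV
    return wordnet.NOUN

def words_normalize(words):
    # e.g. "worked" / "working" -> "work", "dishes" -> "dish"
    return [lmtzr.lemmatize(word, pos=get_wordnet_pos(tag))
            for word, tag in nltk.pos_tag(words)]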
Words with no reference value (stop words) are removed, for example:
{'to', 'there', 'nor', 'wouldn', 'shouldn', 'i', 'then', 'you', 'ain', "hasn't", 'she', 'not', 'such', 'those', 'so', 'over', 'the', 'y', 'd', 'most', 'm', 'should', 'both', 'weren', 'from', 'until', 'an', 'my', 'yours', 'in', 'here', 'them', 'have', 'didn', 'against', 'myself', 'of', 'her', 'had', "couldn't", "didn't", 'when', "should've", 'is', 'very', "don't", 'has', 'these', 'will', 're', 'now', "hadn't", 'were', 'again', 'same', 'itself', 'his', 'what', 'him', 'don', "you'll", 'how', 'couldn', 'other', 'doesn', 'out', 'no', 'while', 'your', 'do', 'this', 'if', "shouldn't", 'just', 'aren', 'shan', 'himself', 'on', 'further', 'themselves', 've', 'hers', 't', 'me', 's', 'that', 'and', 'which', 'or', 'our', "won't", 'above', 'off', 'we', "wasn't", "needn't", 'ours', 'who', 'all', 'wasn', 'through', 'be', 'ourselves', 'by', 'during', 'about', "mightn't", 'was', 'yourselves', 'before', 'because', 'ma', 'being', 'more', 'it', 'any', 'll', "weren't", 'between', 'why', 'he', 'herself', 'whom', "wouldn't", 'o', "that'll", "you'd", 'few', 'won', 'once', 'some', 'doing', "aren't", "you've", 'with', 'under', "mustn't", 'too', 'needn', 'isn', 'yourself', "haven't", 'up', 'below', 'am', 'after', "it's", 'as', 'hadn', 'into', 'own', "you're", 'its', 'theirs', 'their', "isn't", "shan't", 'only', 'mightn', 'hasn', 'mustn', 'does', 'a', 'each', 'having', 'haven', 'they', "she's", 'at', 'can', 'but', 'been', 'did', "doesn't", 'down', 'than', 'are', 'for', 'where'}
POS tags that provide no value for sentiment analysis are filtered out; only the meaningful ones are kept.
The POS tags that are kept:
m_tags = ['MD', 'UH', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'RP', 'RB', 'RBR', 'RBS', 'JJ', 'JJR', 'JJS']
# extract words from a piece of text
def extract_words(text, debug=False):
    text = replace_abbreviations(text)
    if debug:
        print('after removing non-letter symbols:', text)
    m_words = nltk.word_tokenize(text)  # tokenize
    if debug:
        print('after tokenizing:', m_words)
    m_word_tags = nltk.pos_tag(m_words)  # POS tagging
    if debug:
        print('POS tags:', m_word_tags)
    m_words = [word for word, tag in m_word_tags if tag in m_tags]  # keep only the selected POS tags
    if debug:
        print('after POS filtering:', m_words)
    m_words = words_normalize(m_words)  # lemmatize
    if debug:
        print('after lemmatizing:', m_words)
    m_words = [word for word in m_words if word not in stop_words]    # remove NLTK stop words
    m_words = [word for word in m_words if word not in m_stop_words]  # remove custom stop words
    if debug:
        print('after stop-word removal:', m_words)
    return m_words
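A quick way to check the whole preprocessing chain on a single sentence (this assumes the required NLTK data packages such as punkt, averaged_perceptron_tagger, wordnet and stopwords have already been fetched with nltk.download):

sample = "It stopped working after 4 months, I'll never buy it again!"
print(extract_words(sample, debug=True))
# prints every intermediate stage and finally a short list of
# lemmatized, sentiment-bearing words, e.g. something like ['stop', 'never']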
LDA is then trained separately on the negative and positive corpora to obtain topic words for each.
get_topics.py:
# get the document topics, using the preprocessed review text (already lemmatized, POS-filtered, stop words removed, etc.)
def get_topics2(input_file):
    fr = open(input_file, 'r', encoding='utf-8')
    words_list = []  # list of word lists, one per review
    for line in fr.readlines():
        m_words = nltk.word_tokenize(line)
        # m_words = [word for word in m_words if word not in m_stop_words]
        words_list.append(m_words)
    # build the term-frequency matrix and train the LDA model
    dictionary = corpora.Dictionary(words_list)
    # corpus[0]: [(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), ...]
    # corpus is the id-mapped form of each review: (dictionary id, frequency) for every word in it
    corpus = [dictionary.doc2bow(words) for words in words_list]  # one entry per review
    lda = models.LdaModel(corpus=corpus, id2word=dictionary, num_topics=TOPIC_NUM)  # LDA training
    topic_list = lda.print_topics(TOPIC_NUM)
    print(len(topic_list), "topic(s); word distribution:\n")
    for topic in topic_list:
        print(topic)
    return topic_list
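get_topic_words itself is not shown here; presumably it calls get_topics2 once on each preprocessed corpus, roughly like this:

pos_topics = get_topics2(m_files[6])  # positive reviews -> positive topic words
neg_topics = get_topics2(m_files[7])  # negative reviews -> negative topic words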
Results of the analysis:
1 topic(s); word distribution: (positive)
(0, '0.022*"great" + 0.019*"well" + 0.015*"small" + 0.014*"good" + 0.013*"easy" + 0.011*"fit" + 0.010*"love" + 0.010*"need" + 0.009*"little" + 0.008*"much"')

1 topic(s); word distribution: (negative)
(0, '0.014*"replace" + 0.009*"last" + 0.008*"stop" + 0.008*"start" + 0.008*"back" + 0.008*"well" + 0.007*"never" + 0.007*"call" + 0.007*"turn" + 0.007*"open"')

['well', 'small', 'fit', 'good', 'great', 'easy', 'need', 'much', 'little', 'love']
['replace', 'well', 'turn', 'last', 'never', 'call', 'back', 'stop', 'open', 'start']
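The two plain word lists at the end drop the weights and keep only the topic words. One way to obtain such a list from a trained LdaModel (a sketch, not necessarily how the project's get_topic_words does it):

def topic_top_words(lda, topn=10):
    # show_topic returns (word, probability) pairs for one topic; keep just the words
    return [word for word, prob in lda.show_topic(0, topn=topn)]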
Gitee project: https://gitee.com/Meloor/LDATest
File path: LDA/get_topics.py
Reference blog: https://www.jianshu.com/p/4a0bd8498561