热门标签 | HotTags
当前位置:  开发笔记 > 编程语言 > 正文

requests爬虎嗅

2019独角兽企业重金招聘Python工程师标准 import re; import datetime; import time, redis; from bs4 import BeautifulS…

2019独角兽企业重金招聘Python工程师标准>>> hot3.png

import datetime
import re
import time

import redis
import requests
from bs4 import BeautifulSoup
from lxml import etree
from pymongo import MongoClient
from pymysql import connect
from selenium import webdriver
from selenium.webdriver import DesiredCapabilities

# Seconds to wait for any outbound HTTP call before giving up, so a stalled
# server cannot hang the hourly crawl forever.
REQUEST_TIMEOUT = 30

# Pulls the image URL (without its query string) out of an inline CSS
# ``background-image: url('...?...')`` declaration.
_STYLE_URL_RE = re.compile(r'''url\(\s*['"]?([^'")?]+)''')


def _first(values, default=''):
    """Return the first element of an XPath result list, or *default* if empty."""
    return values[0] if values else default


class ArticleFilter(object):
    """Classify an article by keyword frequency and screen it for sensitive words.

    Keyword lists, the keyword -> group-id hash and the sensitive-word list are
    all read from the Redis database configured in ``__init__``.
    """

    def __init__(self, title, content):
        """Store the article and load the keyword lists from Redis.

        :param title: article title (str).
        :param content: article body as produced by
            ``huxiu_spider.second_analysis`` - a list of dicts that carry a
            ``'text'`` and/or ``'img'`` key.
        """
        # The original host literal contained a full-width ideographic full
        # stop and the port was passed as a string.
        self.redis_client = redis.StrictRedis(host='127.0.0.1', port=6379, db=9)
        self.first_keywords = self._load_words('first_keywords')
        self.second_keywords = self._load_words('second_keywords')
        self.title = title
        self.content = content
        self.group_id_list = list()

    def _load_words(self, key):
        """Read a comma-separated Redis string value and return it as a list of str.

        Redis hands back ``bytes`` (or ``None`` for a missing key); calling
        ``str()`` on either yields garbage such as ``"b'a,b'"`` or ``'None'``,
        so the value is decoded explicitly and empty entries are dropped
        (``str.count('')`` would otherwise match everywhere).
        """
        raw = self.redis_client.get(key)
        if raw is None:
            return []
        if isinstance(raw, bytes):
            raw = raw.decode('utf-8')
        return [word for word in raw.split(',') if word]

    def _content_text(self):
        """Concatenate every ``'text'`` fragment of the article body."""
        if not isinstance(self.content, list):
            return ''
        return ''.join(item.get('text') for item in self.content if item.get('text'))

    @staticmethod
    def _count_keywords(text, keywords):
        """Map each keyword that occurs in *text* to its occurrence count."""
        counts = dict()
        for keyword in keywords:
            num = text.count(keyword)
            if num > 0:
                counts[keyword] = num
        return counts

    def article_content_filter(self):
        """Pick the classification keyword(s) from the article body.

        :return: ``{'first_keywords': str | list}`` or
            ``{'second_keywords': str | list}``, or ``False`` when the body is
            not a list or nothing matches.
        """
        if not isinstance(self.content, list):
            return False
        text = self._content_text()

        # A single most-frequent first-level keyword wins outright.
        first_res = self.select_high(self._count_keywords(text, self.first_keywords))
        if len(first_res) == 1:
            return {'first_keywords': first_res[0][0]}

        # None or several first-level winners: fall back to second-level keywords.
        second_res = self.select_high(self._count_keywords(text, self.second_keywords))
        if len(second_res) == 1:
            return {'second_keywords': second_res[0][0]}
        if len(second_res) > 1:
            # Several tied second-level keywords: list the article under each.
            return {'second_keywords': [item[0] for item in second_res]}
        if len(first_res) > 1:
            # No second-level hit but several tied first-level keywords:
            # list the article under each of them.
            return {'first_keywords': [item[0] for item in first_res]}
        return False

    def article_title_filter(self):
        """Pick the classification keyword from the title.

        :return: ``{'first_keywords': str}`` when exactly one first-level
            keyword is the most frequent in the title, otherwise ``False``.
        """
        first_res = self.select_high(self._count_keywords(self.title, self.first_keywords))
        if len(first_res) == 1:
            return {'first_keywords': first_res[0][0]}
        return False

    def article_filter(self):
        """Main entry point: resolve the article to its group ids.

        The title is tried first and the body second.

        :return: list of group ids as returned by Redis ``HGET`` (one per
            matched keyword), or ``None`` when no keyword matched.
        """
        matched = self.article_title_filter() or self.article_content_filter()
        keywords = None
        if matched:
            keywords = matched.get('first_keywords')
            if not isinstance(keywords, (str, list)):
                keywords = matched.get('second_keywords')
        if isinstance(keywords, str):
            keywords = [keywords]
        if not isinstance(keywords, list):
            self.group_id_list = None
            return self.group_id_list
        for keyword in keywords:
            self.group_id_list.append(self.get_keyword_group_id(keyword))
        return self.group_id_list

    @staticmethod
    def select_high(keyword_dict):
        """Return the ``(keyword, count)`` pairs that share the highest count.

        Pairs keep the dict's insertion order; an empty dict yields ``[]``.
        """
        if not keyword_dict:
            return []
        top = max(keyword_dict.values())
        return [(keyword, num) for keyword, num in keyword_dict.items() if num == top]

    def get_keyword_group_id(self, keyword):
        """Look up the article group id mapped to *keyword* in Redis."""
        article_group_id = self.redis_client.hget('group_id_of_keyword', keyword)
        return article_group_id

    def sensitive_words_filter(self):
        """Report whether the title or body contains any sensitive word.

        The check is best-effort: if the word list cannot be loaded the
        article is treated as clean, but the failure is reported instead of
        being swallowed silently.

        :return: ``True`` if a sensitive word is present, else ``False``.
        """
        try:
            sensitive_words = self._load_words('sensitive_words')
        except redis.RedisError as e:
            print('敏感词加载失败:', e)
            return False
        if not sensitive_words:
            return False
        text = self._content_text()
        # Every word is checked; the original returned after the first one.
        return any(word in self.title or word in text for word in sensitive_words)


class huxiu_spider(object):
    """Crawl the huxiu.com front page and store new articles in MongoDB and MySQL."""

    def __init__(self):
        self.base_url = 'https://www.huxiu.com/'
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64; rv:51.0) Gecko/20100101 Firefox/51.0'}

    def send_request(self, url):
        """GET *url* with the spider's headers and return the response body as text."""
        response = requests.get(url, headers=self.headers, timeout=REQUEST_TIMEOUT)
        text = response.text
        return text

    def _list_image(self, res):
        """Upload the list thumbnail of one front-page entry and return its OSS URL.

        Three markups are tried in turn: two ``<img data-original>`` layouts,
        then a video-style ``<div style="background-image:url(...)">``.
        Returns ``''`` when the entry has no recognisable thumbnail.
        """
        for path in ('div/a/img/@data-original', 'a/div/img/@data-original'):
            found = res.xpath(path)
            if found:
                return self.upload_oss(found[0])
        styles = res.xpath('div/a/div/@style')
        if styles:
            match = _STYLE_URL_RE.search(styles[0])
            if match:
                return self.upload_oss(match.group(1))
        return ''

    def first_analysis(self, text):
        """Parse the front-page article list and persist every new article.

        Entries that lack a title or link, already exist in MySQL, fail to
        download, or contain sensitive words are skipped.

        :param text: HTML of the front page.
        :return: list of the article dicts that were saved.
        """
        new_list = []
        selector = etree.HTML(text) if text else None
        if selector is None:
            print('成功获取到%s篇文章' % len(new_list))
            return new_list
        results = selector.xpath('//*[@id="index"]/div[2]/div[2]/div')
        web_name = '虎嗅网'
        for i, res in enumerate(results, start=1):
            title = _first(res.xpath('div[1]/h2/a/text()'), None)
            if title is None:
                # Not an article card (ad / separator); nothing to crawl.
                continue
            print('正在爬取第%s篇文章,标题是:%s' % (i, title))
            num = self.get_title(title, web_name)
            print('查看文章是否存在=====')
            if num != 0:
                continue
            print('文章不存在~~~')
            url = _first(res.xpath('div/h2/a[starts-with(@href, "/article")]/@href'), None)
            if url is None:
                continue
            article_link = 'https://www.huxiu.com' + url
            article_content, article_time = self.second_analysis(article_link)
            if article_content == 1:
                # second_analysis signals failure with the (1, 1) sentinel.
                continue
            print('敏感词开始过滤')
            article_filter_obj = ArticleFilter(title, article_content)
            if article_filter_obj.sensitive_words_filter():
                print('文章存在敏感词汇')
                continue
            summary = _first(res.xpath('div/div[2]/text()'))
            name = _first(res.xpath('div/div/a/span/text()'))
            oss_url = self._list_image(res)
            res_dict = {
                'web_name': web_name,
                'content': article_content,
                'date': article_time,
                'article_link': article_link,
                'title': title,
                'summary': summary,
                'name': name,
                'min_pic': oss_url,
            }
            self.upload_mongo(res_dict)
            self.upload_mysql(title, name, article_time, oss_url, summary, web_name, article_link)
            print('成功获取并保存第%s篇文章' % i)
            new_list.append(res_dict)
        # Report what was actually saved, not how many entries were visited.
        print('成功获取到%s篇文章' % len(new_list))
        return new_list

    def _render_page(self, url):
        """Load *url* in PhantomJS and return the rendered page source."""
        # Forward the spider's request headers to PhantomJS.
        cap = DesiredCapabilities.PHANTOMJS.copy()
        for key, value in self.headers.items():
            cap['phantomjs.page.customHeaders.{}'.format(key)] = value
        browser = webdriver.PhantomJS(
            '/usr/local/lib/phantomjs-2.1.1-linux-x86_64/bin/phantomjs',
            desired_capabilities=cap)
        try:
            browser.get(url)
            # Give client-side scripts time to populate the article body.
            time.sleep(3)
            return browser.page_source
        finally:
            # Always reap the PhantomJS process, even when loading fails.
            browser.quit()

    def second_analysis(self, url):
        """Download one article page and extract its body and publish time.

        :param url: absolute article URL.
        :return: ``(content, publish_time)`` where *content* is a list of
            ``{'text': ...}`` / ``{'img': ...}`` dicts, or the sentinel
            ``(1, 1)`` when the page could not be fetched or parsed.
        """
        try:
            html = self._render_page(url)
            selector = etree.HTML(html)

            # Publish time.
            # NOTE(review): the second XPath is pinned to one specific article
            # id and paragraph index, so it can only ever match that page.
            article_time = _first(
                selector.xpath('//div[@class="column-link-box"]/span[1]/text()')
                or selector.xpath('//*[@id="article_content301428"]/p[138]/span[2]/text() '))

            # Header image inside the article.
            article_min_pic = _first(selector.xpath('//div[@class="article-img-box"]/img/@src'))

            # Article body; a missing node raises and is reported below.
            content = selector.xpath('//*[@class="article-content-wrap"]')[0]
            result = etree.tostring(content, method='html')
            print('获取到文章内容')

            soup = BeautifulSoup(result, 'html.parser', from_encoding='utf-8')
            new_list = []
            for tag in soup.find_all(["p", "img"]):
                res = {}
                data = tag.get_text()
                if data:
                    # Drop all whitespace and stray byte-order marks.
                    new_data = "".join(data.split()).replace(u'\ufeff', '')
                    if new_data != "":
                        res["text"] = new_data
                link = tag.get('src')
                if link:
                    oss_url = self.upload_oss(link)
                    if oss_url:
                        res["img"] = oss_url
                if res:
                    new_list.append(res)
            if article_min_pic != '':
                article_min_pic = self.upload_oss(article_min_pic)
                new_list.insert(0, {'img': article_min_pic})
            return new_list, article_time
        except Exception as e:
            print('文章不存在了', e)
            return 1, 1

    def upload_oss(self, url):
        """Mirror the image at *url* to OSS and return its HTTPS CDN URL.

        :return: the CDN URL, or ``''`` when the upload API returns no URL.
        """
        kw = {'fileurl': url, 'filepath': 'gander_goose/dev/test2'}
        result = requests.post(url='http://api.max-digital.cn/Api/oss/uploadByUrl',
                               data=kw, timeout=REQUEST_TIMEOUT)
        result = result.json()
        oss_url = result.get('oss_file_url')
        if not oss_url:
            return ''
        oss_url = oss_url.replace('maxpr.oss-cn-shanghai.aliyuncs.com', 'cdn.max-digital.cn')
        # Upgrade the scheme only; a blanket replace('http', 'https') would
        # turn an already-secure URL into 'httpss://...'.
        if oss_url.startswith('http://'):
            oss_url = 'https://' + oss_url[len('http://'):]
        return oss_url

    def upload_mongo(self, article_dict):
        """Insert *article_dict* into the ``wechat.articles`` MongoDB collection."""
        client = None
        try:
            client = MongoClient('127.0.0.1', 27017)
            my_db = client.wechat
            my_db.articles.insert_one(article_dict)
            print('上传到mongo成功')
        except Exception as e:
            print('上传到mongo失败:', e)
        finally:
            if client is not None:
                client.close()

    def upload_mysql(self, title, name, date, oss_url, summary, web_name, link):
        """Insert the article into MySQL, attach it to group 1 and mark it shown.

        All three statements run in one transaction; on any error the
        transaction is rolled back and the failure is printed.
        """
        conn = None
        try:
            # NOTE(review): this writes to database 'wechat' while get_title
            # reads from 'zj' - confirm which one is intended.
            conn = connect(host='localhost', port=3306, database='wechat',
                           user='root', password='mysql', charset='utf8')
            cs1 = conn.cursor()
            now = datetime.datetime.now()
            imgurl = "https://cdn.max-digital.cn/gander_goose/dev/test2/15368082362561.jpg"
            # Values are passed as parameters: scraped titles and summaries
            # routinely contain quotes that would break string-built SQL.
            sql1 = ("insert into article_info (title,author,wechat_art_date,min_pic,summary,"
                    "web_name,is_show,is_big,link,round_head_img,create_time) "
                    "values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)")
            cs1.execute(sql1, (title, name, date, oss_url, summary, web_name,
                               0, 0, link, imgurl, now))
            # Id of the article row that was just inserted.
            new_article_id = int(conn.insert_id())
            # Publish under the "24 hours" group (group_id 1).
            sql3 = ('insert into article_group (article_id,group_id,sort_num,create_time) '
                    'values (%s, %s, %s, %s)')
            cs1.execute(sql3, (new_article_id, 1, 1, now))
            # Flip the article to "shown".
            sql4 = "update article_info set is_show = 1, zj_art_date=%s where id=%s"
            cs1.execute(sql4, (now, new_article_id))
            conn.commit()
            cs1.close()
            print('上传到mysql成功')
        except Exception as e:
            if conn is not None:
                conn.rollback()
            print('mysql上传失败:', e)
        finally:
            if conn is not None:
                conn.close()

    def get_title(self, title, query):
        """Return how many stored articles have this *title* for site *query*.

        :param title: article title to look up.
        :param query: value of the ``web_name`` column to match.
        :return: number of matching rows (``0`` means the article is new).
        """
        conn = connect(host='127.0.0.1', port=3306, database='zj',
                       user='root', password='mysql', charset='utf8')
        try:
            cs1 = conn.cursor()
            res = 'select 1 from article_info where title = %s and web_name = %s'
            num = cs1.execute(res, (title, query))
            cs1.close()
            return num
        finally:
            conn.close()

    def run(self):
        """Fetch the front page once and process every article on it."""
        text = self.send_request(self.base_url)
        self.first_analysis(text)


if __name__ == '__main__':
    huxiu = huxiu_spider()
    while True:
        start_time = time.time()
        print('开始时间:', time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(start_time)))
        try:
            huxiu.run()
        except Exception as e:
            # One failed round (network, DB, markup change) must not stop
            # the hourly schedule.
            print('本轮抓取失败:', e)
        time.sleep(3600)


转载于:https://my.oschina.net/u/3892643/blog/3055002


推荐阅读
  • 技术分享:从动态网站提取站点密钥的解决方案
    本文探讨了如何从动态网站中提取站点密钥,特别是针对验证码(reCAPTCHA)的处理方法。通过结合Selenium和requests库,提供了详细的代码示例和优化建议。 ... [详细]
  • 根据最新发布的《互联网人才趋势报告》,尽管大量IT从业者已转向Python开发,但随着人工智能和大数据领域的迅猛发展,仍存在巨大的人才缺口。本文将详细介绍如何使用Python编写一个简单的爬虫程序,并提供完整的代码示例。 ... [详细]
  • 本文详细介绍了Java中org.neo4j.helpers.collection.Iterators.single()方法的功能、使用场景及代码示例,帮助开发者更好地理解和应用该方法。 ... [详细]
  • Explore how Matterverse is redefining the metaverse experience, creating immersive and meaningful virtual environments that foster genuine connections and economic opportunities. ... [详细]
  • 本文详细介绍如何使用Python进行配置文件的读写操作,涵盖常见的配置文件格式(如INI、JSON、TOML和YAML),并提供具体的代码示例。 ... [详细]
  • 深入理解Tornado模板系统
    本文详细介绍了Tornado框架中模板系统的使用方法。Tornado自带的轻量级、高效且灵活的模板语言位于tornado.template模块,支持嵌入Python代码片段,帮助开发者快速构建动态网页。 ... [详细]
  • 本文详细解析了Python中的os和sys模块,介绍了它们的功能、常用方法及其在实际编程中的应用。 ... [详细]
  • 掌握远程执行Linux脚本和命令的技巧
    本文将详细介绍如何利用Python的Paramiko库实现远程执行Linux脚本和命令,帮助读者快速掌握这一实用技能。通过具体的示例和详尽的解释,让初学者也能轻松上手。 ... [详细]
  • MySQL 数据库迁移指南:从本地到远程及磁盘间迁移
    本文详细介绍了如何在不同场景下进行 MySQL 数据库的迁移,包括从一个硬盘迁移到另一个硬盘、从一台计算机迁移到另一台计算机,以及解决迁移过程中可能遇到的问题。 ... [详细]
  • 本文详细探讨了HTML表单中GET和POST请求的区别,包括它们的工作原理、数据传输方式、安全性及适用场景。同时,通过实例展示了如何在Servlet中处理这两种请求。 ... [详细]
  • 将Web服务部署到Tomcat
    本文介绍了如何在JDeveloper 12c中创建一个Java项目,并将其打包为Web服务,然后部署到Tomcat服务器。内容涵盖从项目创建、编写Web服务代码、配置相关XML文件到最终的本地部署和验证。 ... [详细]
  • 本文详细探讨了JDBC(Java数据库连接)的内部机制,重点分析其作为服务提供者接口(SPI)框架的应用。通过类图和代码示例,展示了JDBC如何注册驱动程序、建立数据库连接以及执行SQL查询的过程。 ... [详细]
  • 解决FCKeditor应用主题后上传问题及优化配置
    本文介绍了在Freetextbox收费后选择FCKeditor作为替代方案时遇到的上传问题及其解决方案。通过调整配置文件和调试工具,最终解决了上传失败的问题,并对相关配置进行了优化。 ... [详细]
  • 一、需求:        将MongoDB表中的数据按照时间戳增量抽取到Mysql表中。二、实现方式:   1.kettle    2.pytho ... [详细]
  • 我正在为我的Flask网络应用程序使用geopy库。我想将我从模态(html ... [详细]
author-avatar
沸腾的热水_948
这个家伙很懒,什么也没留下!
PHP1.CN | 中国最专业的PHP中文社区 | DevBox开发工具箱 | json解析格式化 |PHP资讯 | PHP教程 | 数据库技术 | 服务器技术 | 前端开发技术 | PHP框架 | 开发工具 | 在线工具
Copyright © 1998 - 2020 PHP1.CN. All Rights Reserved | 京公网安备 11010802041100号 | 京ICP备19059560号-4 | PHP1.CN 第一PHP社区 版权所有