Author: 萱恭俊逸明靖 | Source: Internet | 2023-08-30 12:28
Website: http://weixin.sogou.com/weixin?type=2&query=python&page=1
The example includes an IP proxy pool!
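Note: the script assumes an IP-proxy-pool HTTP service is already running locally at http://127.0.0.1:5010/get/ and that it returns a bare ip:port string (the open-source proxy_pool project exposes such an interface; the exact service is an assumption here, the original does not name one). A quick sanity check before starting the crawler:

import requests
# hypothetical check of the assumed local proxy-pool service;
# should print something like 112.87.69.12:9999
print(requests.get('http://127.0.0.1:5010/get/').text)

The full script: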
import requests, re, pymongo, time
from fake_useragent import UserAgent
from urllib.parse import urlencode
from pyquery import PyQuery
from requests.exceptions import ConnectionError

client = pymongo.MongoClient('localhost')
db = client['weixin']
key_word = 'python开发'
connection_count = 0         # number of failed requests to the list page
connection_detail_count = 0  # number of failed requests to the detail page

headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:60.0) Gecko/20100101 Firefox/60.0',
'COOKIE': 'CXID=161A70BF2483DEF017E035BBBACD2A81; [email protected]@@@@@@@@@@; SUID=57A70FAB5D68860A5B1E1053000BC731; IPLOC=CN4101; SUV=1528705320668261; pgv_pvi=5303946240; ABTEST=5|1528705329|v1; SNUID=EF1FB713B7B2D9EE6E2A6351B8B3F072; weixinIndexVisited=1; sct=2; SUIR=F607AE0BA0A5CFF9D287956DA129A225; pgv_si=s260076544; JSESSIOnID=aaaILWONRn9wK_OiUhlnw; PHPSESSID=1i38a2ium8e5th2ukhnufua6r1; ppinf=5|1528783576|1529993176|dHJ1c3Q6MToxfGNsaWVudGlkOjQ6MjAxN3x1bmlxbmFtZToxODolRTklQUQlOTQlRTklOTUlOUN8Y3J0OjEwOjE1Mjg3ODM1NzZ8cmVmbmljazoxODolRTklQUQlOTQlRTklOTUlOUN8dXNlcmlkOjQ0Om85dDJsdUtPQzE0d05mQkJFeUI2d1VJVkhZUE1Ad2VpeGluLnNvaHUuY29tfA; pprdig=ENOZrtvLfoIOct75SgASWxBJb8HJQztLgFbyhRHBfeqrzcirg5WQkKZU2GDCFZ5wLI93Wej3P0hCr_rST0AlvGpF6MY9h24P267oHdqJvgP2DmCHDr2-nYvkLqKs8bjA7PLM1IEHNaH4zK-q2Shcz2A8V5IDw0qEcEuasGxIZQk; sgid=23-35378887-AVsfYtgBzV8cQricMOyk9icd0; ppmdig=15287871390000007b5820bd451c2057a94d31d05d2afff0',
}
def get_proxy():
    try:
        response = requests.get("http://127.0.0.1:5010/get/")
        if response.status_code == 200:
            return response.text
        return None
    except Exception as e:
        print('Failed to fetch a proxy:', e)
        return None
def get_page_list(url):
    global connection_count
    proxies = get_proxy()
    print('List-page proxy:', proxies)
    # request the url and fetch the page source
    if proxies is not None:
        proxies = {
            'http': 'http://' + proxies
        }
    try:
        response = requests.get(url, allow_redirects=False, headers=headers, proxies=proxies)
        if response.status_code == 200:
            print('List page {} fetched successfully'.format(url))
            return response.text
        print('Status code:', response.status_code)
        if response.status_code == 302:
            # anti-spider redirect: switch to a new proxy and retry recursively
            return get_page_list(url)
    except ConnectionError as e:
        print('Failed to connect to {}: {}'.format(url, e))
        connection_count += 1
        if connection_count == 3:
            return None
        # retry with another proxy, bounded by the failure counter above
        return get_page_list(url)
def parse_page_list(html):
    obj = PyQuery(html)
    all_a = obj('.txt-box > h3 > a').items()
    for a in all_a:
        href = a.attr('href')
        yield href
def get_page_detail(url):
    """
    Request a detail (article) page.
    :param url: url of the detail page
    :return: page source, or None on failure
    """
    global connection_detail_count
    proxies = get_proxy()
    print('Detail-page proxy:', proxies)
    # request the url and fetch the page source
    if proxies is not None:
        proxies = {
            'http': 'http://' + proxies
        }
    try:
        # Note: unlike the list page, allow_redirects=False is dropped here;
        # the detail pages are served over https, hence verify=False.
        response = requests.get(url, headers=headers, verify=False, proxies=proxies)
        if response.status_code == 200:
            print('Detail page {} fetched successfully'.format(url))
            return response.text
        else:
            print('Status code:', response.status_code, url)
            # switch to a new proxy and retry recursively
            return get_page_detail(url)
    except ConnectionError as e:
        print('Failed to connect to {}: {}'.format(url, e))
        connection_detail_count += 1
        if connection_detail_count == 3:
            return None
        # retry with another proxy, bounded by the failure counter above
        return get_page_detail(url)
def parse_page_detail(html):
    obj = PyQuery(html)
    # title = obj('#activity-name').text()
    info = obj('.profile_inner').text()
    weixin = obj('.xmteditor').text()
    print(info)
    return {
        'info': info,
        'weixin': weixin
    }
def save_to_mongodb(data):
    # insert_one would create duplicates on repeated runs, so an upsert is used instead:
    # db['article'].insert_one(data)
    # update_one arguments:
    #   1st: the filter - which field/value to look up in the collection
    #   2nd: if a matching document is found it is updated, otherwise a new one is
    #        inserted; '$set' is the fixed update-operator syntax
    #   3rd: upsert - allow an insert when no match is found
    db['article'].update_one({'info': data['info']}, {'$set': data}, True)
    time.sleep(1)
def main():
    for x in range(1, 101):
        # build the list-page url; urlencode handles the Chinese keyword and the page number
        url = 'http://weixin.sogou.com/weixin?' + urlencode({'query': key_word, 'type': 2, 'page': x})
        html = get_page_list(url)
        if html is not None:
            # urls of the detail pages
            urls = parse_page_list(html)
            for url in urls:
                detail_html = get_page_detail(url)
                if detail_html is not None:
                    data = parse_page_detail(detail_html)
                    if data is not None:
                        save_to_mongodb(data)


if __name__ == '__main__':
    main()
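After the crawler has run for a while, the saved records can be inspected straight from MongoDB. A minimal sketch, assuming the script above has already written documents into the article collection of the local weixin database:

import pymongo

client = pymongo.MongoClient('localhost')
articles = client['weixin']['article']

print('documents stored:', articles.count_documents({}))
# show a few saved records; 'info' and 'weixin' are the fields built in parse_page_detail
for doc in articles.find().limit(5):
    print(doc.get('info'), '|', doc.get('weixin'))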