热门标签 | HotTags
当前位置:  开发笔记 > 编程语言 > 正文

开发笔记:爬取知乎话题async使用协程

本文由编程笔记#小编为大家整理,主要介绍了爬取知乎话题async使用协程相关的知识,希望对你有一定的参考价值。
本文由编程笔记#小编为大家整理,主要介绍了爬取知乎话题async使用协程相关的知识,希望对你有一定的参考价值。



import requests
import json
import time
from pyquery import PyQuery
import pandas as pd
from collections import OrderedDict
import multiprocessing
import asyncio
from functools import partial
# The interactive input() prompts for the cookie and the start URL are
# disabled; both values are hard-coded below instead.

# Page of the topic feed where crawling starts. Further pages are found by
# following the `paging.next` link contained in each API response.
init_url = 'https://www.zhihu.com/api/v4/topics/19562045/feeds/top_activity?offset=5&limit=10'

# HTTP headers sent with every request to the Zhihu API.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) '
        'AppleWebKit/604.1.38 (Khtml, like Gecko) '
        'Version/11.0 Mobile/15A372 Safari/604.1'
    ),
    # NOTE(review): '**' looks like a redacted placeholder -- presumably a
    # real session cookie has to be supplied here; confirm before running.
    'COOKIE': '**',
    'Referer': 'https://www.zhihu.com/topic/19606409/hot',
    'Host': 'www.zhihu.com',
    'X-UDID': 'AGDlzA1itw2PTr6aWsPp6OtejkxQ9iF7xgA=',
}
def get_all_url(url):
    """Follow the feed's pagination links and collect every next-page URL.

    Starting at *url*, each page is fetched with the module-level
    ``headers``, its ``paging.next`` link is appended to the module-level
    ``url_list``, and crawling continues from that link until a page
    reports ``paging.is_end``.

    Note that *url* itself is never added to ``url_list``; only the
    ``next`` links are, including the one returned by the final page.

    Args:
        url: API URL of the page to start from.

    Returns:
        The module-level ``url_list`` (the same object that was mutated).

    Raises:
        KeyError: if a response carries no ``paging`` information.
    """
    # Iterate instead of recursing: the recursive form dropped the return
    # value of the nested call and could hit the recursion limit on long
    # feeds.
    while True:
        res = requests.get(url, headers=headers)
        paging = json.loads(res.text)['paging']
        next_page_url = paging['next']
        url_list.append(next_page_url)
        print(len(url_list))  # progress indicator: pages discovered so far
        if paging['is_end']:
            return url_list
        url = next_page_url
async def get_all_data(url):
    """Fetch one feed page and append its answers to ``data_list``.

    The blocking ``requests.get`` call is handed to the event loop's default
    executor so that several pages can be downloaded concurrently. Every
    feed entry whose target type is ``'answer'`` becomes one ordered record
    (title, content, comment_count, voteup_count) appended to the
    module-level ``data_list``; other entry types are skipped.

    Args:
        url: API URL of the feed page to download.
    """
    # Ask asyncio for the loop that is running this coroutine rather than
    # relying on a module-level `loop` variable being defined elsewhere.
    loop = asyncio.get_event_loop()
    res = await loop.run_in_executor(
        None, partial(requests.get, url, headers=headers)
    )
    data = json.loads(res.text)
    print(len(data_list))  # progress indicator: records gathered so far
    for item in data['data']:
        target = item['target']
        if target['type'] != 'answer':
            continue
        row = OrderedDict()
        row['title'] = target['question']['title'] or ''
        try:
            row['content'] = PyQuery(target['content']).text()
        except Exception:
            # Best effort: fall back to the short excerpt when the full
            # content is missing or cannot be parsed.
            row['content'] = PyQuery(target['excerpt']).text()
        row['comment_count'] = target['comment_count']
        row['voteup_count'] = target['voteup_count']
        data_list.append(row)
if __name__ == '__main__':
    # Shared accumulators filled in by get_all_url() and get_all_data().
    data_list = []
    url_list = []

    # Step 1: walk the pagination chain synchronously to learn every URL.
    get_all_url(init_url)

    # Step 2: download all pages concurrently. The loop is created and
    # installed explicitly (and before any task is scheduled) instead of
    # relying on asyncio implicitly creating one.
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    try:
        tasks = [loop.create_task(get_all_data(url)) for url in url_list]
        # asyncio.wait() raises ValueError when given no tasks.
        if tasks:
            loop.run_until_complete(asyncio.wait(tasks))
    finally:
        loop.close()

    # Step 3: dump the collected records to a timestamped Excel workbook.
    df1 = pd.DataFrame(data_list)
    df1.to_excel('保险' + time.strftime("%Y%m%d%H%M%S") + '.xlsx', index=False)
    print('done')

 


推荐阅读
author-avatar
浩哥
这个家伙很懒,什么也没留下!
PHP1.CN | 中国最专业的PHP中文社区 | DevBox开发工具箱 | json解析格式化 |PHP资讯 | PHP教程 | 数据库技术 | 服务器技术 | 前端开发技术 | PHP框架 | 开发工具 | 在线工具
Copyright © 1998 - 2020 PHP1.CN. All Rights Reserved | 京公网安备 11010802041100号 | 京ICP备19059560号-4 | PHP1.CN 第一PHP社区 版权所有