"""Scrape the Baidu top-buzz board: collect each keyword and its heat value
into a pandas DataFrame (rows: title / heat)."""
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Page to scrape (Baidu real-time buzz ranking).
URL = "http://top.baidu.com/buzz?b=341&c=513&fr=topbuzz_b1_c513"

# Browser-like User-Agent so the request is not rejected as a bot.
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 6.3; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/69.0.3497.100 Safari/537.36"
    )
}


def parse_buzz(html):
    """Extract the ranking info from the page markup.

    Parameters
    ----------
    html : str
        Raw HTML of the buzz page.

    Returns
    -------
    tuple[list[str], list[str]]
        (titles, heats): text of every element with class "keyword" and
        every element with class "icon-rise", whitespace-stripped.
        NOTE(review): the two lists are assumed to align by rank —
        confirm against the live page structure.
    """
    soup = BeautifulSoup(html, "html.parser")
    titles = [tag.get_text().strip() for tag in soup.find_all(class_="keyword")]
    heats = [tag.get_text().strip() for tag in soup.find_all(class_="icon-rise")]
    return titles, heats


def main():
    """Fetch the page, parse it, and return the resulting DataFrame."""
    # Bug fix: HEADERS was previously built but never sent with the request;
    # a timeout is added so a stalled server cannot hang the script forever.
    resp = requests.get(URL, headers=HEADERS, timeout=10)
    resp.raise_for_status()  # fail loudly on HTTP errors
    # Use the detected (most likely) encoding before reading .text.
    resp.encoding = resp.apparent_encoding
    titles, heats = parse_buzz(resp.text)
    # Bug fix: the DataFrame was previously constructed and discarded;
    # now it is printed and returned to the caller.
    frame = pd.DataFrame([titles, heats], index=["标题", "热度"])
    print(frame)
    return frame


if __name__ == "__main__":
    main()