作者:lixinglon257 | 来源:互联网 | 2023-06-26 07:24
网站:
http://www.kugou.com/yy/html/rank.html
爬取目标:
酷狗飙升榜的歌手、歌曲名字、歌曲链接等内容,存到 MongoDB 数据库中(代码使用 pymongo)
网页解析:
此次爬取采用三种解析方式:XPath(lxml)、pyquery、BeautifulSoup(bs4),三者提取的字段相同,任选其一即可。
代码如下:
import requests
from lxml import etree
import pymongo
from pyquery import PyQuery as pq
from bs4 import BeautifulSoup


# Ranking page that is downloaded and parsed by this script.
RANK_URL = 'http://www.kugou.com/yy/html/rank.html'
# Seconds to wait for the server before giving up on the request.
REQUEST_TIMEOUT = 10


def get_info():
    """Download the ranking page.

    Returns:
        The page body as text when the server answers with HTTP 200,
        otherwise ``None`` (non-200 status or any ``requests`` failure).
    """
    try:
        response = requests.get(RANK_URL, timeout=REQUEST_TIMEOUT)
    except requests.RequestException:
        # Covers connection errors as well as the timeout configured above.
        return None
    if response.status_code == 200:
        return response.text
    return None


def get_detail_info(response: str) -> list:
    """Extract the song entries from the page with lxml / XPath.

    Args:
        response: HTML text of the ranking page.

    Returns:
        A list of dicts with the keys ``msg_star``, ``msg_address`` and
        ``msg_lianjie``, one per ``<li>`` of the song list.

    Raises:
        IndexError: if a ``<li>`` lacks one of the expected attributes.
    """
    html = etree.HTML(response)
    result = html.xpath(
        '//div[@id="rankWrap"]'
        '/div[@class="pc_temp_songlist pc_rank_songlist_short"]/ul/li'
    )
    list1 = []
    for msg in result:
        # The <li> title attribute; the original author describes it as
        # holding the singer name and the song name.
        msg_star = msg.xpath('./@title')[0]
        # Title of the <i> inside the "pc_temp_tips_l" span (described by
        # the original author as the song's chart name).
        msg_address = msg.xpath('./span[@class="pc_temp_tips_l"]/i/@title')[0]
        # Link to the song: href of the <a> directly under the <li>.
        msg_lianjie = msg.xpath('./a/@href')[0]
        list1.append({
            "msg_star": msg_star,
            'msg_address': msg_address,
            'msg_lianjie': msg_lianjie,
        })
    return list1


def get_detail_info_css(response: str) -> list:
    """Extract the song entries from the page with pyquery (CSS selectors).

    Args:
        response: HTML text of the ranking page.

    Returns:
        A list of dicts with the keys ``msg_star``, ``msg_address`` and
        ``msg_lianjie``; a value is ``None`` when the attribute is absent.
    """
    doc = pq(response)
    lis = doc('#rankWrap').find('ul').find('li')
    list1 = []
    # .items() yields each <li> wrapped as a PyQuery object.
    for msg in lis.items():
        list1.append({
            "msg_star": msg.attr.title,
            'msg_address': msg.children('.pc_temp_tips_l').find('i').attr.title,
            'msg_lianjie': msg.find('a').attr.href,
        })
    return list1


def get_detail_info_xml(response: str) -> list:
    """Extract the song entries from the page with BeautifulSoup.

    Args:
        response: HTML text of the ranking page.

    Returns:
        A list of dicts with the keys ``msg_star``, ``msg_address`` and
        ``msg_lianjie``. The list is empty when the song-list container
        (or its ``<ul>``) is not present in the page.
    """
    list1 = []
    soup = BeautifulSoup(response, 'lxml')
    container = soup.find(class_='pc_temp_songlist pc_rank_songlist_short')
    info = container.ul if container is not None else None
    if info is None:
        # Page layout differs from what is expected: nothing to extract.
        return list1
    for msg in info.select('li'):
        msg_star = msg.attrs['title']
        msg_address = msg.find(class_='pc_temp_tips_l').i.attrs['title']
        msg_lianjie = msg.a.attrs['href']
        print(msg_lianjie)
        list1.append({
            "msg_star": msg_star,
            'msg_address': msg_address,
            'msg_lianjie': msg_lianjie,
        })
    return list1


def db(list1):
    """Store every record of *list1* in MongoDB.

    Connects to ``localhost:27017`` and inserts each dict into the
    ``music`` collection of the ``test`` database, printing the record and
    the id MongoDB assigned to it.

    Args:
        list1: iterable of dicts as produced by the ``get_detail_info*``
            functions. ``insert_one`` adds an ``_id`` key to each dict.
    """
    # The context manager closes the client connection on exit.
    with pymongo.MongoClient(host='localhost', port=27017) as client:
        collection = client.test.music
        for music_info in list1:
            print(music_info)
            # Collection.insert() was removed in PyMongo 4; insert_one() is
            # its replacement and exposes the new document id explicitly.
            result = collection.insert_one(music_info)
            print(result.inserted_id)


def main():
    """Download the ranking page, parse it and store the result."""
    response = get_info()
    if response is None:
        print('Failed to download the ranking page; nothing stored.')
        return
    # get_detail_info (XPath) and get_detail_info_css (pyquery) extract the
    # same fields and can be swapped in here; BeautifulSoup is used.
    list1 = get_detail_info_xml(response)
    db(list1)


if __name__ == '__main__':
    main()