作者:处男是你_909 | 来源:互联网 | 2023-09-04 18:44
requests:python基于http协议进行网络请求的第三方库;import requests;发送请求:requests.get(url, *, headers, params, proxies)…
requests
- python基于http协议进行网络请求的第三方库
import requests
发送请求
-
requests.get(url, *, headers, params, proxies) - 发送get请求
-
requests.post(url, *, headers, params, proxies) - 发送post请求
-
参数:
- url - 请求地址(一个网站的网址、接口地址、图片地址等)
- headers - 设置请求头(设置COOKIE和User-Agent的时候使用)
- params - 设置参数
- proxies - 设置代理
# Query parameters can be written directly into the URL. The URL must include
# a scheme (http:// or https://); without one requests raises MissingSchema.
requests.get('https://www.xxx.com?key=1234567890&num=10')

# Same request, but the query parameters are passed as a dict and requests
# encodes them into the query string.
params = {
    'key': '1234567890',
    'num': '10',
}
requests.get('https://www.xxx.com', params=params)
requests.post('https://www.xxx.com', params=params)
添加请求头
添加User-Agent
# Send a browser User-Agent string with the request.
header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36 Edg/92.0.902.67',
}
response = requests.get('https://www.51job.com', headers=header)
# Set the encoding used to decode the body before reading it as text.
response.encoding = 'gbk'
print(response.text)
添加cookie
# Send a cookie header alongside the User-Agent.
header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36 Edg/92.0.902.67',
    'COOKIE': '_zap',
}
response = requests.get('https://www.zhihu.com', headers=header)
print(response.text)
获取响应信息
response = requests.get('http://www.yingjiesheng.com/')
设置编码方式(乱码的时候才需要设置)
response.encoding = 'GBK'
获取响应头
head = response.headers
获取响应体
获取text值
res_text = response.text
获取json解析结果
res_json = response.json()
获取content值
- 获取二进制类型的原数据,用于图片、视频、音频的下载
res_con = response.content
json解析
# Request the hot-board endpoint, parse the body as JSON and print the
# 'Title' of every entry under the 'data' key.
url = 'https://www.toutiao.com/hot-event/hot-board/?origin=toutiao_pc&_signature=_02B4Z6wo00d01lV7XNAAAIDC1XmmkccfxtZVX1hAAPRc4q1lffyoHnSH6QUjeMpESOysibusQzMT4SFKQRYqPcysxXX0rosf5v95jp--h4Twooxn1Q7PvJH1hggqpvwVxGkcisci55B1y4d-13'
response = requests.get(url)
res_json = response.json()['data']
for news in res_json:
    print(news['Title'])
图片下载
def download_image(img_url):
    """Download one image and save it under ``files/``.

    The file name is the last path segment of ``img_url`` with everything
    from the first ``!`` onwards removed.

    Relies on ``requests`` being imported by the surrounding script.
    """
    response = requests.get(img_url)
    data = response.content
    file_name = img_url.split("/")[-1].split("!")[0]
    # A context manager closes the file even if the write fails; the handle
    # was previously left open.
    with open(f'files/{file_name}', 'wb') as f:
        f.write(data)
    print('下载完成!')


if __name__ == '__main__':
    response = requests.get('https://www.58pic.com/tupian/qixi-0-0.html')
    # NOTE(review): this pattern contains only the inline DOTALL flag, so it
    # matches the empty string at every position. The actual image-URL
    # pattern appears to be missing -- TODO restore it before running.
    result = findall(r'(?s)', response.text)
    for x in result:
        # Each match is prefixed with the scheme to form the download URL.
        download_image(f'https:{x}')
千图网图片获取
from re import findall

import requests


def download_image(img_url):
    """Download one image and save it under ``files/``.

    The file name is the last path segment of ``img_url``.
    """
    response = requests.get(img_url)
    data = response.content
    # Write the downloaded bytes and close the file; previously the file was
    # opened but nothing was written and the handle was never closed.
    with open(f'files/{img_url.split("/")[-1]}', 'wb') as f:
        f.write(data)


if __name__ == '__main__':
    # Template placeholders: fill in the page URL and the regular expression
    # that extracts image addresses before running.
    url = ''
    regular = r''
    response = requests.get(url)
    result = findall(regular, response.text)
bs4的使用
from bs4 import BeautifulSoup
- 准备需要解析的网页数据(实际是用requests或者selenium获取)
data = open('text1.html', encoding='utf-8').read()
- 创建BeautifulSoup对象(可以自动纠正数据中错误的html结构)
soup = BeautifulSoup(data, 'lxml')
- 通过BeautifulSoup对象获取标签和标签内容
- 获取标签
- BeautifulSoup对象.select(css选择器) - 获取css选择器选中的所有标签,返回列表,元素是选中的标签对象
- BeautifulSoup对象.select_one(css选择器) - 获取css选择器选中的第一个标签,返回标签对象
- 标签对象.select/select_one(css选择器) - 在标签对象中获取css选择器选中的标签
# select() returns a list of all matching tags; select_one() returns only
# the first match.
result = soup.select('p')
print(result)

result = soup.select_one('p')
print(result)

result = soup.select('#p1')
print(result)

result = soup.select_one('#p1')
print(result)
- 获取标签内容
- 标签对象.string - 获取标签中的文字内容,只有标签内容是纯文字才有效,否则结果是None
- 标签对象.get_text() - 标签内容中所有的文字信息
- 标签对象.contents - 获取标签的所有直接子节点(文字和子标签),返回列表
# Compare .string, .get_text() and .contents on two different tags.
p2 = soup.select_one('div>p')
print(p2)
print(p2.string)

s1 = soup.select_one('#s1')
print(s1)
print(s1.string)

print(p2.get_text())
print(s1.get_text())

print(p2.contents)
result = s1.contents
print(result)
print(result[-1].get_text())

# Tag attributes are read through the attrs mapping.
a1 = soup.select_one('div>a')
print(a1)
print(a1.attrs['href'])

img = soup.select_one('img')
print(img)
print(img.attrs['src'])
豆瓣电影TOP250信息抓取
import csv
import requests
from bs4 import BeautifulSoup
# Headers sent with every page request. The COOKIE value is a placeholder
# and must be replaced with a real cookie string before running.
header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36 Edg/92.0.902.67',
    'COOKIE': '自己的COOKIE信息',
}

# Open the CSV in a context manager so it is flushed and closed even if a
# request or a selector fails part-way through; the handle was previously
# never closed. Mode 'a' appends, so re-running adds rows to an existing file.
# NOTE(review): no header row is written by this code.
with open('files/doubantop250.csv', 'a', encoding='utf-8', newline='') as f:
    writer = csv.writer(f)
    # One request per start offset: 0, 25, ..., 225.
    for x in range(0, 226, 25):
        url = f'https://movie.douban.com/top250?start={x}&filter='
        print(url)
        res = requests.get(url, headers=header)
        soup = BeautifulSoup(res.text, 'lxml')
        all_movie_li = soup.select('#content>div>div.article>ol>li')
        for movie_info in all_movie_li:
            # The poster <img> carries both the title (alt) and image URL (src).
            poster = movie_info.select_one('.pic>a>img')
            mov_name = poster.attrs['alt']
            mov_img = poster.attrs['src']
            mov_star = movie_info.select_one('.rating_num').get_text()
            # Entries without an '.inq' element get None, which csv writes
            # as an empty field.
            quote_tag = movie_info.select_one('.inq')
            mov_quote = quote_tag.get_text() if quote_tag is not None else None
            mov_rev = movie_info.select('.info>.bd>.star>span')[3].get_text()
            writer.writerow([mov_name, mov_star, mov_rev, mov_img, mov_quote])
电影名,评分,评分人数,电影海报,短评
肖申克的救赎,9.7,2418141人评价,https://img2.doubanio.com/view/photo/s_ratio_poster/public/p480747492.jpg,希望让人自由。
霸王别姬,9.6,1799374人评价,https://img3.doubanio.com/view/photo/s_ratio_poster/public/p2561716440.jpg,风华绝代。
阿甘正传,9.5,1819919人评价,https://img2.doubanio.com/view/photo/s_ratio_poster/public/p2372307693.jpg,一部美国近现代史。