1、urllib python 内置的http的请求库
urllib.request #请求模块
www=urllib.request.urlopen(url,data,timeout)
www.read().decode('utf-8')
type(www)#响应类型
www.status#响应状态码
www.getheaders()#响应头
www.getheader('Server')
代理handler
handler=urllib.request.ProxyHandler({'http':'http://207.246.77.83:2010'})
COOKIE#维持网站登录状态
urllib.error#异常处理模块
try:
    需要捕获异常的代码
except 异常原因 as e:
    操作
    #print(e.reason)
    #if isinstance(e.reason,socket.timeout):#需先import socket
    #    print('连接超时')
异常类型
error.HTTPError
error.URLError
urllib.parse#解析模块
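urllib.parse.urlparse(url,scheme='',allow_fragments=True)#将url拆分为6个字段(scheme,netloc,path,params,query,fragment);scheme为url未带协议时使用的默认协议类型;allow_fragments=False时不单独解析fragment(锚点)
urllib.parse.urlunparse()#urlparse的逆操作,将6个字段的序列拼接为url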
urllib.parse.urljoin()#url拼接
urllib.parse.urljoin()
urllib.parse.urlencode()#将字典的键值对转化为url的查询字符串(如a=1&b=2)
urllib.robotparser#解析robots.txt文件
2、requests 第三方的http库(非python内置,需用pip单独安装)
pip3 install requests#安装requests库
url = 'http://www.365cmd.com/forum.php'
re=requests.get(url)
type(re)
re.status_code
re.cookies
re.headers
re.text
arms={
'mod':'viewthread',
'tid':'14807'
}
re1=requests.get(url,params=arms)#利用字典形式传递params参数构造网址http://www.365cmd.com/forum.php?mod=viewthread&tid=14807
解析json
re1.json()
获取二进制数据
re1.content
保存二进制数据
with open('文件名','wb') as file:#二进制数据需用'wb'模式写入
    file.write(re1.content)
#with语句结束时会自动关闭文件,无需再调用file.close()
打开windows文件C:\Users\Raytine\Desktop\111\123\1111222.txt
f=open(r'C:/Users/Raytine/Desktop/111/123/1111222.txt','r')
如果文件不在项目目录下,需写完整的路径;windows下的\代表转义字符,因此路径需使用原始字符串r'...'、双反斜杠\\或正斜杠/
添加header
header={
}
re2=requests.get(url,headers=header)
post方法
data={
...
}
header={
...
}
requests.post(url,data=data,headers=header)
response状态码判断
re.status_code
200成功
404 NOT FOUND
获取COOKIE
re.cookies
for key, value in re.cookies.items():
    string = '%s%s%s' % (key, ' = ', value)#字符串格式化
    print(string)
会话维持
requests.Session().get(url)#模拟在同一个浏览器操作
证书验证
爬取https网站url='https://www.12306.cn'
requests.get(url)
若浏览器检测证书不安全,response报错
过滤错误信息以及不检测证书
from requests.packages import urllib3
urllib3.disable_warnings()
requests.get(url,verify=False)
requests.get(url,cert=('crt','key'))#cert为(证书文件路径,私钥文件路径)的元组
代理设置
proxies={
'http':'http://207.246.77.83:2010',
'https':'http://user:passwd@207.246.77.83:2010'}
pip3 install requests[socks]# socks5代理安装
proxies={
'http':'socks5://207.246.77.83:2010',
'https':'socks5://user:passwd@207.246.77.83:2010'}
requests.get(url,proxies=proxies)
超时设置
try:
    requests.get(url,timeout=1)
except requests.exceptions.ReadTimeout:
    print('TIMEOUT')
认证设置
from requests.auth import HTTPBasicAuth
requests.get(url,auth=HTTPBasicAuth('USER','PASSWORD'))
requests.get(url,auth=('USER','PASSWORD'))
异常捕获处理
#需先 from requests.exceptions import ReadTimeout,HTTPError,ConnectionError,RequestException
try:
    requests.get(url)
except ReadTimeout:
    操作
    print('**异常')
except HTTPError:
    操作
    print('**异常')
except ConnectionError:
    操作
    print('**异常')
except RequestException:
    操作#捕获父类异常
    print('异常')