运行系统:macOS
Python版本:Python 3.5
主要模块:Requests、BeautifulSoup(bs4,使用 lxml 解析器)
下载网站Unsplash中的图片:
程序结构图如下:
代码如下:
import requests
import os
from bs4 import BeautifulSoup
class Picture():
    """Download the images linked from the Unsplash home page.

    The home page is fetched, every ``<a class="cV68d">`` tag is inspected,
    the image URL is pulled out of the tag's inline ``style`` attribute and
    the image is saved into a ``pictures`` directory next to this script.
    """

    # Seconds to wait on a request before giving up; without a timeout
    # ``requests`` can block forever on a stalled connection.
    default_timeout = 10

    def __init__(self):
        # Send a Chrome-like User-Agent with every request.
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.95 Safari/537.36'}
        self.web_url = 'https://unsplash.com/'
        # __file__ can be a relative path on older interpreters, in which
        # case dirname() alone would yield ''; make it absolute first.
        self.base_path = os.path.dirname(os.path.abspath(__file__))

    def makedir(self, name):
        """Ensure ``<base_path>/<name>`` exists and make it the working dir.

        Args:
            name: Directory name, relative to ``self.base_path``.

        Returns:
            The full path of the directory.
        """
        path = os.path.join(self.base_path, name)
        if not os.path.exists(path):
            os.makedirs(path)
            print('File has been created.')
        else:
            print('The file is existed.')
        # Kept for backward compatibility: callers may rely on the working
        # directory being switched to the new folder.
        os.chdir(path)
        return path

    def request(self, url, timeout=None):
        """GET ``url`` with the browser-like headers.

        Args:
            url: Address to fetch.
            timeout: Seconds to wait; defaults to ``self.default_timeout``.

        Returns:
            The ``requests.Response``; the HTTP status is NOT checked here.
        """
        if timeout is None:
            timeout = self.default_timeout
        return requests.get(url, headers=self.headers, timeout=timeout)

    @staticmethod
    def _extract_img_url(style):
        """Return the text between the first pair of double quotes in
        ``style``, or ``None`` when there is no such non-empty text."""
        start = style.find('"')
        if start == -1:
            return None
        end = style.find('"', start + 1)
        if end == -1:
            return None
        return style[start + 1:end] or None

    @staticmethod
    def _file_name_from_url(img_url):
        """Build ``photo...jpg`` from ``img_url``, or ``None`` if the URL
        (ignoring its query string) does not contain ``photo``."""
        # Drop the query string first so a 'photo' inside it is not matched
        # and a URL without '?' is still handled.
        without_query = img_url.split('?', 1)[0]
        start = without_query.find('photo')
        if start == -1:
            return None
        # A '/' in the name would be treated as a sub-directory by open().
        return without_query[start:].replace('/', '_') + '.jpg'

    def get_img(self):
        """Fetch the home page and save every image it links to.

        Anchors without a usable style/URL, and images whose download
        fails, are skipped so one bad entry does not abort the run.

        Raises:
            requests.RequestException: If the home page cannot be fetched.
        """
        r = self.request(self.web_url)
        r.raise_for_status()
        # find_all returns a list of every <a> tag with class 'cV68d'.
        all_a = BeautifulSoup(r.text, 'lxml').find_all('a', class_='cV68d')
        target_dir = self.makedir('pictures')
        for a in all_a:
            # The image URL sits between double quotes in the style string.
            img_url = self._extract_img_url(a.get('style', ''))
            if not img_url:
                continue
            file_name = self._file_name_from_url(img_url)
            if not file_name:
                continue
            # Download before opening the file so a failed request does not
            # leave an empty file behind.
            try:
                img = self.request(img_url)
                img.raise_for_status()
            except requests.RequestException as err:
                print('Skipping {}: {}'.format(img_url, err))
                continue
            # 'wb' rather than 'ab': appending on a re-run would corrupt an
            # already-downloaded image.
            with open(os.path.join(target_dir, file_name), 'wb') as f:
                f.write(img.content)
if __name__ == '__main__':
    # Script entry point: build the downloader and fetch the images.
    downloader = Picture()
    downloader.get_img()
下载图片