作者:JHH先森 | 来源:互联网 | 2023-05-24 09:39
通过 Python 3 进程池实现多进程爬虫。以下是爬虫的简单实现(斗鱼主播小姐姐、暴走漫画主页)及源码,从 main.py(主函数)开始。
通过python3进程池实现多进程爬虫,以下是爬虫简单实现
斗鱼主播小姐姐
暴走漫画主页
源码
main.py(主函数)
# _*_ coding: utf-8 _*_
__author__ = 'zhaozhao'
import deal_html
import coreData
import download_images
import re
import moreProcess
import managerDir
def _folder_name_from_url(url):
    """Return the download-folder name derived from ``url``, or None.

    The name is taken from the host only, so dots in the path or query can
    never leak into it. For hosts with three or more labels the second label
    is used ("www.douyu.com" -> "douyu"); for shorter hosts the first label
    is used ("example.com" -> "example").
    """
    from urllib.parse import urlsplit

    try:
        host = urlsplit(url).hostname
    except ValueError:
        # urlsplit rejects some malformed netlocs (e.g. bad IPv6 brackets)
        return None
    if not host:
        # No scheme/"//" in the input, so no host could be identified
        return None
    labels = [label for label in host.split(".") if label]
    if not labels:
        return None
    return labels[1] if len(labels) >= 3 else labels[0]


def main():
    """Prompt for a page URL, then download every image found on that page.

    Steps: derive the target folder name from the URL and store it in
    ``coreData.folder_name``, create the folder, collect the image URLs into
    ``coreData.all_images_addr_list`` and hand over to ``moreProcess.main()``.
    If no folder name can be derived from the URL, a message is printed and
    nothing is downloaded.
    """
    dest_url = input("请输入需要下载图片的网址:").strip()

    # Name of the directory that will hold the downloaded images
    url_file_name = _folder_name_from_url(dest_url)
    if not url_file_name:
        print("无法从网址中解析出目录名称:%s" % dest_url)
        return
    coreData.folder_name = url_file_name
    print(url_file_name)

    # Create the download directory
    managerDir.create_folder()

    # Collect the list of image URLs to download
    coreData.all_images_addr_list = deal_html.main(dest_url)

    # Start the download tasks
    moreProcess.main()


if __name__ == '__main__':
    main()
managerDir.py(管理下载图片的目录)
import os
import coreData
def create_folder():
    """Create the directory named by ``coreData.folder_name`` if it is missing.

    The directory is created directly and an "already exists" error is
    ignored, instead of checking first and creating afterwards; this avoids
    failing when something creates the path between the check and the mkdir.
    """
    try:
        os.mkdir(coreData.folder_name)
    except FileExistsError:
        # Path is already there: nothing to do
        pass
moreProcess.py(开启多进程)
import os
import coreData
# NOTE(review): this listing is labelled moreProcess.py, but its content is
# identical to managerDir.py. main.py calls moreProcess.main(), which is not
# defined here - the real process-pool code appears to be missing from the
# article and needs to be restored from the original project.
def create_folder():
    """Create the directory named by ``coreData.folder_name`` if it is missing.

    The directory is created directly and an "already exists" error is
    ignored, instead of checking first and creating afterwards; this avoids
    failing when something creates the path between the check and the mkdir.
    """
    try:
        os.mkdir(coreData.folder_name)
    except FileExistsError:
        # Path is already there: nothing to do
        pass
download_images.py(图片下载器)
import re
import urllib.request
import time
import os
import coreData
def core_download(image_addr):
    """Download one image and save it inside ``coreData.folder_name``.

    The file is named ``<time.ctime() timestamp>_<original file name>``.
    Addresses that do not contain a ``.jpg``/``.JPG``/``.png``/``.PNG`` file
    name are reported and skipped. Download and write errors are printed
    rather than raised, so one bad image does not abort the remaining ones.

    Args:
        image_addr: URL of the image to download.

    Returns:
        None.
    """
    print("当前图片的地址为%s"%image_addr)
    image_name_whole = re.search(r".*?([^/]*\.(jpg|JPG|png|PNG))", image_addr)
    if image_name_whole is None:
        # Without a recognisable file name there is nothing to save the data as
        print("无法从地址中解析出图片文件名:%s" % image_addr)
        return

    # NOTE(review): time.ctime() output contains ':' characters, which are not
    # allowed in Windows file names - confirm the target platform.
    image_name = str(time.ctime(time.time())) + "_" + image_name_whole.group(1)
    print(image_name)

    try:
        # Context managers close the HTTP response and the file even on error
        with urllib.request.urlopen(image_addr) as response:
            image_data = response.read()
        with open(os.path.join(coreData.folder_name, image_name), "wb") as f:
            f.write(image_data)
    except Exception as erro:
        # Deliberately best-effort: report the failure and carry on
        print(erro)
deal_html.py(处理html,提取图片地址)
import re
import urllib.request
import coreData
def main(download_url):
    """Fetch ``download_url`` and return the image URLs found in its HTML.

    Every ``http...://....(jpg|JPG|png|PNG)`` address in the page source is
    collected. The result is also stored in ``coreData.all_images_addr_list``.
    If the page cannot be loaded, a failure message is printed and an empty
    list is returned and stored.

    Args:
        download_url: address of the HTML page to scan.

    Returns:
        List of image URL strings, in the order they appear in the page.
    """
    img_addr = []
    try:
        # Download the HTML; the context manager closes the response
        with urllib.request.urlopen(download_url) as response:
            raw_html = response.read()
    except (OSError, ValueError) as error:
        # urlopen reports network/HTTP failures as URLError (an OSError) and
        # malformed URLs as ValueError; it does not return a falsy value.
        print("html加载失败")
        print(error)
    else:
        # errors="replace" keeps non-UTF-8 pages from aborting the run; the
        # URLs being searched for are ASCII, so matching is unaffected.
        source_html = raw_html.decode("utf-8", errors="replace")
        print(source_html)
        # findall yields (full_url, extension) tuples; keep only the URL
        img_addr = [
            found[0]
            for found in re.findall(
                r"(http\w?://[^\"]*?\.(jpg|JPG|png|PNG))", source_html
            )
        ]

    print(img_addr)
    coreData.all_images_addr_list = img_addr
    return img_addr


if __name__ == '__main__':
    # main() requires the page URL, so ask for it when run as a script
    main(input("请输入需要下载图片的网址:"))
coreData.py(共享变量)
# Image URLs collected for the current run. Starts as None; main.py and
# deal_html.main() assign the list of extracted image addresses to it.
all_images_addr_list = None
# Name of the directory downloaded images are written to. Starts as None;
# main.py sets it, managerDir.create_folder() and download_images read it.
folder_name = None