应该算是我这个新人在吾爱的首贴了
前言
由于是py新人,所以代码可能不够简练,都是想到哪写到哪,但是看起来可能不会太吃力,应该还是比较好理解的
大神勿喷!
食用方法
1.首先需要python环境,这个网上教程也很多,我就不说明了
2.仅python 3的版本,3以下版本无法运行
3.不需要安装第三方库,一切操作基于标准库完成
爬取的对象:第三方小说网站:顶点小说网
以小说:修真聊天群 为例
首先打开修真聊天群章节目录,将目录的网址 http://www.booktxt.net/1_1439/ 复制后输入到命令行中,然后自动获取所有章节列表
然后就静静等待结束即可
后面有图，因为图片外链可能会失效……所以当作附件上传了
源码
#!/usr/bin/python
# -*- coding: UTF-8 -*-
import urllib.request
import re
import os
import time
import threading
import shutil
# --- Scrape patterns ------------------------------------------------------
# NOTE(review): the forum post was HTML-escaped and the tags inside these
# regex literals were stripped; the patterns below are reconstructed for the
# booktxt.net page layout — confirm against the live HTML before relying
# on them.
txt_content_partern = r'<div id="content">(.*?)</div>'          # chapter body
txt_name_partern = r'<h1>(.*?)</h1>'                            # novel title
catalog_partern = r'<dd><a href="(\d+)\.html">(.*?)</a></dd>'   # (id, title)

# Shared worker state (mutated by down_mul / main).
flag = -1        # index of the last chunk of chapters claimed by a worker
max_len = 0      # total number of chapters; set in main()
atalog = []      # list of (chapter-id, chapter-title) tuples; set in main()
# chapters handled per work unit
txt_max = 20
# number of worker threads
max_thread = 20
thread_stop = 0  # count of workers that have finished
# time.clock() was removed in Python 3.8; perf_counter() is the replacement.
start_time = time.perf_counter()

headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Language': 'zh-CN,zh;q=0.8',
    'Cache-Control': 'max-age=0',
    'Proxy-Connection': 'keep-alive',
    # The Host header must be a bare host name — the original included the
    # "http://" scheme, which is not a valid Host value.
    'Host': 'www.booktxt.net',
    'Referer': 'https://www.google.com.hk/',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',
}
def down_txt(url, txtname, filename):
    """Download one chapter page and append it to *filename*.

    url      -- full chapter URL
    txtname  -- chapter title, written as a heading before the body
    filename -- chunk file the chapter text is appended to

    Retries up to 10 times on any failure; after the 10th failure it gives
    up with a message (the caller runs its own retry loop as well).
    """
    for attempt in range(10):
        try:
            html_data = urllib.request.urlopen(url).read().decode('gbk')
            content = re.findall(txt_content_partern, html_data, re.S | re.M)
            # Open only after the download succeeded, and let `with` close
            # the handle; the original opened before the retry loop and
            # leaked the handle when every attempt failed.
            # NOTE(review): encoding is the locale default, matching the
            # default-encoding reader in main()'s merge step.
            with open(filename, "a") as fo:
                fo.write("\r\n" + txtname + "\r\n")
                # Strip the page's formatting artefacts. NOTE(review): the
                # exact tags were lost when this post was HTML-escaped; the
                # replacements below are reconstructed — confirm against
                # the live page markup.
                fo.write(content[0]
                         .replace("&nbsp;", "")
                         .replace("<br />", "")
                         .replace("\r\n\r\n", "\r\n"))
            return
        except Exception:
            if attempt == 9:
                print("请求失败次数过多，请重新下载")
            print("请求失败，正在重试...")
            time.sleep(0.5)
def down_mul(url, cnt, file_path, _claim_lock=threading.Lock()):
    """Worker thread body: repeatedly claim a chunk of `txt_max` chapters
    and download each one into a per-chunk file inside *file_path*.

    url         -- catalog base URL; chapter pages are url + <id> + ".html"
    cnt         -- worker number, used only in progress messages
    file_path   -- working directory receiving the "<star+1>.txt" chunk files
    _claim_lock -- deliberately shared mutable default: ONE lock for all
                   workers, guarding the shared chunk counter `flag`
                   (the original incremented it with no synchronization).
    """
    global flag, max_len, atalog, txt_max, thread_stop
    got_work = False
    while True:
        # Atomically claim the next chunk; without the lock two threads can
        # read the same `flag` value and download the same chapters twice.
        with _claim_lock:
            if flag * txt_max >= max_len:
                break
            flag += 1
            star = flag * txt_max
        end = min(star + txt_max, max_len)
        if star >= end:
            continue  # claimed an empty tail chunk; loop re-checks and exits
        got_work = True
        print("正在抓取章节" + str(star) + '-' + str(end) + '...')
        # os.path.join instead of the original hard-coded '\\' (Windows-only).
        out_file = os.path.join(file_path, str(star + 1) + '.txt')
        for idx in range(star, end):
            for attempt in range(10):
                try:
                    down_txt(url + atalog[idx][0] + ".html",
                             atalog[idx][1], out_file)
                    break
                except Exception:
                    # The original compared the OUTER index `i` to 9 here,
                    # so the give-up message fired on the wrong condition.
                    if attempt == 9:
                        print("请求失败次数过多，请重新下载")
                    print("请求失败，正在重试...")
                    time.sleep(0.5)
    with _claim_lock:
        # `thread_stop` is polled by main()'s wait loop; guard the increment.
        thread_stop += 1
    if got_work:
        print("线程[" + str(cnt) + "]运行完毕...")
    else:
        print("线程[" + str(cnt) + "]未获取到任务...")
def main():
    """Ask for a catalog URL, scrape the chapter list, fan out worker
    threads, then merge the per-chunk files into a single "<title>.txt".
    """
    global atalog, max_len, thread_stop, max_thread, start_time
    url_1 = input("请输入需要下载的小说目录地址,仅限顶点小说网[www.booktxt.net]：")
    print('正在抓取目录章节...')
    txt_name = None
    for attempt in range(10):
        try:
            html_data = urllib.request.urlopen(url_1).read().decode('gbk')
            txt_name = re.compile(txt_name_partern).findall(html_data)
            print('小说名称：' + txt_name[0])
            atalog = re.compile(catalog_partern).findall(html_data)
            print('章节目录抓取完毕...总章节数：' + str(len(atalog)))
            break
        except Exception:
            if attempt == 9:
                print("请求失败次数过多，请重新下载")
            print("请求失败，正在重试...")
            time.sleep(0.5)
    if not txt_name:
        # All 10 attempts failed; the original fell through and crashed
        # with a NameError on `txt_name` here.
        return
    # Working directory named after the novel; clear out any leftovers.
    files = txt_name[0]
    if not os.path.exists(files):
        os.mkdir(files)
    else:
        for leftover in os.listdir(files):
            os.remove(os.path.join(files, leftover))
    max_len = len(atalog)
    for x in range(max_thread):
        t = threading.Thread(target=down_mul, args=(url_1, x + 1, files))
        print('线程[' + str(x + 1) + ']Runing Star')
        t.start()
    # Busy-wait until every worker has bumped thread_stop.
    while thread_stop != max_thread:
        print("正在抓取...请稍后...剩余线程数:" + str(max_thread - thread_stop))
        time.sleep(5)
    print("等待合并章节...")
    filenames = os.listdir(files)
    # Sort numerically by chunk number; the original key=len only grouped
    # names by digit count and relied on listdir order within each group.
    filenames.sort(key=lambda fn: int(os.path.splitext(fn)[0]))
    print(filenames)
    with open(txt_name[0] + '.txt', "w") as fo:
        for fname in filenames:
            with open(os.path.join(files, fname)) as chunk:
                fo.write(chunk.read())
    print("合并章节完成...等待删除工作目录...")
    shutil.rmtree(files)
    # time.clock() was removed in Python 3.8.
    times = time.perf_counter() - start_time
    h = int(times) // 3600
    m = int(times) % 3600 // 60
    s = int(times) % 60
    print("小说下载完成，总共消耗时间：", h, "小时", m, '分钟', s, '秒')
    input()  # keep the console window open until the user presses Enter
if __name__ == '__main__':
    # Install a global opener so every urllib.request.urlopen() call in the
    # script automatically carries the headers defined above.
    opener = urllib.request.build_opener()
    opener.addheaders = list(headers.items())
    urllib.request.install_opener(opener)
    main()