这篇文章主要介绍如何使用Python3制作一个带GUI界面的小说爬虫工具,文中介绍的非常详细,具有一定的参考价值,感兴趣的小伙伴们一定要看完!
最近帮朋友写个简单爬虫,顺便整理了下,搞成了一个带GUI界面的小说爬虫工具,用来从笔趣阁爬取小说。
1.多线程采集,一个线程采集一本小说
2.支持使用代理,尤其是多线程采集时,不使用代理可能封ip
3.实时输出采集结果
使用 threading.BoundedSemaphore() pool_sema.acquire() pool_sema.release()
来限制线程数量,防止并发线程过多。具体限制数量,可在软件界面输入,默认5个线程
# 所有线程任务开始前 pool_sema = threading.BoundedSemaphore(5) # 具体每个线程开始前 锁 pool_sema.acquire() .... # 线程任务执行结束释放 pool_sema.release()
pip install requests pip install pysimplegui pip install lxml pip install pyinstaller
GUI 界面使用了一个tkinter 的封装库 PySimpleGUI
, 使用非常方便,虽然界面不够漂亮,但胜在简单,非常适合开发些小工具。https://pysimplegui.readthedocs.io/en/latest/比如这个界面的布局,只需简单几个 list
layout = [ [sg.Text('输入要爬取的小说网址,点此打开笔趣阁站点复制', fOnt=("微软雅黑", 12), key="openwebsite", enable_events=True, tooltip="点击在浏览器中打开")], [sg.Text("小说目录页url,一行一个:")], [ sg.Multiline('', key="url", size=(120, 6), autoscroll=True, expand_x=True, right_click_menu=['&Right', ['粘贴']] ) ], [sg.Text(visible=False, text_color="#ff0000", key="error")], [ sg.Button(button_text='开始采集', key="start", size=(20, 1)), sg.Button(button_text='打开下载目录', key="opendir", size=(20, 1), button_color="#999999") ], [sg.Text('填写ip代理,有密码格式 用户名:密码@ip:端口,无密码格式 ip:端口。如 demo:123456@123.1.2.8:8580')], [ sg.Input('', key="proxy"), sg.Text('线程数量:'), sg.Input('5', key="threadnum"), ], [ sg.Multiline('等待采集', key="res", disabled=True, border_width=0, background_color="#ffffff", size=( 120, 6), no_scrollbar=False, autoscroll=True, expand_x=True, expand_y=True, fOnt=("宋体", 10), text_color="#999999") ], ]
pyinstaller -Fw start.py
import time
import requests
import os
import sys
import re
import random
from lxml import etree
import webbrowser
import PySimpleGUI as sg
import threading

# Default request headers: a desktop-browser user-agent so the site serves
# the normal HTML pages.
header = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.99 Safari/537.36"
}
# Proxy mapping shared by all worker threads; populated from the GUI input.
proxies = {}
# Base URL of the target site (笔趣阁 mirror).
baseurl = 'https://www.xbiquwx.la/'
# Number of concurrent download threads; overridden from the GUI input.
threadNum = 6
# Bounded semaphore limiting concurrent downloads; created in main().
pool_sema = None
# Custom event key used by worker threads to report progress to the GUI.
THREAD_EVENT = '-THREAD-'
# Global run/stop flag toggled by the start/stop button.
cjstatus = False
# Directory where the resulting .txt files are stored.
filePath = os.path.abspath(os.path.join(os.getcwd(), 'txt'))
if not os.path.exists(filePath):
    os.mkdir(filePath)


def deletetag(text):
    """Strip characters that are illegal or awkward in Windows file names."""
    return re.sub(r'[\[\]#\/\\:*\,;\?\"\'<>\|\(\)《》&\^!~=%\{\}@!:。·!¥……() ]', '', text)


def main():
    """Build the GUI window and run the event loop until the user closes it."""
    global cjstatus, proxies, threadNum, pool_sema
    sg.theme("reddit")
    # BUG FIX: the original passed the misspelled keyword `fOnt` to sg.Text /
    # sg.Multiline; the intended keyword is `font`.
    layout = [
        [sg.Text('输入要爬取的小说网址,点此打开笔趣阁站点复制', font=("微软雅黑", 12),
                 key="openwebsite", enable_events=True, tooltip="点击在浏览器中打开")],
        [sg.Text("小说目录页url,一行一个:")],
        [
            sg.Multiline('', key="url", size=(120, 6), autoscroll=True,
                         expand_x=True, right_click_menu=['&Right', ['粘贴']])
        ],
        [sg.Text(visible=False, text_color="#ff0000", key="error")],
        [
            sg.Button(button_text='开始采集', key="start", size=(20, 1)),
            sg.Button(button_text='打开下载目录', key="opendir",
                      size=(20, 1), button_color="#999999")
        ],
        [sg.Text('填写ip代理,有密码格式 用户名:密码@ip:端口,无密码格式 ip:端口。如 demo:123456@123.1.2.8:8580')],
        [
            sg.Input('', key="proxy"),
            sg.Text('线程数量:'),
            sg.Input('5', key="threadnum"),
        ],
        [
            sg.Multiline('等待采集', key="res", disabled=True, border_width=0,
                         background_color="#ffffff", size=(120, 6),
                         no_scrollbar=False, autoscroll=True, expand_x=True,
                         expand_y=True, font=("宋体", 10), text_color="#999999")
        ],
    ]
    window = sg.Window('采集笔趣阁小说', layout, size=(800, 500), resizable=True,)

    while True:
        event, values = window.read()
        if event == sg.WIN_CLOSED or event == 'close':  # user closed the window
            break
        if event == "openwebsite":
            webbrowser.open('%s' % baseurl)
        elif event == 'opendir':
            # Windows-only: open the download folder in Explorer.
            os.system('start explorer ' + filePath)
        elif event == 'start':
            if cjstatus:
                # Toggle off: workers poll cjstatus and stop on their own.
                cjstatus = False
                window['start'].update('已停止...点击重新开始')
                continue
            window['error'].update("", visible=False)
            # BUG FIX: the original used `del urls[k]` while iterating `urls`,
            # which skips entries; build a filtered list instead.
            urls = []
            for url in values['url'].strip().split("\n"):
                url = url.strip()
                if not url:
                    continue
                if re.match(r'%s\d+_\d+/' % baseurl, url):
                    urls.append(url)
                else:
                    window['error'].update("地址错误:%s" % url, visible=True)
            if len(urls) < 1:
                # BUG FIX: the original referenced the undefined name `baseurlr`.
                window['error'].update(
                    "每行地址需符合 %s84_84370/ 形式" % baseurl, visible=True)
                continue
            # Proxy: same endpoint for http and https traffic.
            if len(values['proxy']) > 8:
                proxies = {
                    "http": "http://%s" % values['proxy'],
                    "https": "http://%s" % values['proxy']
                }
            # Thread count from the GUI (defaults to the module-level value).
            if values['threadnum'] and int(values['threadnum']) > 0:
                threadNum = int(values['threadnum'])
            pool_sema = threading.BoundedSemaphore(threadNum)
            cjstatus = True
            window['start'].update('采集中...点击停止')
            window['res'].update('开始采集')
            # One daemon thread per book; the semaphore caps real concurrency.
            for url in urls:
                threading.Thread(target=downloadbybook,
                                 args=(url, window,), daemon=True).start()
        elif event == "粘贴":
            window['url'].update(sg.clipboard_get())
        print("event", event)
        if event == THREAD_EVENT:
            # Progress message from a worker thread: append it to the log box.
            strtext = values[THREAD_EVENT][1]
            window['res'].update(window['res'].get() + "\n" + strtext)
    cjstatus = False
    window.close()


def downloadbybook(page_url, window):
    """Download one whole book (index page plus every chapter) to a txt file.

    Runs in a worker thread. Progress and errors are reported back to the
    GUI thread via window.write_event_value; the bounded semaphore limits
    how many books download at once.
    """
    # BUG FIX: acquire before any network work (the original fetched the
    # index page first, so the thread limit did not cover that request),
    # and release in a finally block (the original leaked the semaphore on
    # every error return, eventually deadlocking all workers).
    pool_sema.acquire()
    try:
        try:
            bookpage = requests.get(url=page_url, headers=header, proxies=proxies)
        except Exception as e:
            window.write_event_value(
                '-THREAD-', (threading.current_thread().name,
                             '\n请求 %s 错误,原因:%s' % (page_url, e)))
            return
        if not cjstatus:
            return
        if bookpage.status_code != 200:
            # BUG FIX: the original referenced the undefined name `page` here.
            window.write_event_value(
                '-THREAD-', (threading.current_thread().name,
                             '\n请求%s错误,原因:%s' % (page_url, bookpage.reason)))
            return
        bookpage.encoding = 'utf-8'
        page_tree = etree.HTML(bookpage.text)
        bookname = page_tree.xpath('//div[@id="info"]/h2/text()')[0]
        bookfilename = filePath + '/' + deletetag(bookname) + '.txt'
        zj_list = page_tree.xpath(
            '//div[@class="box_con"]/div[@id="list"]/dl/dd')
        for dd in zj_list:
            if not cjstatus:
                break
            zjurl = page_url + dd.xpath('./a/@href')[0]
            zjname = dd.xpath('./a/@title')[0]
            try:
                zjpage = requests.get(zjurl, headers=header, proxies=proxies)
            except Exception as e:
                # BUG FIX: the original referenced `zjpage.reason`, which is
                # undefined when the request itself raised; report `e` instead.
                window.write_event_value(
                    '-THREAD-', (threading.current_thread().name,
                                 '\n请求%s:%s错误,原因:%s' % (zjname, zjurl, e)))
                continue
            if zjpage.status_code != 200:
                window.write_event_value(
                    '-THREAD-', (threading.current_thread().name,
                                 '\n请求%s:%s错误,原因:%s' % (zjname, zjurl, zjpage.reason)))
                return
            zjpage.encoding = 'utf-8'
            zjpage_content = etree.HTML(zjpage.text).xpath(
                '//div[@id="content"]/text()')
            content = "\n【" + zjname + "】\n"
            for paragraph in zjpage_content:
                content += paragraph.strip() + '\n'
            # Append so chapters accumulate into one file per book.
            with open(bookfilename, 'a+', encoding='utf-8') as fs:
                fs.write(content)
            window.write_event_value(
                '-THREAD-', (threading.current_thread().name,
                             '\n%s:%s 采集成功' % (bookname, zjname)))
            # Small random delay between chapters to avoid hammering the site.
            time.sleep(random.uniform(0.05, 0.2))
        window.write_event_value(
            '-THREAD-', (threading.current_thread().name,
                         '\n请求 %s 结束' % page_url))
    finally:
        pool_sema.release()


if __name__ == '__main__':
    main()
以上是“如何使用Python3制作一个带GUI界面的小说爬虫工具”这篇文章的所有内容,感谢各位的阅读!希望分享的内容对大家有帮助,更多相关知识,欢迎关注编程笔记行业资讯频道!