In this article we will use the urllib and re modules to crawl Baidu Tieba and store the results in three file formats. Here is the final result:
![](https://img7.php1.cn/3cdc5/f46b/5a0/fc5e5f3065adecf0.jpeg)
1. Page Analysis
(1) Preparation
First, open Baidu Tieba in the Chrome browser and type a keyword into the search box; this example uses "计算机吧" (the Computer forum).
![](https://img7.php1.cn/3cdc5/f46b/5a0/2d1eead344c45901.jpeg)
(2) Analyzing the URL pattern
Next, we analyze the site's URL pattern so that we can construct the URL of every listing page.
Page 1:
http://tieba.baidu.com/f?kw=%E8%AE%A1%E7%AE%97%E6%9C%BA&ie=utf-8&pn=0
Page 2:
http://tieba.baidu.com/f?kw=%E8%AE%A1%E7%AE%97%E6%9C%BA&ie=utf-8&pn=50
Page 3:
http://tieba.baidu.com/f?kw=%E8%AE%A1%E7%AE%97%E6%9C%BA&ie=utf-8&pn=100
...
A quick look shows that the URL follows a simple pattern. The main request parameters are:
- `kw`: the search keyword, URL-encoded; it can be generated with the `urllib.parse.quote()` method
- `ie`: the character encoding, set to utf-8
- `pn`: the offset of the current listing page, which grows in steps of 50
So the full URL can be generalized as:
http://tieba.baidu.com/f?kw={keyword}&ie=utf-8&pn={page}
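Before writing the crawler, we can sanity-check this template. Below is a minimal sketch of how such URLs could be generated with `urllib.parse.quote()`; the `build_url` helper name is just for illustration and is not part of the final script.

```python
import urllib.parse

# Hypothetical helper: build the listing URL for a keyword and a page offset
def build_url(keyword, page):
    # quote() percent-encodes the keyword so it is safe to put in the query string
    return ('http://tieba.baidu.com/f?kw=' + urllib.parse.quote(keyword)
            + '&ie=utf-8&pn=' + str(page))

# The first three listing pages of the "计算机" forum (pn grows in steps of 50)
for pn in (0, 50, 100):
    print(build_url('计算机', pn))
```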
![](https://img7.php1.cn/3cdc5/f46b/5a0/fd951cd0216fbbdc.png)
![](https://img7.php1.cn/3cdc5/f46b/5a0/94b342ebe36e1c74.jpeg)
The core code is as follows:
```python
import urllib.request
import urllib.parse

# Fetch the page source
def get_page(url):
    # Build the request headers
    headers = {'USER-AGENT': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'}
    # Build the request object
    req = urllib.request.Request(url=url, headers=headers)
    # Send the request and get the response
    response = urllib.request.urlopen(req)
    # Read and decode the page source
    html = response.read().decode('utf-8')
    # Return the page source
    return html
```
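In practice, a request can hang or fail. A hedged variant of the same fetch, with a timeout and basic error handling, might look like the sketch below; the 10-second timeout and the `None` fallback are assumptions, not part of the original code.

```python
import urllib.request
import urllib.error

# Sketch only: same fetch as get_page(), plus a timeout and error handling
def get_page_safe(url):
    headers = {'USER-AGENT': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'}
    req = urllib.request.Request(url=url, headers=headers)
    try:
        # The 10-second timeout is an arbitrary, assumed value
        response = urllib.request.urlopen(req, timeout=10)
        return response.read().decode('utf-8')
    except urllib.error.URLError as e:
        # Report the failure and let the caller decide to skip or retry
        print('request failed:', e)
        return None
```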
![](https://img7.php1.cn/3cdc5/f46b/5a0/2a5d75f524a36f8c.png)
(3) Analyzing the content pattern
Next, press Ctrl+U to open the page source and look carefully at the data we need to extract from each page.
It is easy to see that each post's details are wrapped in a tag of their own, so we can match them with regular expressions, specifically:
- Topic title: `r'href="/p/\d+" title="(.+?)"'`
- Topic author: `r'title="主题作者: (.+?)"'`
- Link address: `r'href="/p/(\d+)"'`
- Reply count: `r'title="回复">(\d+)<'`
- Creation date: `r'title="创建时间">(.+?)<'`
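As a quick check that the patterns capture what we expect, the sketch below runs them against a small hand-written snippet that only mimics the attributes the patterns look for; it is not real Tieba markup.

```python
import re

# Hand-written sample that mimics the attributes used in the patterns above
sample = ('<a href="/p/123456" title="第一个帖子">第一个帖子</a>'
          '<span title="主题作者: 张三"></span>'
          '<span title="回复">42</span>'
          '<span title="创建时间">2019-01</span>')

print(re.findall(r'href="/p/\d+" title="(.+?)"', sample))   # ['第一个帖子']
print(re.findall(r'title="主题作者: (.+?)"', sample))         # ['张三']
print(re.findall(r'href="/p/(\d+)"', sample))                # ['123456']
print(re.findall(r'title="回复">(\d+)<', sample))             # ['42']
print(re.findall(r'title="创建时间">(.+?)<', sample))          # ['2019-01']
```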
![](https://img7.php1.cn/3cdc5/f46b/5a0/c4351c0848433435.png)
The core code is as follows:
```python
import re

# Parse the page source and extract the data
def parse_page(html):
    # Topic titles
    titles = re.findall(r'href="/p/\d+" title="(.+?)"', html)
    # Topic authors
    authods = re.findall(r'title="主题作者: (.+?)"', html)
    # Link addresses
    nums = re.findall(r'href="/p/(\d+)"', html)
    links = ['http://tieba.baidu.com/p/' + str(num) for num in nums]
    # Reply counts
    focus = re.findall(r'title="回复">(\d+)', html)
    # Creation dates
    ctimes = re.findall(r'title="创建时间">(.+?)<', html)
    # Combine the columns into rows
    data = zip(titles, authods, links, focus, ctimes)
    # Return the result
    return data
```
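One caveat: `zip()` returns a one-shot iterator, so the result can only be traversed once. If you want to inspect the rows and then save them, a small, hedged adjustment is to materialize the iterator first, as in the usage sketch below (it assumes `get_page()` and `parse_page()` from the snippets above are in scope).

```python
# Usage sketch: fetch the first listing page of the "计算机" forum and
# materialize the parsed rows so they can be counted and then saved
url = 'http://tieba.baidu.com/f?kw=%E8%AE%A1%E7%AE%97%E6%9C%BA&ie=utf-8&pn=0'
html = get_page(url)
data = list(parse_page(html))   # list() keeps the rows reusable
print(len(data), 'posts parsed on this page')
```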
(4) Saving the data
Next, the data is saved as a txt, json, or csv file.
```python
import json
import csv

# Open the output file in the requested format
def openfile(fm, fileName):
    fd = None
    if fm == 'txt':
        fd = open(fileName + '.txt', 'w', encoding='utf-8')
    elif fm == 'json':
        fd = open(fileName + '.json', 'w', encoding='utf-8')
    elif fm == 'csv':
        fd = open(fileName + '.csv', 'w', encoding='utf-8', newline='')
    return fd

# Write the data to the file
def save2file(fm, fd, data):
    if fm == 'txt':
        for item in data:
            fd.write('----------------------------------------\n')
            fd.write('title：' + str(item[0]) + '\n')
            fd.write('authod：' + str(item[1]) + '\n')
            fd.write('link：' + str(item[2]) + '\n')
            fd.write('focus：' + str(item[3]) + '\n')
            fd.write('ctime：' + str(item[4]) + '\n')
    if fm == 'json':
        temp = ('title', 'authod', 'link', 'focus', 'ctime')
        for item in data:
            json.dump(dict(zip(temp, item)), fd, ensure_ascii=False)
    if fm == 'csv':
        writer = csv.writer(fd)
        for item in data:
            writer.writerow(item)
```
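Note that calling `json.dump()` once per item writes the objects back to back, so the .json file ends up as a stream of concatenated objects rather than one valid JSON document. If a well-formed array is preferred, one hedged alternative (not the author's original code) is to collect the rows and dump them in a single call:

```python
import json

# Sketch: write all rows as one valid JSON array instead of concatenated objects
def save2json_array(fd, data):
    temp = ('title', 'authod', 'link', 'focus', 'ctime')
    rows = [dict(zip(temp, item)) for item in data]
    json.dump(rows, fd, ensure_ascii=False, indent=2)
```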
2. Implementation
The complete code is below. It is quite simple, still under 100 lines.
```python
import urllib.request
import urllib.parse
import re
import json
import csv
import time
import random

# Fetch the page source
def get_page(url):
    headers = {'USER-AGENT': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'}
    req = urllib.request.Request(url=url, headers=headers)
    response = urllib.request.urlopen(req)
    html = response.read().decode('utf-8')
    return html

# Parse the page source and extract the data
def parse_page(html):
    titles = re.findall(r'href="/p/\d+" title="(.+?)"', html)
    authods = re.findall(r'title="主题作者: (.+?)"', html)
    nums = re.findall(r'href="/p/(\d+)"', html)
    links = ['http://tieba.baidu.com/p/' + str(num) for num in nums]
    focus = re.findall(r'title="回复">(\d+)', html)
    ctimes = re.findall(r'title="创建时间">(.+?)<', html)
    data = zip(titles, authods, links, focus, ctimes)
    return data

# Open the output file in the requested format
def openfile(fm, fileName):
    if fm == 'txt':
        return open(fileName + '.txt', 'w', encoding='utf-8')
    elif fm == 'json':
        return open(fileName + '.json', 'w', encoding='utf-8')
    elif fm == 'csv':
        return open(fileName + '.csv', 'w', encoding='utf-8', newline='')
    else:
        return None

# Write the data to the file
def save2file(fm, fd, data):
    if fm == 'txt':
        for item in data:
            fd.write('----------------------------------------\n')
            fd.write('title：' + str(item[0]) + '\n')
            fd.write('authod：' + str(item[1]) + '\n')
            fd.write('link：' + str(item[2]) + '\n')
            fd.write('focus：' + str(item[3]) + '\n')
            fd.write('ctime：' + str(item[4]) + '\n')
    if fm == 'json':
        temp = ('title', 'authod', 'link', 'focus', 'ctime')
        for item in data:
            json.dump(dict(zip(temp, item)), fd, ensure_ascii=False)
    if fm == 'csv':
        writer = csv.writer(fd)
        for item in data:
            writer.writerow(item)

# Crawl the forum page by page
def crawl():
    kw = input('请输入主题贴吧名字：')
    base_url = 'http://tieba.baidu.com/f?kw=' + urllib.parse.quote(kw) + '&ie=utf-8&pn={page}'
    fm = input('请输入文件保存格式（txt、json、csv）：')
    while fm != 'txt' and fm != 'json' and fm != 'csv':
        fm = input('输入错误，请重新输入文件保存格式（txt、json、csv）：')
    fd = openfile(fm, kw)
    page = 0
    total_page = int(re.findall(r'共有主题数(\d+)个', get_page(base_url.format(page=str(0))))[0])
    print('开始爬取')
    # Step through the listing pages, 50 topics at a time
    while page < total_page:
        html = get_page(base_url.format(page=str(page)))
        data = parse_page(html)
        save2file(fm, fd, data)
        page += 50
        # Pause briefly between requests
        time.sleep(random.random())
    fd.close()
    print('结束爬取')

if __name__ == '__main__':
    crawl()
```

![](https://img7.php1.cn/3cdc5/f46b/5a0/cda58013ba434b95.png)
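As a quick check on the paging arithmetic: the loop steps pn by 50 until it reaches the topic count parsed from the page header. For example, if the header reported 1234 topics, the crawler would visit pn = 0, 50, ..., 1200, that is, 25 listing pages (1234 is just an illustrative value).

```python
# Illustrative value only; in the script total_page is parsed from the page source
total_page = 1234
pages = list(range(0, total_page, 50))
print(len(pages), pages[:3], pages[-1])   # 25 [0, 50, 100] 1200
```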
![](https://img7.php1.cn/3cdc5/f46b/5a0/4dd2ec114a768f5e.png)
Please keep following me.
![](https://img7.php1.cn/3cdc5/f46b/5a0/86f0654411be3cae.png)
Remember to like and follow, and don't forget the chicken drumstick!