#### Using XPath
import requests
import json
from lxml import etree
from urllib import parse
url = 'https://www.zhihu.com/explore'
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'}
html = requests.get(url, headers=headers).text
# The response body is a string; parse it into an HTML DOM tree
text = etree.HTML(html)
# Select every feed-item node on the page
node_list = text.xpath('//div[@class="explore-feed feed-item"]')
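# Sanity check (an addition, not in the original script): Zhihu's markup changes
# over time; if the class name no longer matches, the XPath silently returns [].
print('matched %d feed items' % len(node_list))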
for node in node_list:
    # xpath() returns a list; each query here matches a single element, so take index 0
    # Question title
    question = node.xpath('.//h2/a')[0].text.replace("\n", "")
    # Author
    author = node.xpath('.//*[@class="author-link-line"]/*')[0].text
    # author = "".join(node.xpath('.//*[@class="author-link-line"]//text()')).replace("\n", "")
    # Answer
    answer = node.xpath('.//*[@class="content"]')[0].text
    # answer = "".join(node.xpath('.//*[@class="content"]/text()')).strip()
    # answer = str(node.xpath('.//*[@class="content"]/text()'))[1:-1]
    items = {
        "question": question,
        "author": author,
        "answer": answer,
    }
    with open("explore.json", "a", encoding="utf-8") as f:
        # f.write(json.dumps(items, ensure_ascii=False).encode("utf-8") + "\n")  # Python 2 style
        f.write(json.dumps(items, ensure_ascii=False) + "\n")
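Since each item is written as one JSON object per line, the file can be read back line by line with json.loads. A minimal sketch, assuming explore.json was produced by the loop above:

import json

with open('explore.json', 'r', encoding='utf-8') as f:
    for line in f:
        # One JSON object per line, so parse each line independently
        item = json.loads(line)
        print(item['question'], '-', item['author'])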
#### Saving as TXT
import requests
from lxml import etree
from urllib import parse
url = 'https://www.zhihu.com/explore'
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'}
html = requests.get(url, headers=headers).text
# The response body is a string; parse it into an HTML DOM tree
text = etree.HTML(html)
# Select every feed-item node on the page
node_list = text.xpath('//div[@class="explore-feed feed-item"]')

for node in node_list:
    # xpath() returns a list; each query here matches a single element, so take index 0
    # Question title
    question = node.xpath('.//h2/a')[0].text.replace("\n", "")
    # Author
    author = node.xpath('.//*[@class="author-link-line"]/*')[0].text
    # author = "".join(node.xpath('.//*[@class="author-link-line"]//text()')).replace("\n", "")
    # Answer
    answer = node.xpath('.//*[@class="content"]')[0].text
    # answer = "".join(node.xpath('.//*[@class="content"]/text()')).strip()
    # answer = str(node.xpath('.//*[@class="content"]/text()'))[1:-1]
    with open('explore.txt', 'a', encoding='utf-8') as file:
        file.write('\n'.join([question, author, answer]))
        file.write('\n' + '=' * 50 + '\n')
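The commented-out alternatives above exist because an element's .text attribute only covers the text before its first child tag and is None for an empty element. A hedged sketch of a helper that joins all descendant text instead (extract_text is a hypothetical name, not part of the original script):

def extract_text(node, path):
    # Hypothetical helper: join every text fragment below the matched
    # element and trim surrounding whitespace
    return "".join(node.xpath(path + '//text()')).strip()

# e.g. author = extract_text(node, './/*[@class="author-link-line"]')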
#### Saving as CSV
import requests
from lxml import etree
from urllib import parse
import csv

url = 'https://www.zhihu.com/explore'
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'}
html = requests.get(url, headers=headers).text
# The response body is a string; parse it into an HTML DOM tree
text = etree.HTML(html)
# Select every feed-item node on the page
node_list = text.xpath('//div[@class="explore-feed feed-item"]')

# Open the file once so the header row is written a single time, not once per item;
# newline='' prevents the csv module from writing blank lines on Windows
with open('explore.csv', 'a', encoding='utf-8', newline='') as csvfile:
    fieldnames = ['question', 'author', 'answer']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    for node in node_list:
        # Question title
        question = node.xpath('.//h2/a')[0].text.replace("\n", "")
        # Author
        author = node.xpath('.//*[@class="author-link-line"]/*')[0].text
        # Answer: keep only the first 10 characters so the demo output stays short
        answer = node.xpath('.//*[@class="content"]')[0].text[:10]
        # answer = node.xpath('.//*[@class="content"]')[0].text
        # answer = "".join(node.xpath('.//*[@class="content"]/text()')).strip()
        writer.writerow({'question': question, 'author': author, 'answer': answer})
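One caveat with append mode: every run of the script adds another header row. A minimal guard, assuming the script may be re-run (the os.path.exists check is an addition, not in the original):

import os

# Only write the header when the file does not exist yet (hypothetical guard)
write_header = not os.path.exists('explore.csv')
with open('explore.csv', 'a', encoding='utf-8', newline='') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=['question', 'author', 'answer'])
    if write_header:
        writer.writeheader()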
#### Reading the CSV
import csv

with open('explore.csv', 'r', encoding='utf-8') as csvfile:
    reader = csv.reader(csvfile)
    for row in reader:
        print(row)
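Because the file was written with csv.DictWriter, csv.DictReader is a natural fit for reading it back: it consumes the header row and yields each record as a dict keyed by column name.

import csv

with open('explore.csv', 'r', encoding='utf-8') as csvfile:
    reader = csv.DictReader(csvfile)
    for row in reader:
        # Each row is a dict such as {'question': ..., 'author': ..., 'answer': ...}
        print(row['question'], row['author'])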
#### Saving to MongoDB
import requests
from lxml import etree
from urllib import parse
from pymongo import MongoClient

client = MongoClient()
db = client['explore']
collection = db['explore']

url = 'https://www.zhihu.com/explore'
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'}
html = requests.get(url, headers=headers).text
# The response body is a string; parse it into an HTML DOM tree
text = etree.HTML(html)
# Select every feed-item node on the page
node_list = text.xpath('//div[@class="explore-feed feed-item"]')

for node in node_list:
    # xpath() returns a list; each query here matches a single element, so take index 0
    # Question title
    question = node.xpath('.//h2/a')[0].text.replace("\n", "")
    # Author
    author = node.xpath('.//*[@class="author-link-line"]/*')[0].text
    # author = "".join(node.xpath('.//*[@class="author-link-line"]//text()')).replace("\n", "")
    # Answer
    answer = node.xpath('.//*[@class="content"]')[0].text
    # answer = "".join(node.xpath('.//*[@class="content"]/text()')).strip()
    # answer = str(node.xpath('.//*[@class="content"]/text()'))[1:-1]
    items = {
        "question": question,
        "author": author,
        "answer": answer,
    }
    # insert_one() replaces collection.insert(), which was removed in PyMongo 3+
    if collection.insert_one(items):
        print('Saved to Mongo')
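To verify the inserts, the documents can be queried back with find(); a minimal sketch against the same explore collection:

# Query the first few documents back, hiding MongoDB's internal _id field
for item in collection.find({}, {'_id': 0}).limit(5):
    print(item)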