xpath语法可参考
https://www.cnblogs.com/gaochsh/p/6757475.html
https://cuiqingcai.com/2621.html
此外,可以在Chrome中安装XPath插件来验证自己编写的爬虫XPath语句(过程略)
第一则是爬取豌豆荚下的应用类别,输入格式.xlsx(主要由包名组成,前缀一致)
代码如下:
# -*- coding: utf-8 -*-
import sys
reload(sys)
sys.setdefaultencoding('utf8')
import xlrd
import xlwt
import urllib2
from lxml import etree
# 读取文件,如上截图所示
data = xlrd.open_workbook('app0306.xlsx')
table = data.sheets()[0]
n_row = table.nrows
n_col = table.ncols
x = []
y = []
for i in range(n_row):
x.append(table.row_values(i)[0])
workbook = xlwt.Workbook(encoding='ascii')
worksheet = workbook.add_sheet('app')
count = 0
for xx in x:
url = xx
# 发起请求
req = urllib2.Request(url)
fd = urllib2.urlopen(req)
data = fd.read()
data = data.decode('utf-8')
# print(type(data))
print "on scanning ", count + 1
if '抱歉,该应用已下架' in data:
y.append('N0')
flag = 'NO'
print('NO')
else:
y.append('Yes')
flag = 'Yes'
print('Yes')
selector = etree.HTML(data)
zz = "http://www.wandoujia.com/apps/" # 删除网址豌豆荚前缀
l = len(zz)
if flag == 'Yes':
content1 = selector.xpath('//div[@class="app-info"]/p/span/text()') # 内容定位,应用名
# for i in content1:
# print i
content2 = selector.xpath('//div[@class="col-right"]/div/dl/dd/a/text()') # 内容定位,应用类别
# for j in content2:
# print j
worksheet.write(count, 0, url[l:]) # 1列
worksheet.write(count, 1, content1) # 2列
worksheet.write(count, 2, content2) # 2列
else:
worksheet.write(count, 0, url[l:]) # 1列
worksheet.write(count, 1, "NO") # 2列
count += 1
workbook.save('app.xls') # 输出路径自行定义
下一则是爬取谷歌应用类别,输入txt文件,最好设置停顿时间并使用代理以规避反爬虫
输入txt文件示例如下:
# -*- coding: UTF-8 -*- # -*- coding: utf-8 -*- """ Created on Sun Nov 5 11:03:06 2017 @author: Administrator """ #批量检查url有效性 import urllib2
from urllib2 import URLError
import xlwt
import datetime,time
import requests
from lxml import etree
result_url=[]
result = []
count=0
not_200=0
f=open("app0306.txt","r") # 域名或网址的txt文件 workbook = xlwt.Workbook(encoding='ascii')
worksheet = workbook.add_sheet('My workshet')
user_agent = 'Mozilla/5.0 (Windows NT 10.0; WOW64)' headers = { 'User-Agent' : user_agent }
for line in f:
count+=1
print "on scanning ",count
try:
# req = requests.request("get", "http://" + line) # print req.status_code req = urllib2.Request(line, headers = headers) # 网址 #req = urllib2.Request("http://" + line) #域名 respOnse= urllib2.urlopen(req)
data = response.read()
data = data.decode('utf-8')
except URLError, e:
if hasattr(e,'reason'): #stands for URLError print "can not reach a server,writing..." elif hasattr(e,'code'): #stands for HTTPError print "find http error, writing..." else: #stands for unknown error print "unknown error, writing..." not_200 += 1
# result_url.append(line) # result.append('NO') re = 'NO' time.sleep(1) # 休眠1秒 else:
#print "url is reachable!" #else 中不用再判断 response.code 是否等于200,若没有抛出异常,肯定返回200,直接关闭即可 #result.append('YES') print "Yes!" response.close()
time.sleep(1) # 休眠1秒 re = 'YES' finally:
pass if re == 'YES':
selector = etree.HTML(data)
content1 = selector.xpath('//div[@class="details-info"]/div/div/h1/div/text()')
# for i in content1: # print i content2 = selector.xpath('//div[@class="left-info"]/div/a/span[@itemprop="genre"]/text()')
# for j in content2: # print j worksheet.write(count-1, 0, line) # 1列 worksheet.write(count-1, 1, content1) # 2列 worksheet.write(count-1, 2, content2) # 2列 else:
worksheet.write(count-1, 0, line) # 1列 worksheet.write(count-1, 1, "NO") # 2列 # worksheet.write(count-1, 0, line) # 1列 # worksheet.write(count-1, 1, re) # 2列 workbook.save('appmonth.xls') # 路径自行定义
print "scanning over,total",count,"; did not response 200:",not_200
f.close()