Special note: this article is original. It may be freely reproduced and quoted, but the author must be credited and the source cited. Please get in touch if there is any infringement.
Writing a Python crawler to batch-scrape and verify Kuaidaili free proxy addresses

import requests
import threading
from bs4 import BeautifulSoup
from queue import Queue, Empty
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
}
# Field separator used when joining proxy fields into one record
Separator = '|'
data_queue = Queue()  # thread-safe queue shared by the verification workers
def get_list(page):
    """Scrape `page` pages of the Kuaidaili free proxy list and queue every entry."""
    countNum = 0
    All_proxy = []
    for p in range(1, page + 1):
        url = 'https://www.kuaidaili.com/free/inha/{}/'.format(p)
        print(url)
        r = requests.get(url, headers=headers, timeout=10)
        soup = BeautifulSoup(r.text, 'lxml')
        trs = soup.find('table', class_='table').find_all('tr')
        for tr in trs[1:]:  # trs[0] is the header row
            tds = tr.find_all('td')
            ip = tds[0].text.strip()
            port = tds[1].text.strip()
            anony = tds[2].text.strip()     # anonymity level
            protocol = tds[3].text.strip()  # HTTP / HTTPS
            locate = tds[4].text.strip()    # location
            time = tds[6].text.strip()      # last-verified time (tds[5] is response speed)
            proxy_str = Separator.join([ip, port, protocol, anony, locate, time])
            data_queue.put(proxy_str)
            All_proxy.append(proxy_str)
            print(proxy_str)
            countNum += 1
    text_save('Alldaili_kuai.txt', All_proxy)
    return countNum
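As an aside, the row-parsing pattern in get_list is easier to see against a self-contained snippet. The HTML below is a made-up miniature of the proxy table (the real page's markup may differ, and free-proxy sites change their layout from time to time), purely to illustrate how find('table', class_='table') and find_all('tr') work together:

from bs4 import BeautifulSoup

# Hypothetical miniature of the proxy table, for illustration only
html = '''
<table class="table">
  <tr><th>IP</th><th>PORT</th></tr>
  <tr><td>1.2.3.4</td><td>8080</td></tr>
  <tr><td>5.6.7.8</td><td>3128</td></tr>
</table>
'''
soup = BeautifulSoup(html, 'lxml')
rows = soup.find('table', class_='table').find_all('tr')
for tr in rows[1:]:  # rows[0] is the header row
    ip, port = [td.text.strip() for td in tr.find_all('td')]
    print(ip, port)  # -> 1.2.3.4 8080, then 5.6.7.8 3128

If the site ever starts rendering the table with JavaScript, this static parse will come back empty; printing r.text and checking for the expected markup is a quick sanity test.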
def verifyProxyList(items):
    """Worker: pop proxies off the queue and test each one with a real request."""
    while True:
        print('Thread %s: running; %d entries still queued' % (threading.current_thread().name, data_queue.qsize()))
        try:
            # get_nowait() avoids the race between a separate empty() check
            # and get() when several workers drain the queue at once
            ll = data_queue.get_nowait()
        except Empty:
            break
        line = ll.strip().split(Separator)
        daili = '{}://{}:{}'.format(line[2].lower(), line[0], line[1])
        # the proxies key must match the scheme of the URL being requested,
        # otherwise requests ignores the proxy and connects directly
        if line[2].lower() == 'http':
            myurl = 'http://www.baidu.com/'
            proxies = {'http': daili}
        else:
            myurl = 'https://www.baidu.com/'
            proxies = {'https': daili}
        try:
            requests.get(url=myurl, headers=headers, proxies=proxies, timeout=5)
            items.append(ll + Separator + 'verified OK')
            print(daili + ' connected successfully!')
        except Exception:
            # items.append(ll + Separator + 'verification failed')
            print(daili + ' connection failed!')
    print('Thread %s: finished; %d entries still queued' % (threading.current_thread().name, data_queue.qsize()))
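One detail worth spelling out: in requests, the keys of the proxies dict are URL schemes, and a proxy is only applied when the requested URL's scheme matches one of those keys. That is why the verifier above picks the test URL to match the proxy type. A minimal sketch (the proxy address is a placeholder, so these calls will fail unless you substitute a live one):

import requests

proxy = 'http://1.2.3.4:8080'  # hypothetical proxy address; substitute a live one

# The proxy IS used here: the URL scheme (http) matches the dict key
requests.get('http://www.baidu.com/', proxies={'http': proxy}, timeout=5)

# The proxy is NOT used here: the URL scheme is https but only an
# 'http' key is supplied, so requests connects directly, and a dead
# proxy would still appear to "verify" successfully
requests.get('https://www.baidu.com/', proxies={'http': proxy}, timeout=5)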
def text_save(filename, data):
    """Append every item in `data` to the text file `filename`, one per line."""
    with open(filename, 'a', encoding='utf-8') as file:
        for item in data:
            # strip brackets, quotes and commas left over from list formatting;
            # optional, depending on the shape of your data
            s = str(item).replace('[', '').replace(']', '')
            s = s.replace("'", '').replace(',', '') + '\n'
            file.write(s)
    print('File saved successfully')
def main():
    page = int(input('How many pages do you want to scrape? '))
    if page > 10:
        page = 10  # cap at 10 pages
    countNum = get_list(page)
    print('Scraped %d pages, %d entries in total.' % (page, countNum))
    print('Now verifying every proxy, please wait...')
    all_thread = []
    verify_list = []
    for i in range(1, 31):  # 30 worker threads
        t = threading.Thread(target=verifyProxyList, name='Thread-' + str(i), args=(verify_list,))
        all_thread.append(t)
        t.start()
    for t in all_thread:
        t.join()
    text_save('verified_kuaidaili.txt', verify_list)
    print('Verification finished: %d proxies are valid.' % len(verify_list))
    print('All Done.')

if __name__ == '__main__':
    main()
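Once verified_kuaidaili.txt exists, its entries can be fed straight back into requests. Below is a minimal sketch of consuming that file; it assumes the field order written by the script (ip|port|protocol|...) and uses httpbin.org/ip, a public echo service, only to show which IP the target sees:

import requests

# Read the proxies saved by the script above
with open('verified_kuaidaili.txt', encoding='utf-8') as f:
    records = [line.strip().split('|') for line in f if line.strip()]

for ip, port, protocol, *rest in records:
    address = '{}://{}:{}'.format(protocol.lower(), ip, port)
    # offer the proxy for both schemes so it is used whatever the target URL is
    proxies = {'http': address, 'https': address}
    try:
        r = requests.get('http://httpbin.org/ip', proxies=proxies, timeout=5)
        print(address, '->', r.text.strip())
        break  # stop at the first proxy that still responds
    except Exception:
        continue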
Source: zyglz