作者:手机用户2502897397 | 来源:互联网 | 2023-09-09 10:54
import urllib import socket import urllib2 import time from bs4 import BeautifulSoup url = 'http://www.xi…
import urllib
import socket
import urllib2
import time
from bs4 import BeautifulSoup
# Crawl the paginated proxy listing and record two table columns per row
# (presumably IP and port -- confirm against the page markup) both in the
# in-memory dict `dirt` and, one "a:b" pair per line, in proxy.txt.

# Base URL of the listing; the page number is appended for each request.
url = 'http://www.xicidaili.com/nn/'
target = "https://msdn.microsoft.com"  # not referenced in this part of the script
dirt = {}  # text of the 2nd <td> -> text of the 3rd <td> for every scraped row

# Route every urllib2 request made through urlopen() via this HTTP proxy.
proxy = {'http': '223.15.151.149:8888'}
proxy_support = urllib2.ProxyHandler(proxy)
# To trace requests, also pass urllib2.HTTPHandler(debuglevel=1) to build_opener.
opener = urllib2.build_opener(proxy_support)
urllib2.install_opener(opener)

# Send a browser-like User-Agent header so the site does not answer with
# 403 Forbidden.
i_headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.48'}

f = open("proxy.txt", "w")
for i in range(1, 1504):
    new_url = url + str(i)
    print(new_url)
    time.sleep(3)  # pause between page requests
    req = urllib2.Request(new_url, headers=i_headers)
    html = urllib2.urlopen(req).read()
    soup = BeautifulSoup(html, "html.parser")
    rows = soup.find_all('tr')
    # The first <tr> is skipped (presumably the table header row).
    for row in rows[1:]:
        tds = row.find_all("td")
        if len(tds) < 3:
            # Not a data row (too few cells); indexing it would raise
            # IndexError and abort the whole crawl.
            continue
        dirt[tds[1].text] = tds[2].text
        f.write(tds[1].text + ":" + tds[2].text + "\n")
    # Push this page's lines to disk so a failure on a later page does not
    # lose them. The file is left open because code beyond this section may
    # still use `f`.
    f.flush()
print(len(dirt))
# NOTE(review): this default timeout is installed only after the crawl above,
# so it affects sockets created from here on, not the page requests.
socket.setdefaulttimeout(3)