1 #!/usr/bin/env python
2 #-*-coding: utf-8 -*-
3 import re
4 import urllib.request as request
5 from bs4 import BeautifulSoup as bs
6 import csv
7 import os
8 import sys
# `imp` was deprecated in Python 3.4 and removed in Python 3.12; importlib.reload
# is the same function and keeps the name `reload` available to the module.
from importlib import reload

# Python 2 leftover (historically paired with sys.setdefaultencoding); kept so
# the module's import-time behaviour is unchanged.
reload(sys)
11
def GetAllLink():
    """Prompt for a page count and scrape that many listing pages.

    Reads the number of pages from standard input, makes sure the
    ``./data/`` output directory exists, then calls ``GetPage`` once per
    page, passing the page URL and the zero-based page index.

    Raises:
        ValueError: if the text entered at the prompt is not an integer.
    """
    num = int(input("爬取多少页:>"))
    # exist_ok=True removes the check-then-create race of exists() + mkdir().
    os.makedirs('./data/', exist_ok=True)

    for i in range(num):
        # The first page is the bare listing URL; later pages use /pn<N>/.
        if i == 0:
            url = 'http://nj.58.com/piao/'
        else:
            url = 'http://nj.58.com/piao/pn%s/' % (i + 1)
        GetPage(url, i)
24
25
def _cell_text(row, name, css_class=None, missing='---'):
    """Return the text of the first ``name`` tag inside ``row``.

    ``css_class`` optionally restricts the match to tags carrying that CSS
    class. ``missing`` is returned when ``row`` contains no such tag.
    """
    tag = row.find(name, css_class) if css_class else row.find(name)
    return missing if tag is None else tag.get_text()


def GetPage(url, num):
    """Download one listing page and save its entries as a CSV file.

    The page is fetched with a browser-like User-Agent header and parsed with
    BeautifulSoup. Every ``<a class="t">`` inside a ``<tr>`` of the first
    ``<table>`` becomes one CSV row: title, link, price (``<b class="pri">``),
    original price (``<del>``) and show date (``<span class="pr25">``).
    Price and date fields missing from a row are written as ``'---'``.

    Args:
        url: Address of the listing page to fetch.
        num: Zero-based page index; used in the output file name
            ``./data/ticket_<num>.csv`` and (plus one) in the progress
            message.

    Raises:
        urllib.error.URLError: if the page cannot be fetched.
        UnicodeDecodeError: if the response body is not valid UTF-8.
        AttributeError: if the page contains no ``<table>`` element.
        OSError: if the CSV file cannot be opened (for example when
            ``./data/`` does not exist).
    """
    user_agent = 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:32.0) Gecko/20100101 Firefox/32.0'
    req = request.Request(url, headers={'User-Agent': user_agent})
    # The context manager closes the HTTP response even if decoding fails.
    with request.urlopen(req) as response:
        page = response.read().decode('utf-8')

    # NOTE(review): soup.table is None when the page has no <table>; the
    # find_all call below then raises AttributeError, as it did before.
    table = bs(page, "html.parser").table

    # Read every field from the title's own <tr>. Collecting titles, prices
    # and dates in separate page-wide lists (and padding the short ones at
    # the end) shifts values onto the wrong listing whenever an earlier row
    # lacks a price or a date.
    records = []
    for anchor in table.find_all('a', 't'):  # title and url
        row = anchor.find_parent('tr')
        if row is None:
            # Only links that sit inside a table row are listings.
            continue
        records.append([
            anchor.get_text(),
            anchor.get('href') or '',        # tolerate a link without href
            _cell_text(row, 'b', 'pri'),     # price
            _cell_text(row, 'del'),          # original price
            _cell_text(row, 'span', 'pr25'), # show date
        ])

    # newline='' is required by the csv module (otherwise blank rows appear
    # on Windows); an explicit encoding keeps the Chinese header writable
    # regardless of the platform's default locale.
    with open('./data/ticket_%s.csv' % num, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(['标题','url','售价','原价','演出时间'])
        # Rows are written as lists, so a '|' inside a title or URL can no
        # longer split a field into extra columns.
        writer.writerows(records)
    print("[Result]:> 页面 %s 信息保存完毕!" % (num + 1))
82
83
# Script entry point: start the interactive crawl only when this file is
# executed directly, not when it is imported as a module.
if "__main__" == __name__:
    GetAllLink()