# -*- coding: utf-8 -*-
2
# 1. Two list pages to crawl
# 2. Grab each page's committee titles and URLs
# 3. Create a file for each title, send the URL request, and extract the data
import requests
from lxml import etree
import time, random, xlwt


# XPath for the expert-committee member rows: '//tbody//tr[@height="29"]'

class Doc_spider(object):

    def __init__(self):
        self.base_url = 'http://www.bjmda.com'
        self.url = 'http://www.bjmda.com/Aboutus/ShowClass.asp?ClassID=12&page={}'
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36'}

    def get_request(self, url):
        """Send the request and return the parsed html."""
        response = requests.get(url, headers=self.headers).content.decode('gbk')
        # time.sleep(random.random())  # optional politeness delay
        html = etree.HTML(response)
        return html
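
    # The site serves GBK today, but hard-coding the codec is brittle. A
    # minimal alternative sketch (an assumption, not part of the original)
    # lets requests detect the charset instead:
    #
    #     resp = requests.get(url, headers=self.headers, timeout=10)
    #     resp.encoding = resp.apparent_encoding
    #     html = etree.HTML(resp.text)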

    def parse_page_html(self, html, url):
        """Extract the expert committees' titles and URLs from a list page."""

        url_lists = html.xpath('//tr/td[2]/a[2]/@href')[1:]
        temp_lists = html.xpath('//tr/td[2]/a[2]/text()')[1:]
        title_lists = [title.rstrip() for title in temp_lists]

        urls = []
        titles = []

        for i in range(len(title_lists)):
            url = self.base_url + url_lists[i]
            title = title_lists[i]
            urls.append(url)
            titles.append(title)

        return urls, titles
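
    # The index loop above can also be written with zip, an equivalent idiom:
    #
    #     for href, title in zip(url_lists, title_lists):
    #         urls.append(self.base_url + href)
    #         titles.append(title)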

    def parse_detail(self, html):
        """Extract the data from a detail page; return one list per table row."""

        lists = html.xpath('//td[@id="fontzoom"]//tr')
        content_list = []
        for row in lists:  # renamed from `list`, which shadowed the builtin
            contents = row.xpath('.//td//text()')
            new = []
            for i in contents:
                new.append(''.join(i.split()))  # strip all whitespace inside each cell
            content_list.append(new)

        return content_list
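
    # Illustrative shape of the return value (cell values are hypothetical):
    #
    #     [['Name', 'Affiliation', 'Post'],   # header row
    #      ['...',  '...',         '...'],    # one list per member
    #      ...]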

    def save_excel(self, sheet_name, contents, worksheet, workbook):
        """Save one committee's rows into its worksheet."""

        # The workbook and worksheet are created once in run() and passed in:
        # workbook = xlwt.Workbook()
        # worksheet = workbook.add_sheet(sheet_name)

        try:
            # contents[0] is the table header, so the data starts at contents[i + 1]
            for i in range(len(contents) - 1):
                if len(contents[i + 1]) > 1:
                    content_list = contents[i + 1]

                    # write() arguments are row, column, value
                    worksheet.write(i, 0, label=content_list[0])
                    worksheet.write(i, 1, label=content_list[1])
                    worksheet.write(i, 2, label=content_list[2])
                    if len(contents[i + 1]) > 3:
                        worksheet.write(i, 3, label=content_list[3])

            # Saving happens once in run() instead:
            # workbook.save(sheet_name + '.xls')
            # time.sleep(0.1)
        except IndexError:
            # a row with fewer cells than expected ends the sheet early
            pass
        print(sheet_name, 'saved')
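
    # The same write loop without index arithmetic, as a sketch: iterate the
    # data rows directly and write up to four cells per row.
    #
    #     for row_num, cells in enumerate(contents[1:]):
    #         for col, value in enumerate(cells[:4]):
    #             worksheet.write(row_num, col, value)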

    def run(self):
        # 1. Request the expert-committee list pages (two pages)
        urls = [self.url.format(i + 1) for i in range(2)]

        # Create a single workbook for all the committees
        workbook = xlwt.Workbook()
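
        # xlwt.Workbook also accepts an encoding, e.g.
        # xlwt.Workbook(encoding='utf-8'), which matters when byte strings
        # rather than unicode are written into cells or sheet names.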

        for url in urls:
            html = self.get_request(url)
            # 2. Extract the committees' titles and URLs
            list_urls, titles = self.parse_page_html(html, url)

            for i in range(len(list_urls)):
                url_detail = list_urls[i]
                # Name of this committee
                title_detail = titles[i]
                # 3. Create each committee's sheet and send its request
                html_detail = self.get_request(url_detail)
                # 4. Extract the member details from the detail page
                contents = self.parse_detail(html_detail)
                # Save all the members of this committee

                # One worksheet per committee
                worksheet = workbook.add_sheet(title_detail)
                self.save_excel(title_detail, contents, worksheet, workbook)

        # Save once, after every sheet has been filled
        workbook.save('专家委员会.xls')
        print('Finished saving, please check 专家委员会.xls')

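
# Optional hardening, as a sketch (fetch_with_retry is a hypothetical helper,
# not part of the original spider): network calls can fail transiently, so a
# small retry wrapper with exponential backoff is a common addition.
def fetch_with_retry(url, headers, attempts=3):
    """Fetch url, retrying on network errors with exponential backoff."""
    for attempt in range(attempts):
        try:
            return requests.get(url, headers=headers, timeout=10)
        except requests.RequestException:
            if attempt == attempts - 1:
                raise  # out of retries, surface the error
            time.sleep(2 ** attempt)  # back off 1s, 2s, ...
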
if __name__ == '__main__':
    doc = Doc_spider()
    doc.run()