Why Scrapy supports multiple pipelines:
A single project may need to crawl several websites. Because each site's data volume and processing requirements differ, you can create a separate pipeline for each and route items to the right one.
class SpideranythingPipeline(object):
    def process_item(self, item, spider):
        # spider is the spider instance and 'itcast' is its name,
        # so the name can be used to tell multiple spiders apart
        if spider.name == 'itcast':
            print(item)
        return item
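Pipelines only run if they are enabled in settings.py. A minimal sketch, assuming all three classes in these notes live in a project package named SpiderAnything (the module paths here are assumptions, adjust them to your project); the numbers are priorities and lower values run first:

# settings.py (sketch; class paths are assumed, not from the original notes)
ITEM_PIPELINES = {
    'SpiderAnything.pipelines.SpideranythingPipeline': 300,
    'SpiderAnything.pipelines.SpiderSuningBookPipeline': 400,
    'SpiderAnything.pipelines.PracticePipeline': 500,
}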
Pipeline methods
Besides process_item, a pipeline can define open_spider and close_spider, which Scrapy calls when the spider starts and finishes; the two examples below use them to open and close database connections.
MySQL
import pymysql

class SpiderSuningBookPipeline(object):
    def process_item(self, item, spider):
        # collection.insert(dict(item))
        sql = """insert into book(title,author,download_text,new) values('%s','%s','%s','%s')""" \
              % (item['title'], item['author'], item['download_text'], item['new'])
        print(sql)
        self.cursor.execute(sql)
        return item

    def open_spider(self, spider):
        # open the database connection when the spider starts
        self.connect = pymysql.connect(host='127.0.0.1', port=3306, db='study',
                                       user='root', passwd='123456',
                                       charset='utf8', use_unicode=True)
        # the cursor is what actually runs inserts, deletes, updates and queries
        self.cursor = self.connect.cursor()
        self.connect.autocommit(True)

    def close_spider(self, spider):
        self.cursor.close()
        self.connect.close()
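Building the SQL with string interpolation breaks as soon as a title or author contains a single quote. A safer variant of process_item, sketched here for the same book table and fields, passes the values as a separate parameter tuple and lets pymysql do the escaping:

    def process_item(self, item, spider):
        sql = "insert into book(title, author, download_text, new) values(%s, %s, %s, %s)"
        # pymysql escapes each value when it is passed separately from the SQL text
        self.cursor.execute(sql, (item['title'], item['author'],
                                  item['download_text'], item['new']))
        return item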
MongoDB
from pymongo import MongoClient

class PracticePipeline(object):
    def process_item(self, item, spider):
        ''' receives the items returned by the spider '''
        pass

    def open_spider(self, spider):
        ''' called when the spider starts '''
        spider.hello = 'world'  # attributes can be attached to the spider here
        # initialise the database connection
        client = MongoClient()
        spider.collection = client['SpiderAnything']['hr']

    def close_spider(self, spider):
        ''' called when the spider closes '''
        pass
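The process_item above is left empty. A minimal sketch of what it could do with the collection attached in open_spider (the insert_one call and dict(item) conversion are assumptions, not part of the original notes):

    def process_item(self, item, spider):
        # convert the scrapy Item to a plain dict and store it in the
        # collection that open_spider attached to the spider instance
        spider.collection.insert_one(dict(item))
        return item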