# coding=utf-8
import time
from urllib.parse import urljoin

import pymongo
import requests
from bs4 import BeautifulSoup
# Get the MongoClient object
client = pymongo.MongoClient("localhost", 27017)
# Get the database we will write to
db = client.news
# Crawl the news list pages
def start_crawler():
    page_num = 1
    while page_num <= 1:  # only page 1 for now; raise the bound to crawl more pages
        url = ("http://www.sxcoal.com/news/seniorsearch?"
               "GeneralNewsSearch%5Bcategory_id%5D%5B0%5D=1"
               "&GeneralNewsSearch%5Bnews_industry_ids%5D="
               "&GeneralNewsSearch%5Bnews_tag_ids%5D="
               "&GeneralNewsSearch%5Bport_ids%5D="
               "&GeneralNewsSearch%5Bprov_area_id%5D="
               "&page={}&per-page=10").format(page_num)
        print(url)
        page_num += 1
        user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
        headers = {'User-Agent': user_agent}
        response = requests.get(url, headers=headers)
        content = response.text
        one_page = get_page_news(content)
        time.sleep(1)
        if one_page:
            to_mysql(one_page)
            time.sleep(1)
        else:
            break
    print('News list crawl finished')
# Persist one page of headlines (title, date, URL); currently a stub that only
# prints them, since the Mongo insert already happens in get_page_news
def to_mysql(one_page):
    print(one_page)
def get_page_news(content):
    soup = BeautifulSoup(content, 'lxml')
    one_page_list = []
    for i in soup.find_all("div", class_="artnr"):
        title = i.select('h4')[0].text
        # list-page hrefs may be relative, so resolve them against the site root
        url = urljoin('http://www.sxcoal.com', i.a['href'])
        date = i.p.find('span', class_='spandate').string.split(" ")[1]
        one_page = {'title': title, 'url': url, 'date': date,
                    'type': 'news', 'label': 'www.sxcoal.com'}
        db.newstitle.insert_one(one_page)
        one_page_list.append((title, url, date))
    return one_page_list
# Fetch the full body of every stored article
def get_new_body():
    link_list = get_news_linksfrom_database()
    for doc in link_list:  # each doc carries '_id' and 'url'
        news_body = get_news_text(doc['url'])
        print(doc['_id'])
        # attach the body to the existing document rather than inserting a new one
        db.newstitle.update_one({'_id': doc['_id']},
                                {'$set': {'newsbody': news_body}})
    print('Article bodies done!')
def get_news_linksfrom_database():
    # materialise the cursor so callers get a plain (possibly empty) list
    return list(db.newstitle.find({'label': 'www.sxcoal.com'},
                                  {'_id': 1, 'url': 1}))
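
# A variant query (a sketch, not in the original script): select only rows
# that still lack a 'newsbody' field, so get_new_body() can be re-run without
# refetching articles that were already filled in.
#
#   db.newstitle.find({'label': 'www.sxcoal.com',
#                      'newsbody': {'$exists': False}},
#                     {'_id': 1, 'url': 1})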
def get_news_text(url):
    html = requests.get(url)
    html.encoding = html.apparent_encoding
    soup = BeautifulSoup(html.text, 'html.parser')
    # find() returns None when the article container is missing, so test for
    # it explicitly instead of stringifying a missing node
    node = soup.find('div', {'id': 'Zoom'})
    return str(node) if node is not None else None
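
# Quick standalone check of get_news_text (the address below is a placeholder,
# not a real article URL):
#
#   body = get_news_text('http://www.sxcoal.com/news/<article-id>')
#   print(body[:200] if body else 'no <div id="Zoom"> on the page')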
if __name__ == '__main__':
    # Crawl the news summaries
    # start_crawler()
    # Crawl the full articles
    get_new_body()
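
# Optional hardening (a sketch, not part of the original script): a unique
# index on 'url' makes insert_one raise DuplicateKeyError instead of storing
# the same headline twice when start_crawler() is re-run.
#
#   db.newstitle.create_index([('url', pymongo.ASCENDING)], unique=True)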