利用 Python 爬取豆瓣电影 Top250 的相关信息，包括电影详情链接、图片链接、影片中文名、影片外国名、评分、评价数、概况、导演、主演、年份、地区、类别这 12 项内容，然后将爬取的信息写入 Excel 表中。基本上爬取结果还是挺好的。具体代码如下：
import sys
reload(sys)
sys.setdefaultencoding('utf8')
from bs4 import BeautifulSoup
import re
import urllib2
import xlwt
def askURL(url):
    """Fetch *url* and return the raw response body.

    Args:
        url: Address to request.

    Returns:
        Whatever ``response.read()`` yields for the page, or an empty string
        when the request fails with ``urllib2.URLError`` (the error's
        ``code`` and/or ``reason`` are printed in that case).
    """
    request = urllib2.Request(url)
    # Bind the result up front so a failed request still returns a value
    # instead of raising UnboundLocalError at the ``return``.
    html = ""
    try:
        response = urllib2.urlopen(request)
        try:
            html = response.read()
        finally:
            # Release the connection even if read() raises.
            response.close()
    except urllib2.URLError as e:
        # Best-effort: report what the error carries and fall through.
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
    return html
def getData(baseurl):
    """Scrape the ten list pages under *baseurl* and return one row per movie.

    Pages are requested as ``baseurl + str(offset)`` for offsets 0, 25, ...,
    225. Every ``<div class="item">`` on a page becomes one row.

    Args:
        baseurl: URL prefix to which the start offset is appended.

    Returns:
        A list of rows. Each row is a list of strings: detail link, image
        link, first title, second title (or ``' '``), rating, vote count,
        one-line summary (or ``' '``), followed by the pieces of the info
        paragraph. Rows are padded with ``' '`` to at least 12 entries.

    Raises:
        IndexError: if an item lacks a link, image, title, rating, vote
            count or info paragraph.
    """
    # NOTE(review): in the source these pattern literals had lost the HTML
    # tags around their capture groups (several were left unterminated, so
    # the module could not even be compiled). Only the groups and the re.S
    # flags are original; the tag text is a reconstruction that must be
    # checked against the live page markup before relying on the output.
    findLink = re.compile(r'<a href="(.*?)">')
    findImgSrc = re.compile(r'<img.*?src="(.*?)"', re.S)
    findTitle = re.compile(r'<span class="title">(.*)</span>')
    findRating = re.compile(
        r'<span class="rating_num" property="v:average">(.*)</span>')
    findJudge = re.compile(r'<span>(\d*)人评价</span>')
    findInq = re.compile(r'<span class="inq">(.*)</span>')
    findBd = re.compile(r'<p class="">(.*?)</p>', re.S)
    # Noise stripped from the info paragraph: spaces, newlines, a stray
    # closing break tag (reconstructed) and runs of dots.
    remove = re.compile(r' |\n|</br>|\.*')
    # Line break inside the info paragraph (reconstructed; accepts both the
    # <br> and <br/> spellings).
    lineBreak = re.compile(r'<br\s*/?>')
    # Presumably the page separates fields with non-breaking spaces - TODO
    # confirm. str(item) is a byte string on Python 2 and text on Python 3,
    # so build the separator in the matching type.
    nbsp = u'\xa0'.encode('utf-8') if str is bytes else u'\xa0'

    datalist = []
    for i in range(0, 10):
        url = baseurl + str(i * 25)
        html = askURL(url)
        # Name the parser explicitly so the result does not depend on which
        # optional parsers happen to be installed.
        soup = BeautifulSoup(html, "html.parser")
        for item in soup.find_all('div', class_='item'):
            data = []
            item = str(item)

            link = re.findall(findLink, item)[0]
            data.append(link)
            imgSrc = re.findall(findImgSrc, item)[0]
            data.append(imgSrc)

            titles = re.findall(findTitle, item)
            if len(titles) == 2:
                data.append(titles[0])
                # Drop the " / " that prefixes the second title.
                otitle = titles[1].replace(nbsp, " ").replace(" / ", "")
                data.append(otitle)
            else:
                data.append(titles[0])
                data.append(' ')

            rating = re.findall(findRating, item)[0]
            data.append(rating)
            judgeNum = re.findall(findJudge, item)[0]
            data.append(judgeNum)

            inq = re.findall(findInq, item)
            if inq:
                data.append(inq[0].replace("。", ""))
            else:
                data.append(' ')

            bd = re.findall(findBd, item)[0]
            bd = re.sub(remove, "", bd)
            # Turn every remaining separator into a plain space, then split.
            bd = re.sub(lineBreak, " ", bd)
            bd = bd.replace('/', " ").replace(nbsp, " ")
            data.extend(s for s in bd.split(" ") if s)

            # A row one field short is treated as missing its ninth column.
            # Only short rows are padded: the previous ``!= 12`` test also
            # pushed a blank into rows that were already too long.
            if len(data) < 12:
                data.insert(8, ' ')
            while len(data) < 12:
                data.append(' ')
            # NOTE(review): rows whose info paragraph splits into more than
            # five pieces still come out longer than 12 entries.
            datalist.append(data)
    return datalist
def saveData(datalist,savepath):
    """Write the scraped rows to a single-sheet workbook at *savepath*.

    Row 0 holds the twelve column headers; each entry of *datalist* is
    written to the following row.

    Args:
        datalist: Sequence of rows, each a sequence of cell values. Only the
            first twelve values of a row are written; shorter rows leave the
            remaining cells empty.
        savepath: Destination passed to ``Workbook.save``.
    """
    book = xlwt.Workbook(encoding='utf-8', style_compression=0)
    sheet = book.add_sheet('豆瓣电影Top250', cell_overwrite_ok=True)
    col = ('电影详情链接', '图片链接', '影片中文名', '影片外国名',
           '评分', '评价数', '概况', '导演', '主演', '年份', '地区', '类别')
    for j, header in enumerate(col):
        sheet.write(0, j, header)
    # Iterate over what was actually collected rather than assuming exactly
    # 250 rows of 12 cells, so a partial crawl or a short row no longer
    # raises IndexError before the workbook is saved.
    for i, data in enumerate(datalist):
        for j, value in enumerate(data[:len(col)]):
            sheet.write(i + 1, j, value)
    book.save(savepath)
def main():
    """Crawl the Douban Top250 list and save the rows to a spreadsheet."""
    baseurl = 'https://movie.douban.com/top250?start='
    datalist = getData(baseurl)
    # xlwt produces the legacy binary .xls format, so the file is named
    # accordingly; the previous .xlsx extension did not match its content.
    savepath = u'豆瓣电影Top250.xls'
    saveData(datalist, savepath)


# Run the crawl only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
Excel表部分内容如下: