作者:手机用户2602906305_849 | 来源:互联网 | 2023-05-17 12:43
先来张爬取结果的截图
再来份代码吧
import requests
import re
from bs4 import BeautifulSoup
from tkinter import scrolledtext
from tkinter import ttk
import tkinter as tk
import threading
def getHtml(ID):
    """Download one page of the Douban Top 250 movie list.

    Builds the list URL with ``ID`` substituted into the ``start`` query
    parameter, prints the URL, issues a GET request carrying a browser-like
    User-Agent and a fixed cookie, and returns the response body.

    Args:
        ID: Value formatted into the ``start`` query parameter
            (presumably the paging offset -- confirm against the caller).

    Returns:
        The response body decoded as text (``Response.text``).
    """
    url = 'https://movie.douban.com/top250?start=%s&filter=' % ID
    print('url ' + url)
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.104 Safari/537.36 Core/1.53.2372.400 QQBrowser/9.5.10548.400',
        # NOTE(review): hard-coded session cookie; consider loading it from
        # configuration instead of keeping it in source.
        'COOKIE': 'bid=I0klBiKF3nQ; ll="118277"; gr_user_id=ffdf2f63-ec37-49b5-99e8-0e0d28741172; ap=1; _vwo_uuid_v2=8C5B24903B1D1D3886FE478B91C5DE97|7eac18658e7fecbbf3798b88cfcf6113; _pk_ref.100001.4cf6=%5B%22%22%2C%22%22%2C1498305874%2C%22https%3A%2F%2Fbook.douban.com%2Ftag%2F%25E9%259A%258F%25E7%25AC%2594%3Fstart%3D20%26type%3DT%22%5D; _pk_id.100001.4cf6=4e61f4192b9486a8.1485672092.5.1498306809.1498235389.; _pk_ses.100001.4cf6=*',
    }
    # The second positional parameter of requests.get() is ``params``, so the
    # dict must be passed by keyword; otherwise it is encoded into the query
    # string and no custom headers are sent at all.
    response = requests.get(url, headers=headers)
    return response.text
def parseHtml(html):
print('init html.....')
titleRe = r'(.[^&]*?)'
regTitle = re.compile(titleRe)
titleStr = re.findall(regTitle, html)
retStars = r'.*?"v:average">(.*?)'
regStars = re.compile(retStars)
starts = re.findall(regStars, html)
regCommend = r'(.*?)'
regCommends = re.compile(regCommend)
commends = []
commends = re.findall(regCommends, html)
commends.remove('·')
commends.remove('更多')
commends.remove('{{= year}}')
commends.remove('{{= sub_title}}')
commends.remove('{{= address}}')
commends.remove('集数未知')
commends.remove('共{{= episode}}集')
regScrip = r'.*?"inq">(.*?)'
regx_scrip = re.compile(regScrip)
list_scrip = re.findall(regx_scrip, html)