我在代码中使用的是 python + bs4 + pyside，请看下面的部分代码：
#coding:gb2312
import urllib2
import sys
import urllib
import urlparse
import random
import time
from datetime import datetime, timedelta
import socket
from bs4 import BeautifulSoup
import lxml.html
from PySide.QtGui import *
from PySide.QtCore import *
from PySide.QtWebKit import *
def download(self, url, headers, proxy, num_retries, data=None):
    """Fetch ``url`` and return its body together with the HTTP status code.

    Args:
        url: Address to request.
        headers: Mapping of request headers; a falsy value sends none.
        proxy: Proxy address registered for the URL's own scheme, or a falsy
            value to connect directly.
        num_retries: Number of further attempts allowed after a 5XX error.
        data: Optional request body, passed straight to ``urllib2.Request``.

    Returns:
        A dict ``{'html': body, 'code': status}``. When the request fails,
        ``html`` is ``''`` and ``code`` is the exception's ``code`` attribute,
        or ``None`` if the exception has none.
    """
    # Single-argument print(...) emits the same text under Python 2 and 3.
    print('Downloading: %s' % url)
    request = urllib2.Request(url, data, headers or {})
    opener = self.opener or urllib2.build_opener()
    if proxy:
        proxy_params = {urlparse.urlparse(url).scheme: proxy}
        opener.add_handler(urllib2.ProxyHandler(proxy_params))
    try:
        response = opener.open(request)
        html = response.read()
        code = response.code
    except Exception as e:
        print('Download error: %s' % str(e))
        html = ''
        # Only HTTP-level errors expose a status code; everything else
        # (DNS failure, timeout, ...) is reported with code None.
        code = getattr(e, 'code', None)
        if code is not None and num_retries > 0 and 500 <= code < 600:
            # Retry 5XX HTTP errors through this same method.
            # NOTE(review): this used to call self._get with the identical
            # argument list; no _get is visible here -- confirm none exists.
            return self.download(url, headers, proxy, num_retries - 1, data)
    return {'html': html, 'code': code}
def crawling_hdf(openfile, page_encoding='gb18030'):
    """Look up every doctor name listed in ``openfile`` on so.haodf.com.

    For each non-blank line of the file the search page is downloaded, the
    ``href`` of the ``docName font_16`` element inside the first ``docInfo``
    element is printed, and that link is handed to ``loadPage_bs4``.

    Args:
        openfile: Path of a text file holding one doctor name per line.
        page_encoding: Encoding passed to BeautifulSoup as ``from_encoding``
            when the download is a byte string. Pass ``None`` to let
            BeautifulSoup detect the encoding itself.
            NOTE(review): the default assumes the site serves GBK-family
            text (gb18030 is a superset of gb2312/GBK) -- confirm against
            the page's declared charset.
    """
    # Strip line endings so they are not percent-encoded into the query,
    # and close the file even if reading fails.
    with open(openfile, 'r') as name_file:
        names = [line.strip() for line in name_file]

    # Never read below, but kept bound for the whole crawl; presumably the
    # Qt-based loadPage_bs4 needs a live QApplication -- TODO confirm.
    app = QApplication(sys.argv)

    # One downloader is built up front and reused for every name.
    downloader = Downloader(delay=DEFAULT_DELAY, user_agent=DEFAULT_AGENT,
                            proxies=None, num_retries=DEFAULT_RETRIES,
                            cache=None)

    for name in names:
        if not name:
            continue
        url = ("http://so.haodf.com/index/search?type=doctor&kw="
               + urllib.quote(name))

        # Get the doctor's home page.
        html = downloader(url)
        if page_encoding and isinstance(html, bytes):
            # An explicit encoding stops BeautifulSoup from guessing and
            # replacing undecodable bytes with REPLACEMENT CHARACTER.
            soup = BeautifulSoup(html, from_encoding=page_encoding)
        else:
            soup = BeautifulSoup(html)

        info = soup.find(attrs={'class': 'docInfo'})
        link = None
        if info is not None:
            link = info.find(attrs={'class': 'docName font_16'})
        if link is None:
            # Empty download or no search hit: skip this name and carry on.
            print('No doctor entry found for: %s' % name)
            continue

        td = link.get('href')
        print(td)

        # Get the doctor's detail information page.
        loadPage_bs4(td)
if __name__ == '__main__':
    # Script entry point: crawl the doctors listed in name_list.txt.
    crawling_hdf("name_list.txt")
运行程序后，会显示警告信息：
警告（来自 warnings 模块）：
文件 "C:\Python27\lib\site-packages\bs4\dammit.py"，第 231 行
"无法解码某些字符，并且"
UnicodeWarning：某些字符无法解码，已替换为替换字符（REPLACEMENT CHARACTER）。
我用 print str(html) 查看后，发现所有标签中的中文都是乱码。
我尝试过在本站搜索到的 decode/encode 和 gzip 解决方案，但都不适用于我的情况。
非常感谢你的帮助