作者:唯爱-U澄_155 | 来源:互联网 | 2023-10-10 12:05
查看页面源码
# Example
"""Scrape owner reviews from an Autohome review page into a text file.

1. Fetch the page source.
2. Parse the page source and extract the data.
"""
import requests
from pyquery import PyQuery

# File the extracted review lines are written to.
OUTPUT_PATH = "qingchezhijia.csv"
# Seconds to wait for the server before giving up; without a timeout
# ``requests.get`` may block indefinitely.
REQUEST_TIMEOUT = 10


def get_page_source(url):
    """Download *url* and return the response body decoded as GBK.

    Args:
        url: Address of the page to fetch.

    Returns:
        The page source as a string.

    Raises:
        requests.exceptions.RequestException: If the request fails or
            exceeds ``REQUEST_TIMEOUT`` seconds.
    """
    resp = requests.get(url, timeout=REQUEST_TIMEOUT)
    # The encoding is forced to GBK before the text is decoded.
    resp.encoding = "gbk"
    return resp.text


def _format_review(mt):
    """Return the output line for one ``.mt-10`` review container *mt*."""
    # When the third <dl> is not the dealer entry, insert a placeholder after
    # the second one so the nth-child positions used below line up for
    # every review.
    # NOTE(review): the placeholder literal looks like it lost its HTML tags
    # when this script was copied — confirm against the original markup.
    if not mt("div >dl:nth-child(3)>dt:contains(购车经销商)"):
        mt("div >dl:nth-child(2)").after(PyQuery("""购车经销商 """))

    car = mt("div>dl:nth-child(1)>dd").eq(0).text().replace("\n", "").replace(" ", "")
    place = mt("div>dl:nth-child(2)>dd").eq(0).text()
    purchase_time = mt("div>dl:nth-child(4)>dd").eq(0).text()
    # Unit suffixes are stripped so only the bare values are written.
    price = mt("div>dl:nth-child(5)>dd").eq(0).text().replace("万元", "")
    youhao = mt("div>dl:nth-child(6)>dd >p:nth-child(1)").eq(0).text().replace("升/百公里", "")
    kilometer = mt("div>dl:nth-child(6)>dd >p:nth-child(2)").eq(0).text().replace("公里", "")
    # Remaining <dd> texts, split on whitespace; written as a list repr.
    other = mt("div>div>dl>dd").text().split()

    return (
        f"购买车型: {car} ,购买地点: {place} ,购买时间: {purchase_time} ,"
        f"购车购买价: {price} ,油耗: {youhao} ,目前行驶: {kilometer} ,其它: {other} \n"
    )


def parse_page_source(html, out=None):
    """Extract every review in *html* and write one line per review.

    Args:
        html: Page source to parse.
        out: Writable text file object that receives the lines. When
            omitted, ``OUTPUT_PATH`` is opened for writing (UTF-8) for the
            duration of this call and closed afterwards.
    """
    if out is None:
        with open(OUTPUT_PATH, mode="w", encoding="utf-8") as fh:
            parse_page_source(html, fh)
        return

    doc = PyQuery(html)
    for mt in doc(".mt-10").items():
        out.write(_format_review(mt))


def main():
    """Fetch the review page and save the extracted reviews to ``OUTPUT_PATH``."""
    url = "https://k.autohome.com.cn/146/"
    # The context manager guarantees the file is flushed and closed even if
    # fetching or parsing raises.
    with open(OUTPUT_PATH, mode="w", encoding="utf-8") as out:
        html = get_page_source(url)
        parse_page_source(html, out)


if __name__ == '__main__':
    main()
运行结果: