利用lxml和requests完成当当网图书信息提取

版权声明:2018/4/10重启blog;转载请注明出处 https://blog.csdn.net/zhaiqiming2010/article/details/86512662
import requests
from lxml import html

def spider(sn):
    """Search Dangdang for ``sn`` and print the listed prices, cheapest first.

    Args:
        sn: Search keyword sent as the ``key`` query parameter (the script
            entry point passes a book number).

    Returns:
        A list of dicts with the keys ``name``, ``link``, ``price`` and
        ``business``, sorted by ascending price. ``price`` is the page text
        with the currency sign removed; ``business`` is the seller name, or
        '当当自营' when the listing shows no seller link.

    Raises:
        requests.RequestException: if the request fails or times out.
    """
    # Let requests build the query string so special characters in sn are
    # percent-encoded, and bound the wait so a stalled server cannot hang us.
    response = requests.get(
        'http://search.dangdang.com/',
        params={'key': sn, 'act': 'input'},
        timeout=10,
    )
    selector = html.fromstring(response.text)
    result_items = selector.xpath('//div[@id="search_nature_rg"]/ul/li')

    book_list = []
    for li in result_items:
        names = li.xpath('a/@title')
        links = li.xpath('a/@href')
        prices = li.xpath('p[3]/span[1]/text()')
        # Some <li> entries may lack these nodes; skip them instead of
        # raising IndexError.
        if not (names and links and prices):
            continue

        price = prices[0].replace('¥', '').strip()
        try:
            float(price)
        except ValueError:
            # A non-numeric price cannot be ordered; leave the entry out
            # rather than crash the sort below.
            continue

        # Evaluate the seller lookup once and always store a string, so
        # 'business' has the same type whether or not a seller is listed.
        sellers = li.xpath('p[@class="search_shangjia"]/a/text()')
        business = sellers[0] if sellers else '当当自营'

        book_list.append({
            'name': names[0],
            'link': links[0],
            'price': price,
            'business': business,
        })

    book_list.sort(key=lambda item: float(item['price']))
    for item in book_list:
        print(item['price'])
    return book_list
if __name__ == '__main__':
    # Sample book number used as the search keyword; replace with an
    # input() prompt to query interactively.
    book_number = "9787115428028"
    spider(sn=book_number)

猜你喜欢

转载自blog.csdn.net/zhaiqiming2010/article/details/86512662