Python 爬虫 - 爬取豆瓣上的数据

使用 XPath 解析网页，并提取其中的数据。
想详细了解 XPath，请参阅 XPath 教程。

import urllib.request
import urllib.parse
from lxml import etree
import time
import json

# Module-level accumulator: main() appends one dict per scraped book here
# and serializes the whole list to JSON once every page has been processed.
item_list = []

def _fetch_page(url, headers):
    """Download *url* with the given request headers and return the body as text."""
    request = urllib.request.Request(url=url, headers=headers)
    # Use the response as a context manager so the connection is always
    # closed, even if reading or decoding raises.
    with urllib.request.urlopen(request) as response:
        return response.read().decode('utf-8')


def _parse_books(content):
    """Return one dict per ``li.subject-item`` element found in HTML *content*.

    Each dict has the keys ``name``, ``image``, ``synopsis`` and ``author``;
    every value is the list of XPath matches for that field (possibly empty).
    """
    tree = etree.HTML(content)
    if tree is None:
        # Nothing parseable was returned by the parser -> no books on this page.
        return []

    books = []
    for node in tree.xpath('//li[@class="subject-item"]'):
        pub_texts = node.xpath('.//div[@class="info"]/div[@class="pub"]/text()')
        # Drop whitespace-only text nodes ('\n', '\t', ...) and trim the rest.
        stripped = (text.strip() for text in pub_texts)
        books.append({
            'name': node.xpath('.//div[@class="info"]/h2/a/@title'),
            'image': node.xpath('.//div[@class="pic"]/a[@class="nbg"]/img/@src'),
            'synopsis': node.xpath('.//div[@class="info"]/p/text()'),
            'author': [text for text in stripped if text],
        })
    return books


def main():
    """Scrape a range of Douban book-tag pages and dump the results as JSON.

    Prompts on stdin for a 1-based start page and end page (inclusive),
    downloads each listing page of the tag encoded in the URL (the
    percent-encoded form of "历史"), extracts every book entry, appends it
    to the module-level ``item_list`` and finally writes the whole list to
    ``douban.txt`` as a single JSON document.

    Raises:
        ValueError: if either prompt is answered with a non-integer.
        urllib.error.URLError: if a page cannot be downloaded.
    """
    start_page = int(input("请输入起始页："))
    end_page = int(input("请输入结束页："))

    # The headers never change between requests, so build them once.
    headers = {'User-Agent': 'Mozilla/5.0 (X11; U; Linux x86_64;'
                             ' zh-CN; rv:1.9.2.10) Gecko/20100922'
                             ' Ubuntu/10.10 (maverick) Firefox/3.6.10'}

    for page in range(start_page - 1, end_page):
        # The URL's ``start`` parameter advances by 20 per page.
        url = ('https://book.douban.com/tag/%E5%8E%86%E5%8F%B2?start='
               + str(page * 20) + '&type=T')
        print("第" + str(page + 1) + "页开始下载")

        for item in _parse_books(_fetch_page(url, headers)):
            print(str(item['name']) + "正在下载.....")
            item_list.append(item)
            # Pause after every book. Only one request is made per page, so
            # this delay is what paces the crawl as a whole.
            time.sleep(0.5)

    # Serialize without ASCII-escaping so Chinese text stays readable.
    strings = json.dumps(item_list, ensure_ascii=False)
    with open('douban.txt', 'w', encoding='utf8') as fp:
        fp.write(strings)
# Run the scraper only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()

kyle-fang

发布了51 篇原创文章 · 获赞 29 · 访问量 2382

私信关注

python爬虫 - 爬取豆瓣上的数据

猜你喜欢