- 使用 XPath 解析网页,并提取其中的数据
- 想详细了解 XPath,请参阅 XPath 教程
import urllib.request
import urllib.parse
from lxml import etree
import time
import json
# Module-level accumulator: main() appends one dict per scraped book to this
# list and later serializes the whole list as JSON into douban.txt.
item_list = []
# Listing URL for one Douban book tag; the tag name is already percent-encoded
# in the path (the escape sequence decodes to "历史").
_TAG_URL = 'https://book.douban.com/tag/%E5%8E%86%E5%8F%B2'

# The `start` query parameter advances by this many entries per page.
_BOOKS_PER_PAGE = 20

# Seconds to wait on the network before giving up, so a stalled connection
# cannot hang the script forever.
_REQUEST_TIMEOUT = 30

# Sent with every request; built once because it never changes between pages.
_HEADERS = {'User-Agent': 'Mozilla/5.0 (X11; U; Linux x86_64;'
                          ' zh-CN; rv:1.9.2.10) Gecko/20100922'
                          ' Ubuntu/10.10 (maverick) Firefox/3.6.10'}


def _fetch_page(page):
    """Download the listing page with zero-based index *page*; return its text."""
    query = urllib.parse.urlencode({'start': page * _BOOKS_PER_PAGE,
                                    'type': 'T'})
    request = urllib.request.Request(url=_TAG_URL + '?' + query,
                                     headers=_HEADERS)
    # The context manager closes the HTTP response even if read/decode fails.
    with urllib.request.urlopen(request, timeout=_REQUEST_TIMEOUT) as response:
        return response.read().decode()


def _parse_books(content):
    """Extract one dict per ``<li class="subject-item">`` element in *content*.

    Each dict holds the raw XPath result lists under the keys ``name``,
    ``image``, ``synopsis`` and ``author``; the ``author`` entries are
    stripped and blank ones are dropped.
    """
    tree = etree.HTML(content)
    books = []
    for book_node in tree.xpath('//li[@class="subject-item"]'):
        pub_texts = book_node.xpath(
            './/div[@class="info"]/div[@class="pub"]/text()')
        books.append({
            'name': book_node.xpath('.//div[@class="info"]/h2/a/@title'),
            'image': book_node.xpath(
                './/div[@class="pic"]/a[@class="nbg"]/img/@src'),
            'synopsis': book_node.xpath('.//div[@class="info"]/p/text()'),
            'author': [text.strip() for text in pub_texts if text.strip()],
        })
    return books


def main():
    """Scrape a user-chosen range of listing pages and save them as JSON.

    Prompts on stdin for a 1-based start page and end page (inclusive),
    downloads each page, appends every parsed book to the module-level
    ``item_list`` and finally writes that list to ``douban.txt``.

    Raises:
        ValueError: if either answer is not an integer, if the start page is
            below 1, or if the end page precedes the start page.
    """
    start_page = int(input("请输入起始页:"))
    end_page = int(input("请输入结束页:"))
    # Reject ranges that would request a negative offset or silently
    # overwrite the output file without downloading anything.
    if start_page < 1 or end_page < start_page:
        raise ValueError("起始页必须不小于1,且结束页不能小于起始页")

    for page in range(start_page - 1, end_page):
        print("第" + str(page + 1) + "页开始下载")
        for item in _parse_books(_fetch_page(page)):
            print(str(item['name']) + "正在下载.....")
            item_list.append(item)
        # Short pause between page requests.
        time.sleep(0.5)

    with open('douban.txt', 'w', encoding='utf8') as fp:
        fp.write(json.dumps(item_list, ensure_ascii=False))
# Start the scraper only when this file is run as a script, not on import.
if __name__ == "__main__":
    main()