import requests
# Fetch the Douban Books home page.  The timeout keeps the script from hanging
# forever on an unresponsive server, and raise_for_status() stops early on an
# HTTP error status instead of silently parsing an error page.
r = requests.get('https://book.douban.com/', timeout=10)
r.raise_for_status()
content = r.text  # decoded response body, used by the regex extraction below
# Main HTML to be parsed:
# <div class="info">
# <div class="title">
# <a class="" href="https://book.douban.com/subject/30163860/?icn=index-editionrecommend"
# title="绿毛水怪">绿毛水怪</a>
# </div>
# <div class="author">
# 王小波
# </div>
# <div class="more-meta">
# <h4 class="title">
# 绿毛水怪
# </h4>
# <p>
# <span class="author">
# 王小波
# </span>
# /
# <span class="year">
# 2018-5-1
# </span>
# /
# <span class="publisher">
# 北京十月文艺出版社
# </span>
# Approach 1: regular expressions
import re

# For each book, capture the <h4> title and then the contents of the author,
# year and publisher <span> elements inside the following <p>.  re.S lets "."
# also match the newlines that separate the tags.
pattern = re.compile(
    r'<h4.*?>(.*?)</h4>.*?<p>.*?author">.*?(.*?)</span>.*?year">.*?(.*?)</span>.*?publisher">.*?(.*?)</span>.*?</p>',
    re.S,
)
# Compiled once: used to strip every whitespace character from each field.
whitespace = re.compile(r"\s")

results = pattern.findall(content)
print(results)
for result in results:
    # Each match is a 4-tuple (title, author, year, publisher); the captured
    # text still carries the newlines/indentation of the HTML, so remove it.
    name, author, time, chuban = (whitespace.sub('', field) for field in result)
    print(name, author, time, chuban)
# Approach 2: BeautifulSoup (bs4)
# The class exported by bs4 is ``BeautifulSoup``; importing ``beautiful``
# fails with ImportError and leaves the name used below undefined.
from bs4 import BeautifulSoup

# Hand the raw response bytes to the parser (lxml backend).
html = r.content
soup = BeautifulSoup(html, "lxml")
print(type(soup))

# One shared match-anything pattern, used as the string filter for every
# lookup below instead of compiling the same regex four times.
any_text = re.compile(".*?")

# find_all / string= are the current spellings of the legacy findAll / text=.
name = soup.find_all(name='h4', class_='title', string=any_text)
author = soup.find_all(name='span', class_='author', string=any_text)
time = soup.find_all(name='span', class_="year", string=any_text)
chuban = soup.find_all(name="span", class_="publisher", string=any_text)
# Approach 3: XPath (lxml)
from lxml import etree