import re
from urllib.request import urlopen

import lxml
from bs4 import BeautifulSoup
# Download the page and parse it with the lxml parser.
url = 'https://baike.baidu.com'
html = urlopen(url).read().decode('utf-8')
soup = BeautifulSoup(html, features="lxml")

# Print the first <p> tag found in the document.
print(soup.p)

# Find every <a> tag, then print each link's href attribute.
# NOTE: 'href' must be a string key — the bare name `href` is undefined.
a = soup.find_all('a')
for i in a:
    print(i['href'])
# find_all() in BeautifulSoup also accepts attribute filters as parameters.
# Re-parse the page and filter find_all() by tag name plus attributes,
# e.g. matching <li class="month">January</li>.
soup = BeautifulSoup(html, features='lxml')
month = soup.find_all('li', {'class': 'month'})

# find_all() returns a ResultSet (a list of tags), not a single tag,
# so get_text() must be called on each match rather than on the list.
for m in month:
    print(m.get_text())
# Match <img> tags whose src attribute ends in ".jpg".
# Raw string r'...' keeps '\.' a literal dot without an invalid-escape warning.
img_links = soup.find_all('img', {'src': re.compile(r'.*?\.jpg')})
for i in img_links:
    print(i['src'])