版权声明: https://blog.csdn.net/qq_25233621/article/details/80978018
BeautifulSoup库 #导入库 from bs4 import BeautifulSoup
demo = r.text ---------------------------------------------------------------------------------------#赋值
soup = BeautifulSoup(demo,"html.parser") ------------------------------------------------------#做汤
print(soup.prettify()) --------------------------------------------------------------------------------#格式化网页
Tag --------------------------------------------------------------------------------------------------#标签,最基本的信息组织单元 .<Tag>
Name ------------------------------------------------------------------------------------------------#标签的名字 .name
Attributes -------------------------------------------------------------------------------------------#标签的属性,字典形式组织 .attrs
NavigableString -------------------------------------------------------------------------------------#标签内非属性字符串 .string
Comment -------------------------------------------------------------------------------------------#标签内字符串注释部分
标签树下行遍历
.contents --------------------------------------------------------------------------------------------#子节点列表,将<Tag>的所有儿子节点存入列表
.children ---------------------------------------------------------------------------------------------#用于循环遍历儿子节点
.descendants ----------------------------------------------------------------------------------------#包含所有子孙节点
遍历儿子节点 遍历子孙节点
for child in soup.body.children:
    print(child)
标签树上行遍历
.parent ----------------------------------------------------------------------------------------------#节点的父亲标签
.parents ---------------------------------------------------------------------------------------------#节点的先辈标签
标签树的平行遍历
.next_sibling -----------------------------------------------------------------------------------------#返回按照HTML文本顺序的下一个平行节点标签
.previous_sibling -------------------------------------------------------------------------------------#返回按照HTML文本顺序的上一个平行节点标签
.next_siblings ----------------------------------------------------------------------------------------#返回按照HTML文本顺序的后续所有平行节点标签
.previous_siblings ------------------------------------------------------------------------------------#返回按照HTML文本顺序的前续所有平行节点标签
查找元素
demo = r.text ---------------------------------------------------------------------------------------#赋值
soup = BeautifulSoup(demo,"html.parser") ------------------------------------------------------#做汤
print(soup.prettify()) --------------------------------------------------------------------------------#格式化网页
Tag --------------------------------------------------------------------------------------------------#标签,最基本的信息组织单元 .<Tag>
Name ------------------------------------------------------------------------------------------------#标签的名字 .name
Attributes -------------------------------------------------------------------------------------------#标签的属性,字典形式组织 .attrs
NavigableString -------------------------------------------------------------------------------------#标签内非属性字符串 .string
Comment -------------------------------------------------------------------------------------------#标签内字符串注释部分
标签树下行遍历
.contents --------------------------------------------------------------------------------------------#子节点列表,将<Tag>的所有儿子节点存入列表
.children ---------------------------------------------------------------------------------------------#用于循环遍历儿子节点
.descendants ----------------------------------------------------------------------------------------#包含所有子孙节点
遍历儿子节点 遍历子孙节点
for child in soup.body.children:
    print(child)
标签树上行遍历
.parent ----------------------------------------------------------------------------------------------#节点的父亲标签
.parents ---------------------------------------------------------------------------------------------#节点的先辈标签
标签树的平行遍历
.next_sibling -----------------------------------------------------------------------------------------#返回按照HTML文本顺序的下一个平行节点标签
.previous_sibling -------------------------------------------------------------------------------------#返回按照HTML文本顺序的上一个平行节点标签
.next_siblings ----------------------------------------------------------------------------------------#返回按照HTML文本顺序的后续所有平行节点标签
.previous_siblings ------------------------------------------------------------------------------------#返回按照HTML文本顺序的前续所有平行节点标签
查找元素
<>.find_all(name,attrs,recursive,string) #查找 name:查找标签名 attrs:对标签属性检索字符串 recursive:是否检索全部子孙节点,默认True string:包含字符串
代码示例:re--beautifulsoup
import re
import requests
from bs4 import BeautifulSoup
import lxml
def getHtml(url):
    """Download a page and parse it into a BeautifulSoup tree.

    The HTTP status code of the response is printed as a side effect.

    Args:
        url: Address of the page to fetch.

    Returns:
        A ``BeautifulSoup`` object built with the ``html.parser`` backend,
        or ``None`` (after printing ``try_again``) when the request fails.
    """
    headers = {'user-agent': 'firefox/1545.45(windows)23.445'}
    try:
        # Without a timeout requests may block indefinitely on a dead host.
        r = requests.get(url, headers=headers, timeout=10)
    except requests.RequestException:
        # Only request-level failures (bad URL, connection error, timeout)
        # are handled here, so Ctrl+C and programming errors still surface.
        print("try_again")
        return None
    print(r.status_code)
    # Decode the body with the encoding detected from its content rather
    # than the one declared in the response headers.
    r.encoding = r.apparent_encoding
    return BeautifulSoup(r.text, "html.parser")
def find_tag(soup):
    """Prompt for a tag name and return every matching tag in *soup*.

    Args:
        soup: Parsed document exposing ``find_all``; may be ``None`` when
            the caller could not obtain a document.

    Returns:
        Whatever ``soup.find_all(tag)`` returns for the tag name typed by
        the user, or ``None`` (after printing ``can't_find_tag``) when
        *soup* has no ``find_all`` or standard input is exhausted.
    """
    print("输入标签名:")
    try:
        tag = input()
        tags = soup.find_all(tag)
    except (AttributeError, EOFError):
        # AttributeError: soup is None or not a parsed document.
        # EOFError: stdin closed before a tag name was entered.
        # Anything else (including Ctrl+C) is deliberately not swallowed.
        print("can't_find_tag")
        return None
    return tags
if __name__ == '__main__':
    # Script entry point: ask for a URL, fetch and parse it, then ask for a
    # tag name and print every occurrence of that tag.
    print("输入网址:")
    start_url = input()
    text = getHtml(start_url)
    # getHtml returns None (after printing "try_again") when the download
    # fails; skip the tag prompt in that case instead of searching None.
    if text is not None:
        tag_massage = find_tag(text)
        print(tag_massage)