import re
from bs4 import BeautifulSoup
from lxml import etree
# Sample input: an HTML fragment that the tag-stripping approaches in this
# script operate on.
html = '''
<div id="info">
<span ><span class='pl'>导演</span>: <span class='attrs'><a>郭帆</a></span></span><br/>
<span ><span class='pl'>编剧</span>: <span class='attrs'><a >郭帆</a></span></span><br/>
<span class="pl">制片国家/地区:</span> 中国大陆<br/>
<span class="pl">语言:</span> 汉语普通话 / 俄语 / 英语 / 印地语 / 法语<br/>
<span class="pl">上映日期:</span> <span >2023-01-22(中国大陆)</span><br/>
<span class="pl">片长:</span> <span>173分钟</span><br/>
<span class="pl">又名:</span> The Wandering Earth Ⅱ / The Wandering Earth 2 / 《流浪地球》前传<br/>
<span class="pl">IMDb:</span> tt13539646<br>
</div>
'''
# Method 1: strip tags with a regular expression.
# Every "<...>" run that contains no ">" is removed; the text between tags,
# including its whitespace and newlines, is left untouched.
pattern = re.compile(r'<[^>]+>', re.S)
result = pattern.sub('', html)
# The replacement field is kept on a single line: a line break inside the
# braces of a single-quoted f-string is a SyntaxError before Python 3.12.
print(f"正则去除:{result}")
# Method 2: parse the fragment with BeautifulSoup, using the "html.parser"
# backend, and print the text returned by get_text().
soup = BeautifulSoup(html, 'html.parser')
# The replacement field is kept on a single line: a line break inside the
# braces of a single-quoted f-string is a SyntaxError before Python 3.12.
print(f"BeautifulSoup去除:{soup.get_text()}")
# Method 3: parse the fragment with lxml and evaluate the XPath expression
# string(.) on the parsed root, which yields its text content as one string.
response = etree.HTML(text=html)
# The replacement field is kept on a single line: a line break inside the
# braces of a single-quoted f-string is a SyntaxError before Python 3.12.
print(f"etree去除:{response.xpath('string(.)')}")
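# Article title: Methods for removing all HTML tags in Python
# Reposted from: blog.csdn.net/weixin_43824520/article/details/129349325
# (Leftover page navigation from the source site: "You may also like",
# "Today's picks", "Weekly ranking".)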