2.数据解析

1.bs4

from bs4 import BeautifulSoup

# Sample markup for the BeautifulSoup examples. It omits the closing
# </body> and </html> tags; the recorded prettify() output further down
# includes them.
html = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title" name="dromouse"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1"><!-- Elsie --></a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""

# Build the BeautifulSoup object,
# using lxml as the parser.
soup = BeautifulSoup(html, "lxml")
# print(soup.prettify())
# The string literal below is a no-op statement that records what the
# prettify() call above printed.
'''
<html>
 <head>
  <title>
   The Dormouse's story
  </title>
 </head>
 <body>
  <p class="title" name="dromouse">
   <b>
    The Dormouse's story
   </b>
  </p>
  <p class="story">
   Once upon a time there were three little sisters; and their names were
   <a class="sister" href="http://example.com/elsie" id="link1">
    <!-- Elsie -->
   </a>
   ,
   <a class="sister" href="http://example.com/lacie" id="link2">
    Lacie
   </a>
   and
   <a class="sister" href="http://example.com/tillie" id="link3">
    Tillie
   </a>
   ;
and they lived at the bottom of a well.
  </p>
  <p class="story">
   ...
  </p>
 </body>
</html>
'''
#####################################################################
# find() returns the first matching tag; find_all() returns every match.
# Each element of the result is a Tag object.
print(soup.find("p"))
# Recorded output:
# <p class="title" name="dromouse"><b>The Dormouse's story</b></p>

print(soup.find_all("p"))
# Recorded output:
# [<p class="title" name="dromouse"><b>The Dormouse's story</b></p>, <p class="story">Once upon a time there were three little sisters; and their names were
# <a class="sister" href="http://example.com/elsie" id="link1"><!-- Elsie --></a>,
# <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
# <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
# and they lived at the bottom of a well.</p>, <p class="story">...</p>]

# Extra filters can narrow the search. Attribute filters are written here
# as keyword arguments; `class_` is used because `class` is a reserved word.

# Stop after the first two <p> tags.
soup.find_all("p", limit=2)

# <p> tags whose class is "sister".
# NOTE: in the sample markup only <a> tags carry class "sister", so this
# particular query matches nothing.
soup.find_all("p", class_="sister")

# <a> tags that have both class "sister" and id "link2".
soup.find_all("a", class_="sister", id="link2")

##############################################
# Reading the attributes of a tag
a_tags = soup.find_all("a")  # collect every <a> tag
for a_tag in a_tags:
    # Tag.get() looks the name up in the tag's attrs dictionary.
    print(a_tag.get("href"), a_tag.get("class"))
    # The text inside the tag is available as well.
    print(a_tag.string)

# Each iteration prints the href/class line and then the text line.
# Attribute lines recorded in the original notes:
#     http://example.com/elsie ['sister']
#     http://example.com/lacie ['sister']
#     http://example.com/tillie ['sister']
# Text lines recorded in the original notes (the first one is the comment
# node inside the first link):
#      Elsie
#     Lacie
#     Tillie
#
# tag.strings yields the text of nested child tags too.
# tag.string is None when a tag has more than one child; when the only child
# is another tag (e.g. <p><a>111</a></p>), the parent reports the child's
# string ("111").
# tag.strings also yields whitespace-only strings; use .stripped_strings
# when those are not wanted.

2.pyquery

from pyquery import PyQuery as PQ
 
# Sample document for the first batch of pyquery examples.
html = '''
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>古明地觉</title>
</head>
<body>
    <p><a href="http://www.bilibili.com">想进入基佬的大门吗？还等什么，快点击吧</a></p>
    <p>my name is satori</p>
    <div>
        <p>古明地觉</p>
    </div>
    <table>
        <tbody>
            <tr>
                <td>姓名：</td>
                <td><input type="text" name="name"></td>
            </tr>
            <tr>
                <td>密码：</td>
                <td><input type="password" name="password"></td>
            </tr>
            <tr>
                <td></td>
                <td><input type="submit" value="提交"></td>
            </tr>
        </tbody>
    </table>
</body>
</html>   
'''
# Besides a markup string, PyQuery can also load a document from a URL or a
# file path (see the url= / filename= keyword arguments in the pyquery docs).
# The result is a <class 'pyquery.pyquery.PyQuery'> object.
# It is used the same way as jQuery, which pyquery imitates.
p = PQ(html)
 
 
# Select a tag and print its markup
print(p("head"))
# This selects the head tag; note that the result is still a PyQuery object
'''
<head>
    <meta charset="UTF-8"/>
    <title>古明地觉</title>
</head>
'''


# html(): return the HTML inside the selected tag
print(p("head").html())
# p("head") selects the head tag, then .html() returns its inner markup
'''
    <meta charset="UTF-8"/>
    <title>古明地觉</title>
     
'''
print(p("title").html())
'''
古明地觉
'''


# text(): return the text content of the selection
print(p("head").text())
# Only the title tag contains any text
'''
古明地觉
'''


# Now select the p tags
print(p("p"))
# Every p tag is returned, including the one nested inside the div
'''
<p><a href="http://www.bilibili.com">想进入基佬的大门吗？还等什么，快点击吧</a></p>
    <p>my name is satori</p>
    <p>古明地觉</p>
'''


# A single match can be picked by index: p("p").eq(n) selects the p tag at
# zero-based index n, so eq(0) is the first p tag and eq(2) is the third.
# The result is still a PyQuery object; html() and text() turn it into str.
print(p("p").eq(0))  # <p><a href="http://www.bilibili.com">想进入基佬的大门吗？还等什么，快点击吧</a></p>
print(p("p").eq(2))  # <p>古明地觉</p>
print(p("p").eq(5))  # An out-of-range index does not raise; an empty line is printed.
print(p("p").html(), type(p("p").html()))
# Although many p tags are selected, html() returns the inner HTML of the
# first p tag only,
# and the value it returns is a str.
'''
<a href="http://www.bilibili.com">想进入基佬的大门吗？还等什么，快点击吧</a> <class 'str'>
'''

from pyquery import PyQuery as PQ
 
# Second sample document: compared with the first one it adds an id on the
# first p tag, class="s1" on two p tags, class="tr" on one table row and two
# extra links, which the selector examples below rely on.
html = '''
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>古明地觉</title>
</head>
<body>
    <p id="bili"><a href="http://www.bilibili.com">想进入基佬的大门吗？还等什么，快点击吧</a></p>
    <p class="s1">my name is satori</p>
    <div>
        <p class="s1">古明地觉</p>
    </div>
    <table >
        <tbody>
            <tr>
                <td>姓名：</td>
                <td><input type="text" name="name"></td>
            </tr>
            <tr class="tr">
                <td>密码：</td>
                <td><input type="password" name="password"></td>
            </tr>
            <tr>
                <td></td>
                <td><input type="submit" value="提交"></td>
            </tr>
        </tbody>
    </table>
    <a href="http://www.baidu.com" target="_blank">百度</a>
    <a href="http://www.yahoo.com">雅虎</a>
</body>
</html>   
'''

# Rebind p to a PyQuery object built from the second document.
p = PQ(html)
 
# Selecting with CSS selectors
# Select the tag with id=bili
print(p("#bili"))
'''
<p id="bili"><a href="http://www.bilibili.com">想进入基佬的大门吗？还等什么，快点击吧</a></p>
'''


# Select the tags with class=s1
print(p(".s1"))
'''
<p class="s1">my name is satori</p>
    <p class="s1">古明地觉</p>
'''


# "tbody .tr td" selects every td inside an element with class=tr inside
# tbody; .eq(0) then keeps the first of them
print(p("tbody .tr td").eq(0))
'''
<td>密码：</td>
'''


# Commonly used selectors (listed in the string literal below):
'''
p(".color")：获取class=color的标签
p("#color")：获取id=color的标签
p("*")：获取所有的标签
p("p")：获取所有的p标签
p("div,p")：获取所有的div标签和p标签
p("div p")：获取所有div标签内部的p标签
p("[target]")：选择带有target属性的所有元素
p("[target=_blank]")：选择带有target=_blank的所有元素
'''


# The selection can also be written in separate steps:
# select every p tag, then keep only those with class=s1
print(p("p").filter(".s1"))
'''
<p class="s1">my name is satori</p>
    <p class="s1">古明地觉</p>
'''


# Select every tr tag, keep those with class=tr, then select the td tags
# beneath them.
# filter() narrows the current selection to the elements matching the
# selector, while find() searches the descendants of the selection.
print(p("tr").filter(".tr").find("td"))
'''
                <td>密码：</td>
                <td><input type="password" name="password"/></td>
'''


# attr() reads an attribute of the selected tag
print(p("a").attr("href"))
# Like html(), it only reads from the first matched tag
'''
# http://www.bilibili.com
'''
 
 
# To read an attribute from every matched tag, iterate over the selection
# with .items(), which yields each matched element as its own PyQuery object.
# This replaces probing p("a").eq(i) for i in range(10) and stopping at the
# first empty serialisation: that approach silently ignored any link past the
# tenth and re-ran the selector twice per iteration.
for link in p("a").items():
    print(link.attr("href"))
'''
http://www.bilibili.com
http://www.baidu.com
http://www.yahoo.com
'''

1.bs4

2.pyquery

3.lxml

猜你喜欢