Python3~xpath

from lxml import etree
from urllib import request
import ssl

# NOTE(review): this replaces the default HTTPS context factory with the
# unverified one, which turns off certificate verification for HTTPS requests
# made through urllib/http.client in this process, and it relies on private
# `ssl` names. Nothing in the visible script performs a network request; the
# line is kept only so existing behaviour is unchanged. Confirm whether it is
# still needed before reusing this script.
ssl._create_default_https_context = ssl._create_unverified_context

# Sample bookstore document that every XPath example below runs against.
BOOKSTORE_MARKUP = '''
<bookstore>
    <title>新华书店</title>
    <book href="http://www.langlang2017.com/">
        <title lang="eng">Harry Potter</title>
        <price>29.99</price>
    </book>
    <book>
        <title lang="zh">Learning XML</title>
        <price>39.95</price>
    </book>
    <book>
        <title>python大全</title>
        <price>99.95</price>
    </book>
</bookstore>
'''

# etree.HTML() parses markup into an element tree and etree.tostring()
# serialises a tree back to bytes. They are only approximately inverse: the
# HTML parser wraps the fragment in <html><body>...</body></html>, so the
# serialised result is not identical to the input string.
html = etree.HTML(BOOKSTORE_MARKUP)
# print(html)  # e.g. <Element html at 0x104c5f048>

# Named `serialized` rather than `re` so the stdlib `re` module name is not
# shadowed.
serialized = etree.tostring(html)
print(serialized)

# ---------------------------------------------------------------------------
# XPath usage
# ---------------------------------------------------------------------------
# Current node ('.'), parent node ('..') and relative paths:
# result = html.xpath('//bookstore')
# result = html.xpath('//book')
# book1 = result[0]
# print(book1.xpath('.'))
# print(book1.xpath('..'))
# book2 = result[1]
# print(book2.xpath('.'))
# print(book2.xpath('../book'))

# Goal: get the title of the second book.
titles = html.xpath("//title")
# print(titles)

books = html.xpath("//book")
book2 = books[1]
# Title of book2:
# print(book2.xpath('./title'))
# Title of book1, reached by going up from book2 and back down:
# book1 = book2.xpath('../book')[0]
# print(book1.xpath('./title')[0])

# The real root node (the HTML parser adds <html> and <body>):
# print(html.xpath('/html'))
# print(html.xpath('/html/body/bookstore'))
# Starting from the title inside book1, reach the title node inside book2:
# title1 = html.xpath('//title')[0]
# print(title1)
# book1 = title1.xpath('..')[0]
# print(book1)
# book2 = book1.xpath('../book')[1]
# print(book2)
# title2 = book2.xpath('./title')[0]
# print(title2)

# Exploring '//':
# books = html.xpath('//book')
# book1 = books[1]
# print(html.xpath('//title'))
# print(book1.xpath('//title'))  # same query no matter which node '//' is run from

# '@' selects attributes:
# titles = html.xpath('//title')
# print(titles)

# titles = html.xpath('//title[@lang]')
# print(titles)

# titles = html.xpath('//title[@lang="zh"]')
# print(titles)

# books = html.xpath('//book[price<40]')
# print(books)

# titles = html.xpath('//book[price<40]/title')
# print(titles)

# books = html.xpath('//book[price=29.99]')
# print(books)

books = html.xpath("//book")
# print(books[0].xpath('./*'))  # child elements: title, price
# print(books[0].xpath('@*'))
# print(books[0].xpath('@href'))
#
# print(html.xpath('//*'))  # select every element in the document
# print(html.xpath('//title[@*]'))  # select every title that has any attribute

# '|' unions two node sets in a single query.
union_query = "//book/title | //book/price"
print(html.xpath(union_query))
# The two separate queries below select the same elements as the union above.
# NOTE(review): the printed order may differ -- the union is expected to come
# back in document order, while the concatenation lists all titles first and
# then all prices.
titles = html.xpath("//book/title")
prices = html.xpath("//book/price")
print([*titles, *prices])

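Console output (note: the two element lists below correspond to the commented-out `//*` and `//title[@*]` examples, not to the two union prints that are active in the script above):
/Library/Frameworks/Python.framework/Versions/3.6/bin/python3.6 /Users/apple/PycharmProjects/stage4/spider/2018_3_9/02xpath.py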
b'<html><body><bookstore>\n <title>新华书店</title>\n <book href="http://www.langlang2017.com/">\n <title lang="eng">Harry Potter</title>\n <price>29.99</price>\n </book>\n <book>\n <title lang="zh">Learning XML</title>\n <price>39.95</price>\n </book>\n <book>\n <title>python大全</title>\n <price>99.95</price>\n </book>\n</bookstore>\n</body></html>'
[<Element html at 0x103ee4908>, <Element body at 0x103efe388>, <Element bookstore at 0x103efe0c8>, <Element title at 0x103ee4f88>, <Element book at 0x103ef5f48>, <Element title at 0x103ee8908>, <Element price at 0x103efe088>, <Element book at 0x103ef5f88>, <Element title at 0x103ef5ec8>, <Element price at 0x103efe108>, <Element book at 0x103ef5fc8>, <Element title at 0x103ef5f08>, <Element price at 0x103efe048>]
[<Element title at 0x103ee8908>, <Element title at 0x103ef5ec8>]

Process finished with exit code 0

猜你喜欢