"""XPath practice script: query a small bookstore document with lxml.

Running the script prints, in order: the serialized parse tree, the
<title> elements that carry at least one attribute, and the book
titles/prices selected in two different ways.

The commented-out lines are alternative queries kept for reference;
uncomment one to try it.
"""
from lxml import etree
from urllib import request
import ssl

# NOTE(review): this disables HTTPS certificate verification for the whole
# process through a private ssl hook. Nothing in this script opens a URL, so
# it looks unnecessary here -- confirm and remove if nothing else relies on it.
ssl._create_default_https_context = ssl._create_unverified_context

# Sample document used by every query below.
BOOKSTORE_MARKUP = '''
<bookstore>
    <title>新华书店</title>
    <book href="http://www.langlang2017.com/">
        <title lang="eng">Harry Potter</title>
        <price>29.99</price>
    </book>
    <book>
        <title lang="zh">Learning XML</title>
        <price>39.95</price>
    </book>
    <book>
        <title>python大全</title>
        <price>99.95</price>
    </book>
</bookstore>
'''

# etree.HTML() parses markup into an element tree and etree.tostring()
# serializes a tree back to bytes, so the two are (roughly) inverse
# operations. The HTML parser wraps the fragment in <html><body>.
root = etree.HTML(BOOKSTORE_MARKUP)
# print(root)  # e.g. <Element html at 0x104c5f048>
serialized = etree.tostring(root)
print(serialized)

# --- Basic XPath usage: '.' is the current node, '..' is its parent ---
# result = root.xpath('//bookstore')
# result = root.xpath('//book')
# book1 = result[0]
# print(book1.xpath('.'))
# print(book1.xpath('..'))
# book2 = result[1]
# print(book2.xpath('.'))
# print(book2.xpath('../book'))

# --- Get the title of the second book ---
# print(root.xpath('//title'))
books = root.xpath('//book')  # reused by the commented examples further down
book2 = books[1]
# Title of book2:
# print(book2.xpath('./title'))
# Title of book1, reached from book2 through the shared parent:
# book1 = book2.xpath('../book')[0]
# print(book1.xpath('./title')[0])

# --- The real root node is the <html> element added by the parser ---
# print(root.xpath('/html'))
# print(root.xpath('/html/body/bookstore'))

# --- From the first <title>, navigate to the title node of book2 ---
# title1 = root.xpath('//title')[0]
# print(title1)
# book1 = title1.xpath('..')[0]
# print(book1)
# book2 = book1.xpath('../book')[1]
# print(book2)
# title2 = book2.xpath('./title')[0]
# print(title2)

# --- Exploring '//' ---
# An expression that starts with '//' searches the whole document no matter
# which element xpath() is called on.
# books = root.xpath('//book')
# book1 = books[1]
# print(root.xpath('//title'))
# print(book1.xpath('//title'))

# --- '@' selects attributes; [...] filters with a predicate ---
# titles = root.xpath('//title')
# print(titles)
# titles = root.xpath('//title[@lang]')
# print(titles)
# titles = root.xpath('//title[@lang="zh"]')
# print(titles)
# books = root.xpath('//book[price<40]')
# print(books)
# titles = root.xpath('//book[price<40]/title')
# print(titles)
# books = root.xpath('//book[price=29.99]')
# print(books)

# --- Wildcards ---
# print(books[0].xpath('./*'))  # child elements: title, price
# print(books[0].xpath('@*'))   # all attribute values
# print(books[0].xpath('@href'))
# print(root.xpath('//*'))      # every element in the document

# Select every <title> that has at least one attribute.
print(root.xpath('//title[@*]'))

# '|' is the XPath union operator. The two prints below show the same
# elements: the union comes back in document order, while the concatenation
# lists all titles first and then all prices.
print(root.xpath('//book/title | //book/price'))
titles = root.xpath('//book/title')
prices = root.xpath('//book/price')
print(titles + prices)
/Library/Frameworks/Python.framework/Versions/3.6/bin/python3.6 /Users/apple/PycharmProjects/stage4/spider/2018_3_9/02xpath.py
b'<html><body><bookstore>\n <title>新华书店</title>\n <book href="http://www.langlang2017.com/">\n <title lang="eng">Harry Potter</title>\n <price>29.99</price>\n </book>\n <book>\n <title lang="zh">Learning XML</title>\n <price>39.95</price>\n </book>\n <book>\n <title>python大全</title>\n <price>99.95</price>\n </book>\n</bookstore>\n</body></html>'
[<Element html at 0x103ee4908>, <Element body at 0x103efe388>, <Element bookstore at 0x103efe0c8>, <Element title at 0x103ee4f88>, <Element book at 0x103ef5f48>, <Element title at 0x103ee8908>, <Element price at 0x103efe088>, <Element book at 0x103ef5f88>, <Element title at 0x103ef5ec8>, <Element price at 0x103efe108>, <Element book at 0x103ef5fc8>, <Element title at 0x103ef5f08>, <Element price at 0x103efe048>]
[<Element title at 0x103ee8908>, <Element title at 0x103ef5ec8>]
Process finished with exit code 0