lxml是python中一个非常强大的解析库。其中的etree更是常常用来判断网页中某一节点是否存在,并获取相应的文本或属性。
一、用法详解
1、导入etree
import requests
from lxml import etree
2、获取网页信息并生成etree选择器,即下面代码中的html
# Download the page; res.text is the response body decoded to a str.
res = requests.get('http://www.baidu.com')
# Parse the HTML text into an lxml element; the xpath() calls below run on it.
html =etree.HTML(res.text)
3、利用选择器的xpath()函数,获取相应的节点
# Text of every <a> element located inside a table nested within another table.
datalist01 = html.xpath('//table//table//a/text()')
# href attribute of every <a> element located inside a table nested within another table.
datalist02 = html.xpath('//table//table//a/@href')
注:
1) // 双斜杠 从当前节点(位于表达式开头时即从根节点)开始,选取其下所有层级中符合条件的节点,不考虑它们在文档中的位置;会对全文进行扫描,结果以列表的形式返回。
2) / 单斜杠 寻找当前标签路径的下一层路径标签或者对当前路径标签内容进行操作
3) /text() 获取当前路径下的文本内容
4) /@xxxx 提取当前路径下标签的属性值
5) | 可选符 使用|可同时选取若干个路径,结果为各路径结果的并集。如//p | //div 即选取文档中所有的p标签和所有的div标签。
6) . 点 用来选取当前节点
7) .. 双点 选取当前节点的父节点
二、爬取“我要爱死你”网站代码
import requests
import time
from lxml import etree
from xlrd import open_workbook
from xlutils.copy import copy
#乡镇------>村
def parseCun(link):
    """Fetch a township page and save the villages listed on it.

    The page is downloaded after a 3 second pause, the text of every ``<a>``
    inside a nested table is collected, and the resulting name/code lists are
    appended to the spreadsheet in a single ``saveData`` call. Nothing is
    crawled below this level.

    Args:
        link: Absolute URL of the page. Characters 33-44 of the URL are used
            as the parent code stored with every row; for URLs shaped like
            ``https://xingzhengquhua.51240.com/<code>__xingzhengquhua/`` that
            is the 12 characters that follow the host.

    Raises:
        requests.exceptions.RequestException: If the request fails or times
            out.
    """
    header = {
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.62 Safari/537.36"
    }
    # Pause before every request so the site is not hit in rapid succession.
    time.sleep(3)
    # Without a timeout a stalled connection would block the crawl forever.
    res = requests.get(link, headers=header, timeout=30)
    html = etree.HTML(res.text)
    datalist = html.xpath('//table//table//a/text()')

    sjcode = link[33:45]
    print(sjcode)

    # The first five link texts are skipped (presumably navigation/ancestor
    # links -- confirm against the page markup). After that, entries at odd
    # indexes are stored as names and entries at even indexes as codes.
    name = datalist[5::2]
    code = datalist[6::2]

    saveData(sjcode, name, code)
#区县------>乡镇
def parseQxj(link):
    """Fetch a district/county page, save each township and crawl into it.

    For every township found, one row is appended through ``saveData``, the
    crawler sleeps 3 seconds, and ``parseCun`` is called on the township URL.

    Args:
        link: Absolute URL of the page. Characters 33-44 of the URL are used
            as the parent code stored with every row; for URLs shaped like
            ``https://xingzhengquhua.51240.com/<code>__xingzhengquhua/`` that
            is the 12 characters that follow the host.

    Raises:
        requests.exceptions.RequestException: If the request fails or times
            out.
        IndexError: If the page yields fewer hrefs than link texts.
    """
    header = {
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.62 Safari/537.36"
    }
    # Without a timeout a stalled connection would block the crawl forever.
    res = requests.get(link, headers=header, timeout=30)
    html = etree.HTML(res.text)
    datalist = html.xpath('//table//table//a/text()')
    datalist2 = html.xpath('//table//table//a/@href')

    sjcode = link[33:45]
    print(sjcode)

    # The first four link texts are skipped (presumably navigation/ancestor
    # links -- confirm against the page markup). After that, entries at even
    # indexes are stored as names and entries at odd indexes as codes; the
    # child URL is built from the href at the same index as the code.
    name = datalist[4::2]
    code = datalist[5::2]
    url = [
        'https://xingzhengquhua.51240.com' + datalist2[n]
        for n in range(5, len(datalist), 2)
    ]

    # code and url always have the same length and name is never shorter,
    # so zip() visits exactly the entries the URL list defines.
    for child_name, child_code, u in zip(name, code, url):
        print(child_name)
        saveData(sjcode, [child_name], [child_code])
        time.sleep(3)
        parseCun(u)
#市州---->区县
def parseSzj(link):
    """Fetch a city/prefecture page, save each district/county and crawl into it.

    For every district/county found, one row is appended through
    ``saveData``, the crawler sleeps 3 seconds, and ``parseQxj`` is called on
    the district/county URL.

    Args:
        link: Absolute URL of the page. Characters 33-44 of the URL are used
            as the parent code stored with every row; for URLs shaped like
            ``https://xingzhengquhua.51240.com/<code>__xingzhengquhua/`` that
            is the 12 characters that follow the host.

    Raises:
        requests.exceptions.RequestException: If the request fails or times
            out.
        IndexError: If the page yields fewer hrefs than link texts, or a
            trailing name has no matching code.
    """
    header = {
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.62 Safari/537.36"
    }
    # Without a timeout a stalled connection would block the crawl forever.
    res = requests.get(link, headers=header, timeout=30)
    html = etree.HTML(res.text)
    datalist = html.xpath('//table//table//a/text()')
    datalist2 = html.xpath('//table//table//a/@href')

    sjcode = link[33:45]
    print(sjcode)

    # The first three link texts are skipped (presumably navigation/ancestor
    # links -- confirm against the page markup). After that, entries at odd
    # indexes are stored as names and entries at even indexes as codes; the
    # child URL is built from the href at the same index as the name.
    name = datalist[3::2]
    code = datalist[4::2]
    url = [
        'https://xingzhengquhua.51240.com' + datalist2[n]
        for n in range(3, len(datalist), 2)
    ]

    # Index-based pairing is kept on purpose: code can be one entry shorter
    # than url, and that mismatch should surface as an error instead of the
    # last entry being dropped silently.
    for i, u in enumerate(url):
        print(name[i])
        saveData(sjcode, [name[i]], [code[i]])
        time.sleep(3)
        parseQxj(u)
#解析省级 --->市州
def parseGzs(link):
    """Fetch a province-level page, save each city/prefecture and crawl into it.

    The list of child URLs is printed once; then, for every city/prefecture
    found, one row is appended through ``saveData`` and ``parseSzj`` is
    called on the child URL.

    Args:
        link: Absolute URL of the page. Characters 33-44 of the URL are used
            as the parent code stored with every row; for URLs shaped like
            ``https://xingzhengquhua.51240.com/<code>__xingzhengquhua/`` that
            is the 12 characters that follow the host.

    Raises:
        requests.exceptions.RequestException: If the request fails or times
            out.
        IndexError: If the page yields fewer hrefs than link texts.
    """
    header = {
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.62 Safari/537.36"
    }
    # Without a timeout a stalled connection would block the crawl forever.
    res = requests.get(link, headers=header, timeout=30)
    html = etree.HTML(res.text)
    datalist = html.xpath('//table//table//a/text()')
    datalist2 = html.xpath('//table//table//a/@href')

    sjcode = link[33:45]
    print(sjcode)

    # The first two link texts are skipped (presumably navigation/ancestor
    # links -- confirm against the page markup). After that, entries at even
    # indexes are stored as names and entries at odd indexes as codes; the
    # child URL is built from the href at the same index as the code.
    name = datalist[2::2]
    code = datalist[3::2]
    url = [
        'https://xingzhengquhua.51240.com' + datalist2[n]
        for n in range(3, len(datalist), 2)
    ]
    print(url)

    # code and url always have the same length and name is never shorter,
    # so zip() visits exactly the entries the URL list defines.
    for child_name, child_code, u in zip(name, code, url):
        print(2)
        saveData(sjcode, [child_name], [child_code])
        parseSzj(u)
#保存数据
def saveData(sjcoede, name, bjcode):
    """Append one row per entry of ``bjcode`` to ``./行政区划.xls``.

    The workbook is opened (it must already exist), copied into a writable
    workbook, extended below the last existing row of its first sheet and
    saved back to the same path.

    Args:
        sjcoede: Value written to column 0 of every appended row.
        name: Sequence indexed in step with ``bjcode``; ``name[i]`` goes to
            column 1.
        bjcode: Sequence whose items go to column 2; its length determines
            how many rows are appended.
    """
    workbook_path = "./行政区划.xls"

    source_book = open_workbook(workbook_path)
    first_free_row = source_book.sheets()[0].nrows

    # The workbook read from disk is copied before writing to it.
    writable_book = copy(source_book)
    sheet = writable_book.get_sheet(0)

    for offset, own_code in enumerate(bjcode):
        target_row = first_free_row + offset
        sheet.write(target_row, 0, sjcoede)
        sheet.write(target_row, 1, name[offset])
        sheet.write(target_row, 2, own_code)

    writable_book.save(workbook_path)
#主函数
def main(url):
    """Start the crawl by passing ``url`` to ``parseGzs``."""
    parseGzs(url)
if __name__ == '__main__':
    # Runs only when the file is executed as a script: crawl from this page.
    url = 'https://xingzhengquhua.51240.com/520000000000__xingzhengquhua/'
    main(url)