网页:
1. python Xpath
1.1 获取(extract )当前节点下的元素(element)内容
# In Chrome: open the URL, press F12 to open DevTools, select the element, right-click it and choose Copy > Copy XPath
#<strong>JavaScript 是 Web 的编程语言</strong>
# /html/body/div[4]/div/div[2]/div[2]/a[1]/strong
#<strong>HTML,即超文本标记语言(Hyper Text Markup Language)</strong>
# /html/body/div[4]/div/div[2]/div[1]/a[1]/strong
对比两者,差异仅在最后一个 div 的下标(div[2] 与 div[1]);去掉该下标得到 /html/body/div[4]/div/div[2]/div/a[1]/strong。下面的代码进一步去掉 a 的下标 [1],以匹配每个 div 下的所有链接。
python 代码:
# text() selects the text nodes of every matching <strong>; the loop below
# iterates over the result and prints each one.
buyers = tree.xpath('/html/body/div[4]/div/div[2]/div/a/strong/text()')
Print:
# Print every extracted title under its own "Buyers:" heading.
for buyer in buyers:
    print('Buyers: ' + '\n', buyer)
Buyers:
HTML,即超文本标记语言(Hyper Text Markup Language)
Buyers:
HTML5 是下一代 HTML 标准
Buyers:
层叠样式表(Cascading StyleSheet)
Buyers:
CSS3是CSS技术的升级版本
Buyers:
Bootstrap,来自 Twitter,是目前最受欢迎的前端框架
Buyers:
Bootstrap4 目前是 Bootstrap 的最新版本
Buyers:
Font Awesome 是一套绝佳的图标字体库和CSS框架。
Buyers:
Foundation 用于开发响应式的 HTML, CSS and JavaScript 框架
Buyers:
JavaScript 是 Web 的编程语言
Buyers:
HTML DOM 定义了访问和操作 HTML 文档的标准方法
.
.
Buyers:
网站建设指导课程
Buyers:
对于网站开发人员来说,浏览器信息和统计数据都是非常重要的
Buyers:
如果您希望向全世界发布自己的网站,那么您的网站就需要被放置于一个 WEB 服务器
Buyers:
TCP/IP 是因特网的通信协议
Buyers:
W3C 让每个人都能在互联网上分享资源
Buyers:
学习如何创建高质量的web网站
from lxml import html
import requests
# Download the runoob.com front page and print the text of every <strong>
# title found inside the tutorial links.
base_url = "http://www.runoob.com/"

# A timeout keeps the script from hanging forever on an unresponsive server,
# and raise_for_status() stops on an HTTP error instead of parsing an error page.
page = requests.get(base_url, timeout=10)
page.raise_for_status()
tree = html.fromstring(page.content)

# How the XPath below was obtained:
#   In Chrome, open the URL, press F12 (DevTools), select the element,
#   right-click it and choose Copy > Copy XPath. Two sample elements:
#     <strong>JavaScript 是 Web 的编程语言</strong>
#       /html/body/div[4]/div/div[2]/div[2]/a[1]/strong
#     <strong>HTML,即超文本标记语言(Hyper Text Markup Language)</strong>
#       /html/body/div[4]/div/div[2]/div[1]/a[1]/strong
#   The two paths differ only in the index of the last <div>. The query drops
#   that index, and the index on <a>, so that it matches every entry.
buyers = tree.xpath('/html/body/div[4]/div/div[2]/div/a/strong/text()')

for buyer in buyers:
    print('Buyers: ' + '\n', buyer)
1.2 获取(extract )当前节点下的子元素(element)内容
REF:https://zhuanlan.zhihu.com/p/29436838
本例介绍输出元素<a>的‘href’网络链接内容,还有其中的子元素<strong>的text文本内容。
from lxml import html,etree
import requests
from bs4 import BeautifulSoup
import re
# Parse a locally saved copy of the page and print, for every tutorial link,
# a running number, the text of its <strong> title and the link target.
file = './runoob_cainiao.html'
tree = html.parse(file)

# Every <a> element of the tutorial listing.
conts = tree.xpath('/html/body/div[4]/div/div[2]/div/a')

i = 0  # running number of the printed entries
for cont in conts:
    # Only anchors that actually carry a link target are of interest.
    if "href" not in cont.attrib:
        continue
    # './' is the current element; 'descendant::strong' matches <strong>
    # elements at any depth below it, not only direct children.
    # ref: https://zhuanlan.zhihu.com/p/29436838
    for strong in cont.xpath('./descendant::strong'):
        i += 1
        # './text()' returns the list of text nodes of this <strong>;
        # cont.get('href') reads the 'href' attribute of the enclosing <a>.
        # (cont.getchildren() would list all direct child elements of <a>.)
        print(i, '\t', strong.xpath('./text()'), '\n\t', cont.get('href'))
"""
output:
1 ['HTML,即超文本标记语言(Hyper Text Markup Language)']
http://www.runoob.com/html/html-tutorial.html
2 ['HTML5 是下一代 HTML 标准']
http://www.runoob.com/html/html5-intro.html
3 ['层叠样式表(Cascading StyleSheet)']
http://www.runoob.com/css/css-tutorial.html
...
88 ['TCP/IP 是因特网的通信协议']
http://www.runoob.com/tcpip/tcpip-tutorial.html
89 ['W3C 让每个人都能在互联网上分享资源']
http://www.runoob.com/w3c/w3c-tutorial.html
90 ['学习如何创建高质量的web网站']
http://www.runoob.com/quality/quality-tutorial.html
"""
1.3 使用openpyxl 生成xlsx Excel文件,并保存extract的内容
from lxml import html,etree
import requests
from bs4 import BeautifulSoup
import re #正则
import xlwt
import xlrd
from openpyxl import Workbook
from openpyxl import load_workbook
import datetime
from openpyxl.utils import get_column_letter
#---------------------------openpyxl xlsx-------------------------------------------
# Write one row per tutorial link (title text, link target) into an .xlsx
# workbook, below a "name" / "link" header row.
wb = Workbook()
ws = wb.active
dest_filename = 'empty_book.xlsx'
ws.title = "range names"

# Header first; each ws.append() call adds the next row below the last one.
ws.append(["name", "link"])

# NOTE: `tree` is not created in this snippet; it must already hold the parsed
# page (the previous example builds it with html.parse()).
conts = tree.xpath('/html/body/div[4]/div/div[2]/div/a')

for cont in conts:
    # Skip anchors without a link target.
    if "href" not in cont.attrib:
        continue
    link = cont.get('href')
    # './' is the current element; 'descendant::strong' matches <strong>
    # elements at any depth below it.
    # ref: https://zhuanlan.zhihu.com/p/29436838
    for strong in cont.xpath('./descendant::strong'):
        # xpath('./text()') returns a list of text nodes. openpyxl cannot store
        # a list, and formatting the list would write its repr ("['...']") into
        # the cell, so the nodes are joined into the plain title text instead.
        name = "".join(strong.xpath('./text()'))
        ws.append([name, link])

wb.save(filename=dest_filename)
#------Note----
# https://openpyxl.readthedocs.io/en/stable/usage.html#write-a-workbook
# title_1 comes from xpath('./text()'), which returns a list, not a string.
# Assigning the list directly, e.g.
#     ws.cell(row=i, column=j).value = title_1
# makes openpyxl raise:
#     raise ValueError("Cannot convert {0!r} to Excel".format(value))
# Converting it to a string first, e.g.
#     ws.cell(row=i, column=j, value="{0}".format(title_1))
# avoids this error.
#---------------------------openpyxl xlsx end-------------------------------------------