版权声明:本文为博主原创文章,转载请注明出处 https://blog.csdn.net/chenlnehc/article/details/78781432
Go语言解析Html
思想来源:BeautifulSoup4
原则:简单、快、省内存
特点:自造轮子随心用,不规则html照样干
结构体及其接口定义
package bs
// SelFunc is the selection API implemented by both Soup and Node.
type SelFunc interface {
Sel(tag string, attrs *map[string]string) (nodes []*Node) // the only method users need; the rest are shortcuts
SelById(id string) []*Node // shortcut for Sel("", {"id": id})
SelByTag(tag string) []*Node // shortcut for Sel(tag, nil)
SelByClass(class string) []*Node // shortcut for Sel("", {"class": class})
}
type Node struct { // one parsed HTML element
Tag string // tag name, e.g. "a"
Attrs *map[string]string // attributes parsed from the opening tag
Value string // text between open and close tags, markup stripped
Sons []*Node // direct children, in document order
is bool // set once this node has been fully parsed
start bool // true for an opening tag, false for a closing one
}
type Soup struct { // parser state for one HTML document
html string // the raw HTML text
nodes []*Node // flat list of all open/close tags, in document order
index []int // byte offset of each tag's '<' in html (parallel to nodes)
}
解析步骤(核心)
1. 初始化 `Soup`:此时生成 Html 文档的各个节点列表以及节点位置的记录表
2. 用户调用 `Sel` 方法,传入解析规则(标签名、标签属性限制等)
3.解析用户请求并返回子节点指针(指针省内存)
代码:
package bs
import (
"container/list"
"fmt"
"regexp"
"strings"
)
// Package-level scanning patterns, compiled once.
var (
	// regTag matches a complete open or close tag. The character after
	// '<' must be a letter or '/'. NOTE: the original class [a-z|A-Z|/]
	// contained literal '|' characters (a common regex mistake), so
	// "<|x>" was treated as a tag; [^>]* also lets a tag span lines,
	// which the original ".*?" could not.
	regTag = regexp.MustCompile(`<[a-zA-Z/][^>]*>`)
	// regAttrs captures name="value" pairs. Names start with a letter
	// and may contain digits, '-' and '_' (so data-id="42" captures the
	// whole name); whitespace around '=' is tolerated.
	regAttrs = regexp.MustCompile(`([a-zA-Z][a-zA-Z0-9_-]*)\s*=\s*"(.*?)"`)
	// DEBUG enables diagnostic output via out().
	DEBUG = false
)
// out prints s to stdout, but only when the package-level DEBUG flag is on.
func out(s string) {
	if !DEBUG {
		return
	}
	fmt.Println(s)
}
// Init builds a Soup from raw HTML text, scanning and indexing every tag.
func Init(html string) *Soup {
	soup := &Soup{}
	soup.setHtml(html)
	return soup
}
// setHtml stores the raw HTML and performs the one-time scan that builds
// the flat node list: every paired open/close tag becomes a *Node, and the
// byte offset of its '<' is recorded in self.index so parse() can later
// slice out the text between matching tags.
func (self *Soup) setHtml(text string) {
	self.html = text
	// Void elements never have a closing tag; pairing them would corrupt
	// the open/close stack in parse(), so their open tags are skipped.
	// (Exact-name check: the original substring test Contains(s, "<img")
	// also wrongly skipped tags such as <imgx ...>.)
	void := map[string]bool{
		"br": true, "img": true, "hr": true, "input": true,
		"meta": true, "link": true,
	}
	// -1: scan every tag; the original capped the scan at 100000 matches.
	for _, span := range regTag.FindAllStringIndex(self.html, -1) {
		s := self.html[span[0]:span[1]]
		if strings.Contains(s, "/>") { // explicitly self-closed
			continue
		}
		var nd Node
		if strings.HasPrefix(s, "</") { // closing tag
			nd.Tag = s[2 : len(s)-1]
			nd.start = false
		} else { // opening tag: first whitespace-delimited token after '<'
			inner := strings.TrimSuffix(s[1:], ">")
			nd.Tag = strings.Fields(inner)[0] // Fields handles newlines inside multi-line tags
			nd.start = true
			if void[strings.ToLower(nd.Tag)] {
				continue
			}
		}
		attrs := make(map[string]string)
		// -1: capture every attribute; the original stopped after 10.
		for _, a := range regAttrs.FindAllStringSubmatch(s, -1) {
			if len(a) == 3 {
				attrs[a[1]] = a[2]
			}
		}
		nd.Attrs = &attrs
		nd.is = false
		self.nodes = append(self.nodes, &nd)
		self.index = append(self.index, span[0]) // only the start offset is needed
	}
}
// right reports whether every key/value pair in attrs is also present,
// with an equal value, in cur (i.e. cur is a superset of attrs).
func right(cur *map[string]string, attrs *map[string]string) bool {
	have := *cur
	for key, want := range *attrs {
		if have[key] != want {
			return false
		}
	}
	return true
}
// trim reports whether c is filler whitespace that should be stripped
// from the ends of a node's Value (used with strings.TrimFunc).
func trim(c rune) bool {
	switch c {
	case '\n', '\t', ' ':
		return true
	default:
		return false
	}
}
// parse resolves the node at position cur in place: it walks forward
// through the flat node list keeping a stack of currently-open tags,
// records parent->child links in Sons, and when a tag's matching close is
// found stores the markup-stripped text between the two tags as its Value
// and marks it parsed. If the document ends before the root's close tag,
// parse returns early and the node stays unparsed (is == false).
func (self *Soup) parse(cur int) { // parse the node at position cur
	if self.nodes[cur].is || !self.nodes[cur].start { // already parsed, or a closing tag
		out("已经解析/结束节点")
		return
	}
	leng := len(self.index)
	nds := list.New() // stack of open-node positions (list.List used as a stack)
	nds.PushBack(cur) // push the root's position
	for cur < leng { // walk forward looking for the matching close tags
		cur++
		if cur >= leng {
			return // ran off the end: unterminated tag, leave node unparsed
		}
		tp := nds.Back() // innermost still-open node
		iv := tp.Value.(int)
		if self.nodes[cur].start { // an opening tag:
			// it is a child of the innermost open node; push it
			self.nodes[iv].Sons = append(self.nodes[iv].Sons, self.nodes[cur])
			nds.PushBack(cur)
		} else if self.nodes[iv].Tag == self.nodes[cur].Tag { // matching close tag: pop
			// Value = text between the two tags, markup stripped and trimmed
			self.nodes[iv].Value = strings.TrimFunc(regTag.ReplaceAllString(self.html[self.index[iv]:self.index[cur]], ""), trim)
			// mark it as fully parsed
			self.nodes[iv].is = true
			nds.Remove(tp)
		}
		if nds.Len() == 0 { // root was closed: subtree fully parsed
			break
		}
	}
}
// Sel returns every node whose tag equals tag (empty string matches any
// tag) and whose attributes contain all pairs in attrs (nil matches any
// attributes). Each matching node is parsed on demand before returning.
func (self *Soup) Sel(tag string, attrs *map[string]string) (nodes []*Node) {
	for i, n := range self.nodes {
		if tag != "" && n.Tag != tag { // tag filter
			continue
		}
		if attrs != nil && !right(n.Attrs, attrs) { // attribute filter
			continue
		}
		nodes = append(nodes, n)
		self.parse(i) // resolve this node and its subtree
	}
	return
}
// itool recursively collects the descendants of n that satisfy the
// tag/attrs filter, appending matches to *nodes in depth-first order.
func itool(n *Node, tag string, attrs *map[string]string, nodes *[]*Node) {
	for _, son := range n.Sons {
		tagOK := tag == "" || son.Tag == tag
		attrOK := attrs == nil || right(son.Attrs, attrs)
		if tagOK && attrOK {
			*nodes = append(*nodes, son)
		}
		itool(son, tag, attrs, nodes)
	}
}
// Sel filters the subtree rooted at this node; the subtree was already
// parsed when the node was returned by Soup.Sel, so no parsing happens here.
func (self *Node) Sel(tag string, attrs *map[string]string) (nodes []*Node) {
	itool(self, tag, attrs, &nodes)
	return nodes
}
// SelById selects nodes whose id attribute equals id.
func (self *Soup) SelById(id string) []*Node {
	want := map[string]string{"id": id}
	return self.Sel("", &want)
}
// SelByTag selects nodes by tag name alone, ignoring attributes.
func (self *Soup) SelByTag(tag string) []*Node {
	return self.Sel(tag, nil)
}
// SelByClass selects nodes whose class attribute equals class.
func (self *Soup) SelByClass(class string) []*Node {
	want := map[string]string{"class": class}
	return self.Sel("", &want)
}
// SelById selects descendants whose id attribute equals id.
func (self *Node) SelById(id string) []*Node {
	want := map[string]string{"id": id}
	return self.Sel("", &want)
}
// SelByTag selects descendants by tag name alone, ignoring attributes.
func (self *Node) SelByTag(tag string) []*Node {
	return self.Sel(tag, nil)
}
// SelByClass selects descendants whose class attribute equals class.
func (self *Node) SelByClass(class string) []*Node {
	want := map[string]string{"class": class}
	return self.Sel("", &want)
}
示例
package main
import (
"fmt"
"myspider/bs"
)
// Sample document exercising nested tags, repeated tags, attributes,
// and a deliberately unclosed <li id="1"> / unterminated <ul> section.
var html = `
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story" id="sp">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.
<b>nothing in here</b>
</p>
<p class="story">...</p>
<ul class="story" id="0">
<li class="t" id="1">
<li class="t" id="2">asdf</li>
</li>
<li class="t" id="3">2</li>
<li class="t" id="4">3</li>
</ul>
`

// soup is built once at package init; selections below share it.
var soup = bs.Init(html)
func t1() {
// by tag
fmt.Println("By Tag........................")
for _, j := range soup.Sel("a", nil) {
fmt.Println("Tag:", j.Tag)
fmt.Println("Attrs:", *j.Attrs)
fmt.Println("Value:", j.Value)
}
// by attrs
fmt.Println("By Attrs........................")
for _, j := range soup.Sel("", &map[string]string{"class": "story"}) {
fmt.Println("Tag:", j.Tag)
fmt.Println("Attrs:", *j.Attrs)
fmt.Println("Value:", j.Value)
}
// by tag and attrs
fmt.Println("By Tag And Attrs........................")
for _, j := range soup.Sel("p", &map[string]string{"class": "story"}) {
fmt.Println("Tag:", j.Tag)
fmt.Println("Attrs:", *j.Attrs)
fmt.Println("Value:", j.Value)
}
// more
fmt.Println("More.......................................")
for _, j := range soup.Sel("", &map[string]string{"id": "sp"}) {
for _, a := range j.Sel("a", nil) {
fmt.Println("Tag:", a.Tag)
fmt.Println("Attrs:", *a.Attrs)
fmt.Println("Value:", a.Value)
}
}
// Detail
fmt.Println("Soup Details....................................")
for _, j := range soup.SelById("sp") {
fmt.Println("Tag:", j.Tag)
// fmt.Println("Attrs:", *j.Attrs)
// fmt.Println("Value:", j.Value)
}
for _, j := range soup.SelByClass("sister") {
fmt.Println("Tag:", j.Tag)
fmt.Println("Attrs:", *j.Attrs)
// fmt.Println("Attrs:", *j.Attrs)
// fmt.Println("Value:", j.Value)
}
for _, j := range soup.SelByTag("title") {
fmt.Println("Tag:", j.Tag)
fmt.Println("Attrs:", *j.Attrs)
// fmt.Println("Attrs:", *j.Attrs)
// fmt.Println("Value:", j.Value)
}
fmt.Println("Node Details....................................")
note := soup.SelById("sp")[0]
for _, j := range note.SelByClass("sister") {
fmt.Println("Tag:", j.Tag)
fmt.Println("Attrs:", *j.Attrs)
// fmt.Println("Attrs:", *j.Attrs)
// fmt.Println("Value:", j.Value)
}
for _, j := range note.SelById("link3") {
fmt.Println("Tag:", j.Tag)
fmt.Println("Attrs:", *j.Attrs)
// fmt.Println("Attrs:", *j.Attrs)
// fmt.Println("Value:", j.Value)
}
for _, j := range note.SelByTag("a") {
fmt.Println("Tag:", j.Tag)
fmt.Println("Attrs:", *j.Attrs)
// fmt.Println("Attrs:", *j.Attrs)
// fmt.Println("Value:", j.Value)
}
}
// t2 prints the Value of each direct child of the first <ul>.
func t2() {
	ul := soup.SelByTag("ul")[0]
	for _, son := range ul.Sons {
		fmt.Println(son.Value)
	}
}
// t3 prints the Tag of each direct child of the node with id "sp".
func t3() {
	sp := soup.SelById("sp")[0]
	for _, son := range sp.Sons {
		fmt.Println(son.Tag)
	}
}
// t4 prints the Value of every <li> descendant of the first <ul>.
func t4() {
	ul := soup.SelByTag("ul")[0]
	for _, li := range ul.SelByTag("li") {
		fmt.Println(li.Value)
	}
}
// main runs the full selection demo; t2-t4 are kept for manual testing.
func main() {
t1()
}
至于稳定性,示例的html片段都能解析还有什么不能解析的?