8.6 并发的web爬虫

知识点

1. 使用 BFS（广度优先搜索）算法来抓取整个网站
2.每一个彼此独立的抓取命令可以并行进行IO，最大化利用网络资源

代码

// test_web_crawler runs the concurrent web crawler example of section 8.6.
//
// The example evolved in three stages, each driven through crawl_one:
//   1. an initial version;
//   2. a version that bounds the number of concurrent fetches, but which
//      never terminated, even after every link derived from the initial
//      links had been crawled;
//   3. a version that lets the concurrent program terminate.
//
// Only the third stage is executed.
func test_web_crawler() {
    // Stage 3: concurrent crawl that is able to terminate.
    crawl_one()
}
/*
    练习 8.6： 为并发爬虫增加深度限制。
    也就是说，如果用户设置了depth=3，
    那么只有从首页跳转三次以内能够跳到的页面才能被抓取到。
*/
// depths is the maximum link depth the crawler follows (exercise 8.6).
// Seed URLs sit at depth depthFirst and a link found on a depth-d page is at
// depth d+1. Links deeper than depths are never fetched.
var depths int64 = 3

// depthFirst is the depth assigned to the seed URLs. The crawler only reads
// it: depth is carried along with each batch of links instead of being
// counted in a variable shared (and raced on) by every crawling goroutine.
var depthFirst int64 = 0

// tokens is a counting semaphore that caps the number of concurrent
// web_Extract calls made through web_crawl_two at 20.
var tokens = make(chan struct{}, 20)

// web_crawl_two prints url, fetches it while holding a token from tokens, and
// returns the links found on that page. An error from web_Extract is logged
// and yields a nil slice.
func web_crawl_two(url string) []string {
    fmt.Println(url)

    tokens <- struct{}{} // acquire a token
    // Register the release immediately so the token is returned on every
    // exit path, including a panic inside web_Extract.
    defer func() { <-tokens }()

    list, err := web_Extract(url)
    if err != nil {
        log.Print(err)
    }
    return list
}

// crawl_one crawls the web breadth-first and concurrently, starting from a
// fixed list of seed URLs, and returns once no crawl is pending.
//
// Each distinct link is crawled at most once, in its own goroutine, via
// web_crawl_two. Links are followed only while their depth does not exceed
// depths, so with depths == 3 exactly the pages reachable within three hops
// of a seed are fetched.
func crawl_one() {
    // batch is a group of links that were all discovered at the same depth.
    type batch struct {
        links []string
        depth int64
    }

    worklist := make(chan batch)
    maxDepth := depths

    // n is the number of sends to worklist that are still pending. It starts
    // at 1 for the seed batch and is what allows the main loop to terminate.
    n := 1

    // Send the seed batch from a separate goroutine because worklist is
    // unbuffered and the receiver is the loop below.
    seeds := batch{
        links: []string{
            "http://gopl.io/",
            "http://gopl.io/",
            "https://golang.org/help/",
            "https://golang.org/doc/",
            "https://golang.org/blog/"},
        depth: depthFirst,
    }
    go func() {
        worklist <- seeds
    }()

    // seen is touched only by this goroutine, so it needs no locking.
    seen := make(map[string]bool)

    for ; n > 0; n-- {
        b := <-worklist
        if b.depth > maxDepth {
            // Beyond the depth limit: drop these links without fetching.
            continue
        }
        for _, link := range b.links {
            if seen[link] {
                continue
            }
            seen[link] = true
            n++
            // link and depth are passed as arguments so that every goroutine
            // works on its own copies.
            go func(link string, depth int64) {
                // web_crawl_one is the variant without the concurrency limit.
                worklist <- batch{links: web_crawl_two(link), depth: depth + 1}
            }(link, b.depth)
        }
    }
}
// web_crawl_one prints url and returns the links that web_Extract finds on
// that page. It applies no limit on concurrency. An extraction error is
// logged, and whatever slice web_Extract returned is passed through.
func web_crawl_one(url string) []string {
    fmt.Println(url)

    links, extractErr := web_Extract(url)
    if extractErr != nil {
        log.Print(extractErr)
    }

    return links
}
// web_Extract issues an HTTP GET for url, parses the response body as HTML,
// and returns the href value of every <a> element, resolved against the URL
// of the request that produced the response.
//
// It returns an error when the request fails, when the status is not 200 OK,
// or when the body cannot be parsed as HTML. href values that cannot be
// parsed as URLs are skipped.
func web_Extract(url string) ([]string, error) {
    resp, err := http.Get(url)
    if err != nil {
        return nil, err
    }

    if resp.StatusCode != http.StatusOK {
        resp.Body.Close()
        return nil, fmt.Errorf("getting %s: %s", url, resp.Status)
    }

    // The body is closed as soon as parsing finishes, before the tree walk.
    doc, parseErr := html.Parse(resp.Body)
    resp.Body.Close()
    if parseErr != nil {
        return nil, fmt.Errorf("parsing %s as HTML: %v", url, parseErr)
    }

    var links []string

    // collectAnchors appends the resolved href of node if it is an <a> element.
    collectAnchors := func(node *html.Node) {
        if node.Type != html.ElementNode || node.Data != "a" {
            return
        }
        for _, attr := range node.Attr {
            if attr.Key == "href" {
                resolved, hrefErr := resp.Request.URL.Parse(attr.Val)
                if hrefErr == nil { // bad URLs are ignored
                    links = append(links, resolved.String())
                }
            }
        }
    }

    web_forEachNode(doc, collectAnchors, nil)
    return links, nil
}
// web_forEachNode walks the tree rooted at n. For every node it calls pre
// before visiting the node's children and post after visiting them. Either
// callback may be nil, in which case that call is skipped.
func web_forEachNode(n *html.Node, pre, post func(n *html.Node)) {
    if pre != nil {
        pre(n)
    }

    child := n.FirstChild
    for child != nil {
        web_forEachNode(child, pre, post)
        // Advance only after the subtree has been visited.
        child = child.NextSibling
    }

    if post != nil {
        post(n)
    }
}

——不足之处，欢迎补充——

`备注`

《Go 语言圣经》

学习记录所使用的 Go 版本是 1.8
学习记录所使用的开发工具（IDE）为 GoLand
学习记录所使用的系统环境为 macOS
学习者有一定的C语言基础

代码仓库

Go 语言圣经 8.6 并发的web爬虫

8.6 并发的web爬虫

知识点

代码

——不足之处，欢迎补充——

`备注`

猜你喜欢