Notes --- A Go crawler for scraping 14-inch laptop desktop wallpapers
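
The code below is a small concurrent crawler. Page-crawling goroutines fetch wallpaper list pages from www.bizhizu.cn, extract image URLs with a regular expression and push them into a channel; download goroutines read URLs from that channel and save each image to a local folder; a monitoring goroutine counts finished page tasks and closes the channel once every page has been crawled.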

package main

import (
	"fmt"
	"io/ioutil"
	"net/http"
	"regexp"
	"strconv"
	"strings"
	"sync"
	"time"
)

// HandleError prints the error along with a note about where it happened.
func HandleError(err error, why string) {
	if err != nil {
		fmt.Println(why, err)
	}
}

// DownloadFile downloads the image at url and saves it under the given filename.
// It returns true on success and false on any error.
func DownloadFile(url, filename string) (ok bool) {
	resp, err := http.Get(url)
	HandleError(err, "http.Get url")
	if err != nil {
		return false
	}
	defer resp.Body.Close()
	bytes, err := ioutil.ReadAll(resp.Body)
	HandleError(err, "resp.Body")
	if err != nil {
		return false
	}
	// The target directory must already exist.
	filename = "D:/codeTestForStudy/Go/reptile/img/" + filename
	// Write the data out to disk.
	err = ioutil.WriteFile(filename, bytes, 0666)
	if err != nil {
		return false
	}
	return true
}

var (
	// Data channel holding the image URLs waiting to be downloaded.
	chanImagUrls chan string
	waitGroup    sync.WaitGroup
	// Channel used by the monitoring goroutine to count finished page tasks.
	chanTask chan string
	// Regular expression matching image URLs in the page HTML.
	reImg = `https?://[^"]+?(\.((jpg)|(png)|(jpeg)|(gif)|(bmp)))`
)

// DownloadImg is the consumer goroutine: it keeps reading image URLs from the
// channel and downloads each one until the channel is closed.
func DownloadImg() {
	for url := range chanImagUrls {
		filename := GetFilenameFromUrl(url)
		ok := DownloadFile(url, filename)
		if ok {
			fmt.Printf("%s downloaded successfully\n", filename)
		} else {
			fmt.Printf("%s download failed\n", filename)
		}
	}
	waitGroup.Done()
}

// GetFilenameFromUrl derives a local filename from the image URL.
func GetFilenameFromUrl(url string) (filename string) {
	// Find the position of the last "/".
	lastIndex := strings.LastIndex(url, "/")
	// Slice out the name after it.
	filename = url[lastIndex+1:]
	// Prefix a timestamp so duplicate names do not overwrite each other.
	timePrefix := strconv.Itoa(int(time.Now().UnixNano()))
	filename = timePrefix + "_" + filename
	// Attempt to turn the filename into a Chinese title (left unfinished):
	//resp, err := http.Get(url)
	//HandleError(err,"http.Get(url)")
	//
	return
}

// CheckOk is the monitoring goroutine: it counts finished page tasks and
// closes the image URL channel once all of them have reported in.
func CheckOk() {
	var count int
	for {
		url := <-chanTask
		fmt.Printf("%s crawl task finished\n", url)
		count++
		// 26 matches the number of page-crawling goroutines started in main.
		if count == 26 {
			close(chanImagUrls)
			break
		}
	}
	waitGroup.Done()
}

// GetImgUrls crawls one list page (url is the link to the whole page) and
// pushes every image link it finds into the data channel.
func GetImgUrls(url string) {
	urls := GetImgs(url)
	// Walk the slice of links and push each one into the data channel.
	for _, url := range urls {
		chanImagUrls <- url
	}
	// Report that this goroutine has finished: one message per completed task,
	// so the monitoring goroutine knows how many tasks are done.
	chanTask <- url
	waitGroup.Done()
}

// GetImgs returns all image links found on the given page.
func GetImgs(url string) (urls []string) {
	pageStr := GetPageStr(url)
	re := regexp.MustCompile(reImg)
	results := re.FindAllStringSubmatch(pageStr, -1)
	fmt.Printf("found %d results in total\n", len(results))
	for _, result := range results {
		// result[0] is the full match, i.e. the complete image URL.
		url := result[0]
		urls = append(urls, url)
	}
	return
}

// GetPageStr fetches the page at url and returns its HTML as a string.
func GetPageStr(url string) (pageStr string) {
	resp, err := http.Get(url)
	HandleError(err, "http.Get url")
	if err != nil {
		return ""
	}
	defer resp.Body.Close()
	// Read the page content.
	pageBytes, err := ioutil.ReadAll(resp.Body)
	HandleError(err, "ioutil.ReadAll")
	// Convert the bytes to a string.
	pageStr = string(pageBytes)
	return
}

func main() {
	// 1. Initialize the channels.
	chanImagUrls = make(chan string, 1000000)
	chanTask = make(chan string, 26)
	// 2. Crawler goroutines: 26 list pages (i = 10, 20, ..., 260), one goroutine per page.
	for i := 10; i < 270; i = i + 10 {
		waitGroup.Add(1)
		go GetImgUrls("https://www.bizhizu.cn/bizhi/" + strconv.Itoa(i) + ".html")
	}
	// 3. Monitoring goroutine: checks whether all 26 page tasks are done and then closes the channel.
	waitGroup.Add(1)
	go CheckOk()
	// 4. Download goroutines: read links from the channel and download them.
	for i := 0; i < 5; i++ {
		waitGroup.Add(1)
		go DownloadImg()
	}
	waitGroup.Wait()
}
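
For reference, here is a minimal standalone sketch of the URL-extraction step in isolation: it runs the same reImg pattern against a small hand-written HTML fragment. The fragment and the image URLs inside it are made up for illustration and are not taken from the target site.

package main

import (
	"fmt"
	"regexp"
)

func main() {
	// Same pattern as reImg in the crawler above.
	re := regexp.MustCompile(`https?://[^"]+?(\.((jpg)|(png)|(jpeg)|(gif)|(bmp)))`)
	// Hypothetical HTML fragment, only used to demonstrate the match.
	page := `<img src="https://example.com/a/wallpaper_1920x1080.jpg">
<a href="https://example.com/page.html">next</a>
<img src="https://example.com/b/cover.png">`
	for _, m := range re.FindAllStringSubmatch(page, -1) {
		// m[0] is the full match, i.e. the complete image URL.
		fmt.Println(m[0])
	}
}

Running this prints only the two image URLs; the plain page link is skipped because it does not end in one of the listed image extensions.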


Reprinted from blog.csdn.net/weixin_52025712/article/details/121577896