package main
import (
"fmt"
"io/ioutil"
"net/http"
"regexp"
"strconv"
"strings"
"sync"
"time"
)
// HandleError prints why followed by err to standard output when err is
// non-nil. A nil err is ignored. It only reports the error; it does not stop
// the caller.
func HandleError(err error, why string) {
	if err == nil {
		return
	}
	fmt.Println(why, err)
}
// DownloadFile fetches url over HTTP and writes the response body to a file
// called filename inside the hard-coded image directory.
//
// It returns true only when the request succeeded with status 200, the body
// was read completely, and the file was written. Every failure is reported
// through HandleError and results in false.
func DownloadFile(url, filename string) (ok bool) {
	resp, err := http.Get(url)
	if err != nil {
		// resp is nil when Get fails, so it must not be dereferenced.
		HandleError(err, "http.get.url")
		return false
	}
	defer resp.Body.Close()

	// An error page must not be stored as if it were the image.
	if resp.StatusCode != http.StatusOK {
		HandleError(fmt.Errorf("unexpected status %s", resp.Status), "http.get.url")
		return false
	}

	data, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		HandleError(err, "resp.body")
		return false
	}

	filename = "D:/codeTestForStudy/Go/reptile/img/" + filename
	// Write the data out; the result of the write decides success.
	if err := ioutil.WriteFile(filename, data, 0666); err != nil {
		HandleError(err, "ioutil.WriteFile")
		return false
	}
	return true
}
// Package-level state shared by the crawler, monitor and download goroutines.
var (
// chanImagUrls is the data channel holding image links: GetImgUrls sends
// links into it and DownloadImg receives from it until it is closed.
chanImagUrls chan string
// waitGroup tracks the goroutines started by main; each one calls Done
// when it finishes and main blocks on Wait.
waitGroup sync.WaitGroup
// chanTask is used to monitor the crawler goroutines: GetImgUrls sends its
// page URL here when it is done and CheckOk counts the received values.
chanTask chan string
// reImg is the regular-expression source used to find image links: an
// http or https URL containing no double quote and ending in .jpg, .png,
// .jpeg, .gif or .bmp.
reImg = `https?://[^"]+?(\.((jpg)|(png)|(jpeg)|(gif)|(bmp)))`
)
// DownloadImg is a download worker. It receives image links from chanImagUrls
// until that channel is closed, downloads each one under a name derived from
// the link, and prints whether the download succeeded. When the channel is
// drained it marks itself finished on waitGroup.
func DownloadImg() {
	for link := range chanImagUrls {
		name := GetFilenameFromUrl(link)
		if !DownloadFile(link, name) {
			fmt.Printf("%s 下载失败\n", name)
			continue
		}
		fmt.Printf("%s 下载成功\n", name)
	}
	waitGroup.Done()
}
// GetFilenameFromUrl derives a local file name from url: the text after the
// last "/" (the whole string when there is no "/"), prefixed with the current
// Unix time in nanoseconds and an underscore so that images sharing a base
// name do not overwrite each other.
func GetFilenameFromUrl(url string) (filename string) {
	// LastIndex returns -1 when there is no "/", which makes the slice start at 0.
	base := url[strings.LastIndex(url, "/")+1:]
	// Format the int64 directly; converting to int first would truncate the
	// timestamp on platforms where int is 32 bits wide.
	stamp := strconv.FormatInt(time.Now().UnixNano(), 10)
	return stamp + "_" + base
}
// CheckOk is the task-counting goroutine. It receives 26 page URLs from
// chanTask, printing one line per completed crawl task, then closes
// chanImagUrls and marks itself finished on waitGroup.
func CheckOk() {
	for finished := 0; finished < 26; finished++ {
		page := <-chanTask
		fmt.Printf("%s 完成爬取任务\n", page)
	}
	close(chanImagUrls)
	waitGroup.Done()
}
// GetImgUrls crawls one page: url is the link of the whole page. Every image
// link found by GetImgs is sent to chanImagUrls; afterwards the page URL is
// sent to chanTask so the monitoring goroutine can count the finished task,
// and the goroutine marks itself finished on waitGroup.
func GetImgUrls(url string) {
	for _, link := range GetImgs(url) {
		chanImagUrls <- link
	}
	// One value per completed page lets the monitor know how many are done.
	chanTask <- url
	waitGroup.Done()
}
//GetImgs 获取当前页面图片链接
func GetImgs(url string) (urls []string) {
pageStr := GetPageStr(url)
re := regexp.MustCompile(reImg)
results := re.FindAllStringSubmatch(pageStr, -1)
fmt.Printf("共找到 %d 条结果\n",len(results))
for _, result := range results {
url := result[0]
urls = append(urls,url)
}
return
}
// GetPageStr downloads the page at url and returns its body as a string.
//
// When the request itself fails the error is reported through HandleError and
// the empty string is returned. A read error is reported as well, but whatever
// part of the body was read is still returned.
func GetPageStr(url string) (pageStr string) {
	resp, err := http.Get(url)
	if err != nil {
		// resp is nil when Get fails, so it must not be dereferenced.
		HandleError(err, "http.Get url")
		return ""
	}
	defer resp.Body.Close()

	// Read the page content.
	pageBytes, err := ioutil.ReadAll(resp.Body)
	HandleError(err, "ioutil.ReadAll")

	// Convert the bytes to a string.
	return string(pageBytes)
}
// main wires the crawler together: it creates the channels, starts one crawler
// goroutine per listing page, one monitoring goroutine and five download
// workers, then waits for all of them to finish.
func main() {
	// 1. Initialise the channels.
	chanImagUrls = make(chan string, 1000000)
	chanTask = make(chan string, 26)

	// 2. Crawler goroutines: pages 10.html, 20.html, ..., 260.html.
	for page := 1; page <= 26; page++ {
		waitGroup.Add(1)
		go GetImgUrls("https://www.bizhizu.cn/bizhi/" + strconv.Itoa(page*10) + ".html")
	}

	// 3. Monitoring goroutine: once all 26 crawlers have reported, it closes
	// the link channel.
	waitGroup.Add(1)
	go CheckOk()

	// 4. Download goroutines: read links from the channel and download them.
	for worker := 0; worker < 5; worker++ {
		waitGroup.Add(1)
		go DownloadImg()
	}

	waitGroup.Wait()
}
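// Notes: a Go crawler that scrapes desktop wallpapers for 14-inch laptops.
// You may also like
// Reposted from blog.csdn.net/weixin_52025712/article/details/121577896
// Today's recommendations
// Weekly ranking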