爬虫目标:游民星空网站内各类搞笑图集
爬虫思路:
- 找到网页链接,分析网页分页格式。比如https://www.gamersky.com/ent/202003/1268257.shtml,该图集具有30页,第二页的格式为https://www.gamersky.com/ent/202003/1268257_2.shtml,所以理论上知道了首页,剩下的用函数拼起来就行;
- 对每一个网页进行html内容爬取,存放为一个大字符串;
- 接着对整个html进行正则表达式匹配,比如该图集中全部为动态图,那么正则表达式的格式为 [a-zA-Z]+://[^\s]*gif;
- 将所有匹配结果进行去重操作,放到一个字符串数组中等待下载;
- 对数组中每一个链接进行http下载即可。
目前爬了几个G之后并没有发现反爬虫机制,所以算是爬虫初级实战。
源代码:
package main
import (
"bufio"
"fmt"
"io"
"io/ioutil"
"net/http"
"os"
"path"
"regexp"
"strconv"
"time"
)
// RemoveRepeatedElement returns arr with duplicate strings removed.
//
// Behavior matches the original O(n²) pairwise scan exactly: for each
// duplicated value only its LAST occurrence is kept, and the surviving
// elements retain their relative order. This version runs in O(n) using
// two passes over a count map instead of comparing every pair.
func RemoveRepeatedElement(arr []string) (newArr []string) {
	newArr = make([]string, 0, len(arr))
	// Pass 1: count how many times each value appears.
	counts := make(map[string]int, len(arr))
	for _, s := range arr {
		counts[s]++
	}
	// Pass 2: an element is kept only when no copies remain after it,
	// i.e. this is its last occurrence (same rule as the original code).
	for _, s := range arr {
		counts[s]--
		if counts[s] == 0 {
			newArr = append(newArr, s)
		}
	}
	return newArr
}
// DownloadIMG fetches the resource at url via HTTP GET and saves it under
// the hard-coded savepath directory, using the last path segment of the
// URL as the file name. Request errors and non-200 responses abort the
// download with a message on stderr; a failure to create the local file
// still panics, as in the original design.
func DownloadIMG(url string) {
	savepath := "/Users/guoyutong/Desktop/Go/Spider/download/download2/"
	fileName := path.Base(url)
	res, err := http.Get(url)
	if err != nil {
		// Bug fix: the original printed a message and fell through,
		// then dereferenced the nil res.Body. Bail out instead.
		fmt.Fprintf(os.Stderr, "download %s: %v\n", url, err)
		return
	}
	defer res.Body.Close()
	if res.StatusCode != http.StatusOK {
		fmt.Fprintf(os.Stderr, "download %s: unexpected status %s\n", url, res.Status)
		return
	}
	file, err := os.Create(savepath + fileName)
	if err != nil {
		panic(err)
	}
	// Bug fix: the file handle was never closed.
	defer file.Close()
	writer := bufio.NewWriter(file)
	written, err := io.Copy(writer, res.Body)
	if err != nil {
		// Bug fix: the copy error was silently discarded.
		fmt.Fprintf(os.Stderr, "download %s: %v\n", url, err)
		return
	}
	// Bug fix: bufio.Writer buffers output; without Flush the buffered
	// tail of every file (up to the buffer size) was silently dropped.
	if err := writer.Flush(); err != nil {
		fmt.Fprintf(os.Stderr, "download %s: flush: %v\n", url, err)
		return
	}
	fmt.Printf("Total length: %d kb\n", written/1024)
}
// DownloadWeb fetches the HTML page at url, extracts every GIF link with a
// regular expression, removes duplicates, and downloads each link with a
// 3-second pause between requests. Fetch/read failures terminate the
// program, matching the original behavior.
func DownloadWeb(url string) {
	// Bug fix: the original pattern used [a-zA-z]; the A-z range also
	// matches the punctuation characters [ \ ] ^ _ and backtick. Use
	// [a-zA-Z]. MustCompile replaces the previously ignored Compile error
	// (the pattern is a constant, so a panic here is a programmer bug).
	r := regexp.MustCompile(`[a-zA-Z]+://[^\s]*gif`)
	res, err := http.Get(url)
	if err != nil {
		fmt.Fprintf(os.Stderr, "fetch: %v\n", err)
		os.Exit(1)
	}
	// Read the whole page into memory, then release the connection.
	body, err := ioutil.ReadAll(res.Body)
	res.Body.Close()
	if err != nil {
		fmt.Fprintf(os.Stderr, "fetch: reading %s: %v\n", url, err)
		os.Exit(1)
	}
	// Match, then de-duplicate so each GIF is downloaded once.
	gifResources := RemoveRepeatedElement(r.FindAllString(string(body), -1))
	for _, v := range gifResources {
		// Throttle to one download every 3 s to stay polite to the server.
		time.Sleep(3 * time.Second)
		DownloadIMG(v)
	}
}
// main expects two command-line arguments: the URL of the gallery's first
// page (e.g. https://.../1268257.shtml) and the total page count. Page i>1
// is built by inserting "_i" before the ".shtml" suffix.
func main() {
	// Bug fix: the original indexed os.Args without checking length and
	// ignored the Atoi error, panicking or looping zero times on bad input.
	if len(os.Args) != 3 {
		fmt.Fprintf(os.Stderr, "usage: %s <first-page-url> <page-count>\n", os.Args[0])
		os.Exit(1)
	}
	url := os.Args[1]
	n, err := strconv.Atoi(os.Args[2])
	if err != nil {
		fmt.Fprintf(os.Stderr, "invalid page count %q: %v\n", os.Args[2], err)
		os.Exit(1)
	}
	// Bug fix: the original sliced off the last 6 characters blindly,
	// panicking on short URLs and corrupting URLs without a .shtml suffix.
	const suffix = ".shtml"
	if len(url) < len(suffix) || url[len(url)-len(suffix):] != suffix {
		fmt.Fprintf(os.Stderr, "url %q must end in %s\n", url, suffix)
		os.Exit(1)
	}
	base := url[:len(url)-len(suffix)]
	DownloadWeb(url)
	for i := 2; i <= n; i++ {
		// Pages 2..n follow the pattern <base>_<i>.shtml.
		DownloadWeb(base + "_" + strconv.Itoa(i) + suffix)
	}
}