基于Python的网页图片爬虫

下面两个爬虫的代码参考了多篇资料，若有需要标注出处的地方，请私信联系。闲言少叙，直接上代码。

# -*- coding:utf-8 -*-
import re
import requests

def dowmloadPic(html, keyword):
    """Download every image referenced in a Baidu image-search result page.

    Image URLs are taken from the ``"objURL"`` fields embedded in *html* and
    saved as ``../images/<keyword>_<n>.jpg``. The counter ``n`` starts at 1
    and only advances after a successful download, so the saved files are
    numbered without gaps.

    Args:
        html: Text of the search result page.
        keyword: Search keyword; used in progress messages and file names.
    """
    import os  # local import: this script's import block does not provide os

    pic_urls = re.findall('"objURL":"(.*?)",', html, re.S)
    save_dir = '../images/'
    index = 1
    print('找到关键词:' + keyword + '的图片，现在开始下载图片...')
    for pic_url in pic_urls:
        print('正在下载第' + str(index) + '张图片，图片地址:' + str(pic_url))
        try:
            pic = requests.get(pic_url, timeout=10)
            # Reject 4xx/5xx answers so an error page is not saved as a .jpg.
            pic.raise_for_status()
        except requests.exceptions.RequestException:
            # RequestException also covers read timeouts and HTTP errors,
            # which the narrower ConnectionError would let propagate.
            print('【错误】当前图片无法下载')
            continue

        # Create the target directory on demand instead of failing on open().
        os.makedirs(save_dir, exist_ok=True)
        file_path = save_dir + keyword + '_' + str(index) + '.jpg'
        with open(file_path, 'wb') as fp:
            fp.write(pic.content)
        index += 1


if __name__ == '__main__':
    word = input("Input key word: ")
    # Only the first result page is fetched here. Further pages can be
    # requested from the same endpoint with an extra "pn" offset that grows
    # by 20 per page.
    url = 'http://image.baidu.com/search/flip'
    # Passing the query through ``params`` lets requests URL-encode the
    # keyword, so input containing '&', '#' or spaces cannot corrupt the
    # query string.
    query = {
        'tn': 'baiduimage',
        'ie': 'utf-8',
        'word': word,
        'ct': '201326592',
        'v': 'flip',
    }
    # Without a timeout an unresponsive server would block the script forever.
    result = requests.get(url, params=query, timeout=10)
    dowmloadPic(result.text, word)

上面这个版本只下载所打开的那一页搜索结果中的图片，不会自动滚动下拉条（翻页）去加载更多结果。

# -*- coding:utf-8 -*-
import re
import requests
import traceback
import os


def dowmloadPic(html, keyword, startNum, root='L:/pics/'):
    """Download the images referenced in one Baidu image-search result page.

    Image URLs are taken from the ``"objURL"`` fields embedded in *html*.
    Each image is stored as ``<root>/<keyword>v1_<n>.jpg`` where ``n`` counts
    up from *startNum*. A target file that already exists is not downloaded
    again but still consumes its number, so re-running the crawl resumes
    instead of overwriting.

    Args:
        html: Text of the search result page.
        keyword: Search keyword; used in progress messages and file names.
        startNum: Number assigned to the first image of this page.
        root: Directory the images are written to; created when missing.

    Returns:
        The number to use for the next image, i.e. *startNum* plus the count
        of images that were saved or already present.
    """
    kv = {'user-agent': 'Mozilla/5.0'}
    pic_urls = re.findall('"objURL":"(.*?)",', html, re.S)
    index = startNum
    print('找到关键词:' + keyword + '的图片，现在开始下载图片...')

    for pic_url in pic_urls:
        print('正在下载第' + str(index + 1) + '张图片，图片地址:' + str(pic_url))
        target = os.path.join(root, keyword + 'v1_' + str(index) + '.jpg')
        try:
            os.makedirs(root, exist_ok=True)
            # Check the file that is actually written; testing a path built
            # from the URL basename can never detect an earlier download.
            if not os.path.exists(target):
                pic = requests.get(pic_url, headers=kv, timeout=10)
                # Reject 4xx/5xx answers so an error page is not saved as .jpg.
                pic.raise_for_status()
                with open(target, 'wb') as f:
                    f.write(pic.content)
        except Exception:
            # Best effort: report and move on to the next image. Catching
            # Exception rather than everything keeps Ctrl-C working.
            traceback.print_exc()
            print('【错误】当前图片无法下载')
            continue
        index += 1

    return index


if __name__ == '__main__':

    kv = {'user-agent': 'Mozilla/5.0'}
    # Running image number, carried across result pages and across keywords.
    lastNum = 0
    # Every keyword in this list is crawled in turn.
    words = ['篮球','排球']
    for word in words:
        # Sentinel check kept from the interactive variant, where the keyword
        # was read with input().
        if word.strip() == "exit":
            break
        # Crawl 10 result pages per keyword; the "pn" offset advances by 20
        # from one page to the next.
        for pageId in range(0, 10 * 20, 20):
            url = 'http://image.baidu.com/search/flip?tn=baiduimage&ie=utf-8&word=' + word + "&pn=" + str(
                pageId) + "&gsm=?&ct=&ic=0&lm=-1&width=0&height=0"
            try:
                # Without a timeout one stalled request would hang the crawl.
                result = requests.get(url, headers=kv, timeout=10)
            except requests.exceptions.RequestException:
                # A failed page should not abort the remaining pages/keywords.
                traceback.print_exc()
                continue
            lastNum = dowmloadPic(result.text, word, lastNum)

上面这个版本会按页码参数 pn 依次抓取多页搜索结果（相当于不断向下滚动下拉条），适合批量下载大量图片。

喜碧CatBrother

发布了25 篇原创文章 · 获赞 28 · 访问量 3万+

私信关注

基于Python的网页图片爬虫

猜你喜欢