爬取mm131图片

爬取mm131图片

import urllib.request
import os

# Browser-like headers sent with every image request. They never change, so
# they are built once instead of on every loop iteration.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36',
    'Referer': 'https://www.sogou.com/link?url=DSOYnZeCC_o7btUgpK402wmc9YOcsOr4cOOT57O29F8'
}

# Walk album ids 4200..4460 and store each album's images in tupian/<id>/.
for i in range(4200, 4461):
    album_dir = os.path.join('tupian', str(i))
    # makedirs also creates the missing 'tupian' parent and tolerates
    # directories left over from a previous run.
    os.makedirs(album_dir, exist_ok=True)

    # Try image indices 0..59; the first failed download ends the album.
    # NOTE(review): numbering starts at 0 here. If the site numbers images
    # from 1, the very first request fails and the album is skipped - confirm.
    for j in range(60):
        url = 'http://img1.mm131.me/pic/' + str(i) + '/' + str(j) + '.jpg'
        print(url)
        request = urllib.request.Request(url=url, headers=headers)
        try:
            # Read the whole body before touching the disk so that a failed
            # request does not leave an empty file behind.
            with urllib.request.urlopen(request) as response:
                data = response.read()
            with open(os.path.join(album_dir, str(j) + '.jpg'), 'wb') as fp:
                fp.write(data)
        except Exception as e:
            # Best effort: report the reason and move on to the next album.
            print('下载失败,下载下一条', e)
            break

爬取妹子图

import urllib.request
import re
import time
import os

# Proxy mapping (scheme -> host:port) handed to urllib.request.ProxyHandler.
proxy = {
    'http': '120.92.74.189:3128'
}

# Headers attached to every request made by this script.
# NOTE(review): the Referer is presumably needed to get past the site's
# hotlink protection - confirm.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3554.0 Safari/537.36',
    'Referer': 'http://mzitu.com'
}

# Root directory that receives one sub-directory per album.
dirname = 'mzt'

# Create the root directory; exist_ok makes re-runs a no-op without a
# separate existence check.
os.makedirs(dirname, exist_ok=True)

# Patterns are loop-invariant, so they are compiled once up front.
# Captures (album URL, path after ".com/") for every <li><a href="..."> link.
pattern = re.compile(r'<li><a href="(.*?.com/(.*?))"')
# Captures the image URL split around its trailing number: (prefix, '.jpg').
pattern1 = re.compile(r'<div class="main-image">.*?<img src="(.*?)\d+(.jpg)"')
# Captures the number in the <span> that follows the "…" pager element.
pattern2 = re.compile(r'…</span>.*?<span>(\d+)</span>')

# One proxied opener shared by every request (listing, album page, image).
handler = urllib.request.ProxyHandler(proxies=proxy)
opener = urllib.request.build_opener(handler)

# Crawl the listing pages; each linked album is saved to <dirname>/<path>/.
for page in range(2, 3):
    url = 'http://www.mzitu.com/page/%s/' % page
    request = urllib.request.Request(url=url, headers=headers)
    with opener.open(request) as response:
        content = response.read().decode('utf8')

    for url1, baname in pattern.findall(content):
        request1 = urllib.request.Request(url=url1, headers=headers)
        with opener.open(request1) as response1:
            content1 = response1.read().decode('utf8')

        image_match = pattern1.search(content1)
        count_match = pattern2.search(content1)
        if image_match is None or count_match is None:
            # The link pattern also matches links that are not album pages;
            # skip those instead of crashing on a missing match.
            print('skip (no image or page count found): %s' % url1)
            continue

        ret1 = image_match.groups()
        ret2 = int(count_match.group(1))
        print(ret1)
        print(ret2)

        # Create the album directory only once the page is known to be an
        # album; makedirs also copes with a captured path containing '/'.
        name = os.path.join(dirname, baname)
        os.makedirs(name, exist_ok=True)

        time.sleep(5)
        for index in range(1, ret2 + 1):
            # Image numbers in the URL are zero-padded to two digits.
            number = "%02d" % index
            image_url = ret1[0] + number + ret1[1]
            print(number)
            image_request = urllib.request.Request(image_url, headers=headers)
            with opener.open(image_request) as image_response:
                data = image_response.read()
            filepath = os.path.join(name, '%s.jpg' % number)
            with open(filepath, 'wb') as fb:
                fb.write(data)
            # Short pause between image downloads.
            time.sleep(0.3)



















猜你喜欢

转载自blog.csdn.net/qq_43004728/article/details/84192634