Python爬取网站美女照片

上次无意之中看到一个网站，里面全是美女的照片，我就心想，哪天有时间了一定要把这个网站的所有美女照片都爬下来。今天有时间，写了点代码，爬取了网站的所有照片。附上战果！图片实在是太多了，爬了一个多小时，还在爬……

先附上所有的源代码：

# -*- coding: utf-8 -*-
"""
Created on Fri Nov  9 17:07:44 2018
@author: 小谢
"""
import requests
from bs4 import BeautifulSoup
import os
import random
import csv
import time
# Module-level state shared by the functions below.
urls=[]     # (link, name) tuples, one per category; appended by Geturl()
datas=[]    # (file name, image URL) tuples; appended by GetImages()
i=0         # number of images written to disk; incremented by Download()
def Download(name, url, dirname, timeout=30):
    """Download one image and save it inside ``dirname``.

    The target path is printed before the request is made. When the file
    has been written, the module-level counter ``i`` is incremented.
    Every failure (network error, HTTP error status, or an error while
    writing the file) is printed and swallowed, so a single bad image does
    not abort the caller's loop.

    Args:
        name: File name to save the image under.
        url: Address of the image to fetch.
        dirname: Directory to save into; this function does not create it.
        timeout: Seconds to wait for the server before giving up.
    """
    global i
    folder = dirname + "//"
    path = os.path.join(folder, name)
    print(path)
    try:
        # The request lives inside the try block so that a connection
        # problem is reported like any other per-image failure.
        response = requests.get(url, timeout=timeout)
        # Refuse to save an HTTP error page under an image file name.
        response.raise_for_status()
        with open(path, "wb") as f:
            f.write(response.content)
        i = i + 1
    except Exception as e:
        print(e)
# Fetch the URL and the name of every category.
def Geturl():
    """Fill the global ``urls`` list with one (link, name) tuple per category.

    Requests the index page, decodes it as GBK, and reads every ``<a>``
    found inside the ``<div id="NewTagListBox">`` element.
    """
    index_page = requests.get("http://www.27270.com/ent/meinvtupian/")
    index_page.encoding = "gbk"  # set the encoding used to decode the page
    soup = BeautifulSoup(index_page.text, "html.parser")
    tag_box = soup.find("div", attrs={"id": "NewTagListBox"})
    for anchor in tag_box.find_all("a"):
        # Store the category link together with its display name.
        urls.append((anchor['href'], anchor.string))
def GetImages(url, dirname):
    """Scrape one listing page and download every image it references.

    Side effects: creates ``dirname`` when it does not exist, appends one
    ``(file name, image URL)`` row per image to ``meinv.csv`` in the
    current working directory, appends the same tuple to the global
    ``datas`` list, and calls ``Download`` for the image.

    Args:
        url: Address of the listing page to scrape.
        dirname: Directory the images of this page are saved into.

    Raises:
        AttributeError: If the page has no ``<ul id="Tag_list">`` element
            (the lookup result is then ``None``).
        Exception: Anything raised by the HTTP request itself.
    """
    if not os.path.exists(dirname):
        os.mkdir(dirname)   # create the directory
    resp = requests.get(url, timeout=30)
    resp.encoding = "gbk"  # set the encoding used to decode the page
    soup = BeautifulSoup(resp.text, "html.parser")
    list_soup = soup.find("ul", attrs={'id': 'Tag_list'})
    lis = list_soup.find_all("li")
    # The with-block closes the CSV file even if a row or download raises.
    with open("meinv.csv", "a", newline="") as file:
        csv_writer = csv.writer(file)
        for li in lis:
            img = li.find("img")
            # Skip list items that carry no usable image instead of
            # abandoning the rest of the page.
            if img is None or img.get("alt") is None or not img.get("src"):
                continue
            name = img['alt'] + ".jpg"   # file name of the image
            src = img['src']             # download address of the image
            tup = (name, src)
            datas.append(tup)
            # Record and download only the image just parsed. Iterating
            # over the whole accumulated ``datas`` list here would write
            # and fetch every earlier image again for each new one.
            csv_writer.writerow(tup)
            Download(name, src, dirname)
def main():
    """Crawl pages 1 to 11 of every category collected by ``Geturl``.

    The first page of a category is ``<base>.html``; later pages are
    ``<base>_<n>.html``. Any exception raised while scraping a page is
    printed (its type, then its message) and the crawl continues with the
    next page.
    """
    Geturl()
    for url in urls:
        ur = url[0][:-5]    # strip the trailing ".html" from the category link
        # Eleven pages are attempted for every category; presumably
        # categories with fewer pages just end up in the except branch.
        for page in range(1, 12):
            if page == 1:
                uuu = ur + ".html"
            else:
                # The dot before "html" is required: page 2 is "<base>_2.html".
                uuu = ur + "_" + str(page) + ".html"
            try:
                GetImages(uuu, url[1])
            except Exception as e:
                print("异常对象的类型是：%s" % type(e))
                # Print the exception itself here, not its type a second time.
                print("异常对象的内容是：%s" % (e,))
# Script entry: run the whole crawl, then report how many images were
# saved and how long it took.
start = time.time()
main()
end = time.time()
miao = end - start    # elapsed wall-clock time in seconds
i = str(i)            # the image counter, converted for the summary line
print("一共爬去了%s张图片，花费了%s秒的时间！" % (i, miao))

网站链接：http://www.27270.com/ent/meinvtupian/

爬取网站的第一步，就是先分析网站的结构。我们可以看到，上面这里有分类

我们右键检查元素，发现这些分类都有规律

我们写一个函数获得每个分类的链接和名字，将链接和名字以元组的形式存储在我们的全局变量 urls中

扫描二维码关注公众号，回复： 4017878 查看本文章

def Geturl():
    """Fill the global ``urls`` list with one (link, name) tuple per category.

    Requests the index page, decodes it as GBK, and reads every ``<a>``
    found inside the ``<div id="NewTagListBox">`` element.
    """
    index_page = requests.get("http://www.27270.com/ent/meinvtupian/")
    index_page.encoding = "gbk"  # set the encoding used to decode the page
    soup = BeautifulSoup(index_page.text, "html.parser")
    tag_box = soup.find("div", attrs={"id": "NewTagListBox"})
    for anchor in tag_box.find_all("a"):
        # Store the category link together with its display name.
        urls.append((anchor['href'], anchor.string))

然后我们开始分析每一个分类的规律了，每一个分类都有很多栏，而每一个栏的url都有规律。

http://www.27270.com/tag/875.html
http://www.27270.com/tag/875_2.html
http://www.27270.com/tag/875_3.html
......

所以我们得构造每一个分类下各个栏目的链接。因为每一个分类的栏目数量不同，所以我们统一按最多的 11 栏来构造链接。

def main():
    """Crawl pages 1 to 11 of every category collected by ``Geturl``.

    The first page of a category is ``<base>.html``; later pages are
    ``<base>_<n>.html``. Any exception raised while scraping a page is
    printed (its type, then its message) and the crawl continues with the
    next page.
    """
    Geturl()
    for url in urls:
        ur = url[0][:-5]    # strip the trailing ".html" from the category link
        # Eleven pages are attempted for every category; presumably
        # categories with fewer pages just end up in the except branch.
        for page in range(1, 12):
            if page == 1:
                uuu = ur + ".html"
            else:
                # The dot before "html" is required: page 2 is "<base>_2.html".
                uuu = ur + "_" + str(page) + ".html"
            try:
                GetImages(uuu, url[1])
            except Exception as e:
                print("异常对象的类型是：%s" % type(e))
                # Print the exception itself here, not its type a second time.
                print("异常对象的内容是：%s" % (e,))

接下来的函数负责获取每张图片的URL和名字，并调用下载函数把图片保存下来：

def GetImages(url, dirname):
    """Scrape one listing page and download every image it references.

    Side effects: creates ``dirname`` when it does not exist, appends one
    ``(file name, image URL)`` row per image to ``meinv.csv`` in the
    current working directory, appends the same tuple to the global
    ``datas`` list, and calls ``Download`` for the image.

    Args:
        url: Address of the listing page to scrape.
        dirname: Directory the images of this page are saved into.

    Raises:
        AttributeError: If the page has no ``<ul id="Tag_list">`` element
            (the lookup result is then ``None``).
        Exception: Anything raised by the HTTP request itself.
    """
    if not os.path.exists(dirname):
        os.mkdir(dirname)   # create the directory
    resp = requests.get(url, timeout=30)
    resp.encoding = "gbk"  # set the encoding used to decode the page
    soup = BeautifulSoup(resp.text, "html.parser")
    list_soup = soup.find("ul", attrs={'id': 'Tag_list'})
    lis = list_soup.find_all("li")
    # The with-block closes the CSV file even if a row or download raises.
    with open("meinv.csv", "a", newline="") as file:
        csv_writer = csv.writer(file)
        for li in lis:
            img = li.find("img")
            # Skip list items that carry no usable image instead of
            # abandoning the rest of the page.
            if img is None or img.get("alt") is None or not img.get("src"):
                continue
            name = img['alt'] + ".jpg"   # file name of the image
            src = img['src']             # download address of the image
            tup = (name, src)
            datas.append(tup)
            # Record and download only the image just parsed. Iterating
            # over the whole accumulated ``datas`` list here would write
            # and fetch every earlier image again for each new one.
            csv_writer.writerow(tup)
            Download(name, src, dirname)

最后这个函数是下载图片

def Download(name, url, dirname, timeout=30):
    """Download one image and save it inside ``dirname``.

    The target path is printed before the request is made. When the file
    has been written, the module-level counter ``i`` is incremented.
    Every failure (network error, HTTP error status, or an error while
    writing the file) is printed and swallowed, so a single bad image does
    not abort the caller's loop.

    Args:
        name: File name to save the image under.
        url: Address of the image to fetch.
        dirname: Directory to save into; this function does not create it.
        timeout: Seconds to wait for the server before giving up.
    """
    global i
    folder = dirname + "//"
    path = os.path.join(folder, name)
    print(path)
    try:
        # The request lives inside the try block so that a connection
        # problem is reported like any other per-image failure.
        response = requests.get(url, timeout=timeout)
        # Refuse to save an HTTP error page under an image file name.
        response.raise_for_status()
        with open(path, "wb") as f:
            f.write(response.content)
        i = i + 1
    except Exception as e:
        print(e)

Python爬取网站美女照片

猜你喜欢