# -*- coding: utf-8 -*-
# Python 2 script: download car pictures from Autohome (汽车之家).
import urllib2
import os
from bs4 import BeautifulSoup
import random
import urllib
import time
# Brand-index letters to crawl. 'J' is skipped: its page comes back in a
# different encoding and garbles the parse.
end = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N',
       'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z']

for e in end:
    # Brand index page for this letter, e.g. .../grade/carhtml/A.html
    url = "https://www.autohome.com.cn/grade/carhtml/%s.html" % (e)
    url_html = urllib2.urlopen(url).read()
    url_content = BeautifulSoup(url_html, 'html.parser')

    # <h4> tags hold the series names, parallel to the <a id=...> anchors.
    names = url_content.find_all('h4')

    # enumerate() replaces the manual n = -1 / n = n + 1 counter.
    for n, anchor in enumerate(url_content.find_all('a', attrs={'id': True})):
        name = names[n].text
        try:
            # Fetch the series' picture page. This was outside the try in
            # the original, so one bad link aborted the whole crawl.
            car_url = 'https:' + anchor.get('href')
            car_url_html = urllib2.urlopen(car_url).read()
            car_url_content = BeautifulSoup(car_url_html, 'html.parser')

            # <dl class="search-pic-cardl"> lists one <dt> per model year
            # with a matching <ul> of picture-page links.
            classes = car_url_content.find_all(
                'dl', attrs={'class': "search-pic-cardl"})[0]
            years = classes.find_all('dt')

            # Locate the 2018 model year; None (not "") marks "not found".
            year_index = None
            for idx, year in enumerate(years):
                if '2018' in year.text:
                    year_index = idx
                    break
            if year_index is None:
                continue  # this series has no 2018 pictures

            # NOTE: '路径' is a placeholder base directory — set before use.
            path = '路径' + name
            if not os.path.exists(path):
                os.makedirs(path)

            photourl_2018 = classes.find_all('ul')[year_index]
            # Sequential filenames: the original random-float names could
            # collide and silently overwrite downloaded images.
            photo_count = 0
            for link in photourl_2018.find_all('a'):
                photourl_type2018 = ('https://car.autohome.com.cn'
                                     + link.get('href'))
                photourl_type = urllib2.urlopen(photourl_type2018).read()
                photourl_type = BeautifulSoup(photourl_type, 'html.parser')
                # Only the first three images — the front-view shots.
                for eve in photourl_type.find_all(
                        'img',
                        attrs={'src': True, 'alt': True, 'title': True})[0:3]:
                    eve_url = 'https:' + eve.get('src')
                    photo_count += 1
                    carphoto_name = str(photo_count) + ".jpg"
                    time.sleep(0.5)  # throttle so we don't hammer the server
                    urllib.urlretrieve(eve_url, path + '/' + carphoto_name)
        except Exception:
            # Best-effort crawl: report the failing series and keep going
            # (the original bare `except:` also hid KeyboardInterrupt).
            print(name)