# Image-scraping method 1: requests
#!/usr/bin/python2.7
# -*- coding: utf-8 -*-
# Scrape every .jpg linked from one gallery page and save the files
# into a fresh local folder.
import re
import requests
import os
import shutil

suyan_url = 'http://www.xiaohuar.com/s-1-2069.html'
response = requests.get(suyan_url, timeout=10)  # timeout so a dead host cannot hang the script
# Capture the href value itself.  The original pattern used only
# non-capturing groups, so findall returned the whole 'href="..." ... class="..."'
# text and relied on fragile slicing (split(' ')[0][6:-1]) to trim it.
lists = re.findall(r'href="(.*?)".*?class="(?:.*?)"', response.text, re.S)  # re.S: '.' also matches newlines
folder = 'aabb'
if os.path.exists(folder):
    shutil.rmtree(folder)  # start from a clean download folder
os.mkdir(folder)
os.chdir(folder)
for imgurl in lists:
    if imgurl.endswith('.jpg'):
        filename = imgurl.split('/')[-1]  # last path component is the file name
        img = requests.get(imgurl, timeout=10)
        with open(filename, 'wb') as f:
            f.write(img.content)
# Image-scraping method 2: urllib.request
import urllib.request
import os
import random
import shutil
def url_open(url):
    """Fetch *url* through a randomly chosen HTTP proxy and return the raw bytes.

    A desktop-browser User-Agent header is sent so the site does not
    reject the request as a bot.
    """
    req = urllib.request.Request(url)
    # Pretend to be desktop Chrome.
    req.add_header('User-Agent', 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36')
    # NOTE(review): hard-coded public proxies; likely stale — verify before relying on them.
    proxies = ['124.207.82.166:8008', '218.89.14.142:8060', '49.64.86.43:8080', '101.231.104.82:80']
    proxy = random.choice(proxies)  # original had a typo: 'proexy'
    proxy_support = urllib.request.ProxyHandler({'http': proxy})
    opener = urllib.request.build_opener(proxy_support)
    # Use the opener directly instead of install_opener() + urlopen():
    # the original mutated the process-wide default opener on every call.
    response = opener.open(req)
    return response.read()
def get_page(url):
    """Return the current page number (as a string of digits) from *url*'s HTML.

    The page embeds its index as '[NN]' shortly after a
    'current-comment-page' marker.  Raises ValueError when the marker is
    missing — the original silently sliced from str.find()'s -1, producing
    garbage that would only blow up later in int().
    """
    html = url_open(url).decode('utf-8')
    marker = html.find('current-comment-page')
    if marker == -1:
        raise ValueError('current-comment-page marker not found in ' + url)
    start = marker + 23  # skip past the marker up to the digits inside '[...]'
    end = html.find(']', start)
    if end == -1:
        raise ValueError('unterminated page number in ' + url)
    print(url)
    return html[start:end]
def find_imgs(url):
html = url_open(url).decode('utf-8')
img_addrs = [];
a = html.find('img src=')
while a != -1:
b = html.find('.jpg', a, a+255)
if b != -1:
img_addrs.append('http:' + html[a+9:b+4])
else:
b = a + 9
a = html.find('img src=', b)
for each in img_addrs:
print(each)
return img_addrs
def save_imgs(folder, img_addrs):
    """Download every address in *img_addrs* into the current directory.

    *folder* is unused (kept for interface compatibility): download_mm()
    already chdir()s into the target folder before calling this.
    """
    # enumerate replaces the original hand-rolled idx counter.
    for idx, each in enumerate(img_addrs, 1):
        filename = each.split('/')[-1]  # last path component is the file name
        print('-------' + str(idx))
        # Fetch BEFORE opening the file: the original opened first, so a
        # failed download left an empty file behind.
        img = url_open(each)
        with open(filename, 'wb') as f:
            f.write(img)
def download_mm(folder='HHAA', pages=10):
    """Download images from the *pages* most recent gallery pages into *folder*.

    Any existing *folder* is deleted first so each run starts clean.
    """
    if os.path.exists(folder):
        shutil.rmtree(folder)  # remove the folder and all stale files from a previous run
    os.mkdir(folder)
    os.chdir(folder)  # save_imgs() writes into the current directory
    url = 'http://jandan.net/ooxx/'
    page_num = int(get_page(url))  # newest page number
    for i in range(pages):
        # BUG FIX: the original did 'page_num -= i' inside the loop, which
        # subtracts a cumulative (triangular) amount — pages N, N-1, N-3,
        # N-6, ... — instead of consecutive pages N, N-1, N-2, ...
        page_url = url + 'page-' + str(page_num - i) + '#comments'
        print(page_url)
        img_addrs = find_imgs(page_url)
        save_imgs(folder, img_addrs)


if __name__ == '__main__':
    download_mm()