因为requests请求网页源码不完整,只好用selenium模拟请求,但效率异常慢。
分为网页请求、网页解析、图片保存、循环调用四个主要步骤。
from bs4 import BeautifulSoup
from selenium import webdriver
import re
import requests
import os
# Global counter used by save_pictures() to generate sequential file names
# (1.jpg, 2.jpg, ...) across all pages.
name=0
# Web request
def get_html(url):
    """Render *url* with a Selenium-driven Chrome and return a text blob.

    The blob contains, for every <p> inside elements with class
    ``commentlist``, the ``str()`` of its <a> tags and of its <img> tags,
    joined by newlines.  parse_page() later regex-scans this blob.

    Fix: the original called ``browser.close()`` outside any try/finally,
    so a failed ``get()`` or ``page_source`` leaked the browser process.
    ``quit()`` is used because it shuts down the driver fully, while
    ``close()`` only closes the current window.
    """
    browser = webdriver.Chrome()
    try:
        browser.get(url)
        page = browser.page_source
    finally:
        browser.quit()  # always release the browser, even on error
    soup = BeautifulSoup(page, 'lxml')
    html = ''
    for comment_list in soup.select('.commentlist'):
        for item in comment_list.select('p'):
            img1 = str(item.select('a'))
            img2 = str(item.select('img'))
            html = html + '\n' + img1 + '\n' + img2
    return html
# Page parsing
def parse_page(html):
    """Scan *html* for ``src="..." ... href="..." ... target`` runs.

    Returns a flat list alternating the captured ``src`` and ``href``
    values, in document order (thumbnail URL followed by link URL).
    """
    pattern = re.compile(r'src="(.*?)".*?href="(.*?)".*?target', re.S)
    # Flatten each (src, href) match tuple into one alternating list.
    return [url for pair in pattern.findall(html) for url in pair]
# Picture saving
def save_pictures(list_ALL):
    """Download every URL in *list_ALL* to D://pictures_crawling//<n>.jpg.

    Files are numbered with the module-level ``name`` counter so numbering
    continues across pages.  Downloads are best-effort: a failing URL is
    skipped and the loop continues.

    Fixes over the original:
    - bare ``except: None`` swallowed *everything* (including
      KeyboardInterrupt); now only network and file-system errors are
      caught;
    - ``requests.get`` had no timeout and could hang forever;
    - HTTP error pages (404 etc.) were saved as ".jpg"; now rejected via
      ``raise_for_status()``.
    """
    global name
    for url in list_ALL:
        try:
            response = requests.get(url, timeout=10)
            response.raise_for_status()  # don't save error pages as images
            name += 1
            with open('D://pictures_crawling//{0}.jpg'.format(str(name)), 'wb') as f:
                f.write(response.content)
        except (requests.RequestException, OSError):
            # Best-effort: skip this image and move on, as the original did.
            continue
# Main loop
def main():
    """Prompt for a page range and crawl each page: fetch, parse, save.

    Fix: ``os.mkdir`` raised FileExistsError on every run after the first;
    ``os.makedirs(..., exist_ok=True)`` makes reruns safe.

    NOTE(review): ``range(start, end)`` excludes *end*, so the "结束页码"
    page itself is never crawled — possibly intended, but worth confirming.
    """
    start = int(input("开始页码(0-100):"))
    end = int(input("结束页码(0-100):"))
    print("图片保存位置:D://pictures_crawling...")
    os.makedirs("D://pictures_crawling", exist_ok=True)  # idempotent on reruns
    for url_section in range(start, end):
        url = 'http://jandan.net/ooxx/page-' + str(url_section) + '#comments'
        html = get_html(url)
        list_all = parse_page(html)
        print(list_all)  # progress feedback: URLs found on this page
        save_pictures(list_all)
# Run the crawler only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
慢的已经不想统计时间了,直接把time.perf_counter()删了。