因为requests请求网页源码不完整,只好用selenium模拟请求,但效率异常慢。
分为网页请求、网页解析、图片保存、循环调用四个主要步骤。
from bs4 import BeautifulSoup
from selenium import webdriver
import re
import requests
import os
# Global counter used by save_pictures() to generate sequential file names
# (1.jpg, 2.jpg, ...) across all pages.
name=0
# Web request
def get_html(url):
    """Render *url* with a Selenium-driven Chrome and return a text blob.

    The blob contains, for every <p> inside elements with class
    ``commentlist``, the ``str()`` of its <a> tags and of its <img> tags,
    joined by newlines.  parse_page() later regex-scans this blob.

    Fix: the original called ``browser.close()`` outside any try/finally,
    so a failed ``get()`` or ``page_source`` leaked the browser process.
    ``quit()`` is used because it shuts down the driver fully, while
    ``close()`` only closes the current window.
    """
    browser = webdriver.Chrome()
    try:
        browser.get(url)
        page = browser.page_source
    finally:
        browser.quit()  # always release the browser, even on error
    soup = BeautifulSoup(page, 'lxml')
    html = ''
    for comment_list in soup.select('.commentlist'):
        for item in comment_list.select('p'):
            img1 = str(item.select('a'))
            img2 = str(item.select('img'))
            html = html + '\n' + img1 + '\n' + img2
    return html
# Page parsing
def parse_page(html):
    """Scan *html* for ``src="..." ... href="..." ... target`` runs.

    Returns a flat list alternating the captured ``src`` and ``href``
    values, in document order (thumbnail URL followed by link URL).
    """
    pattern = re.compile(r'src="(.*?)".*?href="(.*?)".*?target', re.S)
    # Flatten each (src, href) match tuple into one alternating list.
    return [url for pair in pattern.findall(html) for url in pair]
# Picture saving
def save_pictures(list_ALL):
    """Download every URL in *list_ALL* to D://pictures_crawling//<n>.jpg.

    Files are numbered with the module-level ``name`` counter so numbering
    continues across pages.  Downloads are best-effort: a failing URL is
    skipped and the loop continues.

    Fixes over the original:
    - bare ``except: None`` swallowed *everything* (including
      KeyboardInterrupt); now only network and file-system errors are
      caught;
    - ``requests.get`` had no timeout and could hang forever;
    - HTTP error pages (404 etc.) were saved as ".jpg"; now rejected via
      ``raise_for_status()``.
    """
    global name
    for url in list_ALL:
        try:
            response = requests.get(url, timeout=10)
            response.raise_for_status()  # don't save error pages as images
            name += 1
            with open('D://pictures_crawling//{0}.jpg'.format(str(name)), 'wb') as f:
                f.write(response.content)
        except (requests.RequestException, OSError):
            # Best-effort: skip this image and move on, as the original did.
            continue
# Main loop
def main():
    """Prompt for a page range and crawl each page: fetch, parse, save.

    Fix: ``os.mkdir`` raised FileExistsError on every run after the first;
    ``os.makedirs(..., exist_ok=True)`` makes reruns safe.

    NOTE(review): ``range(start, end)`` excludes *end*, so the "结束页码"
    page itself is never crawled — possibly intended, but worth confirming.
    """
    start = int(input("开始页码(0-100):"))
    end = int(input("结束页码(0-100):"))
    print("图片保存位置:D://pictures_crawling...")
    os.makedirs("D://pictures_crawling", exist_ok=True)  # idempotent on reruns
    for url_section in range(start, end):
        url = 'http://jandan.net/ooxx/page-' + str(url_section) + '#comments'
        html = get_html(url)
        list_all = parse_page(html)
        print(list_all)  # progress feedback: URLs found on this page
        save_pictures(list_all)
# Run the crawler only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
慢的已经不想统计时间了,直接把time.perf_counter()删了。