这是一个从魔盾网下载文件的爬虫脚本。题主尝试了很久,始终没能通过携带 cookie 登录的方式完成下载,最终选择用 selenium 自动化的方式登录。不过这种方式的爬取速率有限,仅作为一个参考,也希望大家能够提供其他解决方式,互相学习:
直接上代码
# -*- coding:utf-8 -*-
"""Selenium crawler for maldun.com.

Logs in through the site's login form with a headless Chrome driver, walks the
analysis list pages, opens every sample link found in the ``#files`` table and
clicks the sample-download link when the page offers one.
"""
import json
import os
import re
import time

from selenium import webdriver
from selenium.common.exceptions import TimeoutException  # raised when an explicit wait expires
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By  # locator strategies (XPath / CSS selector)
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC  # "present" / "clickable" conditions
from selenium.webdriver.support.ui import WebDriverWait  # explicit wait with a maximum timeout

# Directory Chrome saves downloaded samples into.
DOWNLOAD_DIR = 'E:\\test'

# Placeholder credentials; replace with a real account before running.
USERNAME = 'xxxx'
PASSWORD = 'xxxxx'

BASE_URL = 'https://www.maldun.com'

# Locators used on the login path.
ANALYSIS_ENTRY_XPATH = '//*[@id="navbar-collapse"]/ul/li[1]/a'
LOGIN_LINK_XPATH = '/html/body/header/nav/nav/ul[2]/li[1]/a'
USERNAME_INPUT_XPATH = '/html/body/div[1]/div/div/div[2]/div/form/fieldset/div[1]/input'
PASSWORD_INPUT_XPATH = '/html/body/div[1]/div/div/div[2]/div/form/fieldset/div[3]/input'
LOGIN_SUBMIT_XPATH = '/html/body/div[1]/div/div/div[2]/div/form/fieldset/p[1]/button'

# Locators used on the list page and the sample detail page.
SAMPLE_ROWS_XPATH = '//*[@id="files"]/div/table/tbody/tr'
SAMPLE_LINK_XPATH = './td[3]/a'
DOWNLOAD_LINK_CSS = '#file > div > table > tbody > tr:nth-child(11) > td > a'

options = webdriver.ChromeOptions()
options.add_argument('--headless')
options.add_argument('--disable-gpu')
prefs = {'profile.default_content_settings.popups': 0,
         'download.default_directory': DOWNLOAD_DIR}
options.add_experimental_option('prefs', prefs)
driver = webdriver.Chrome(options=options)
wait = WebDriverWait(driver, 10)  # explicit waits give up after 10 seconds

# Register the Chrome "send_command" endpoint, which the Python bindings do not
# expose by default, and use it to allow downloads into DOWNLOAD_DIR.
# NOTE(review): presumably required because headless Chrome does not save
# downloads otherwise -- confirm against the Chrome version in use.
driver.command_executor._commands["send_command"] = (
    "POST", '/session/$sessionId/chromium/send_command')
params = {'cmd': 'Page.setDownloadBehavior',
          'params': {'behavior': 'allow', 'downloadPath': DOWNLOAD_DIR}}
command_result = driver.execute("send_command", params)


def _wait_for_xpath(xpath):
    """Return the element matching *xpath*, waiting up to the `wait` timeout for it to be present."""
    return wait.until(EC.presence_of_element_located((By.XPATH, xpath)))


def _wait_for_downloads(timeout=60):
    """Block until no partial Chrome download remains in DOWNLOAD_DIR or *timeout* seconds pass."""
    deadline = time.time() + timeout
    while time.time() < deadline:
        # Sleep first so a download triggered by the very last click has time to start.
        time.sleep(1)
        if not os.path.isdir(DOWNLOAD_DIR):
            return
        # Chrome names in-progress downloads "*.crdownload".
        if not any(name.endswith('.crdownload') for name in os.listdir(DOWNLOAD_DIR)):
            return


def denglu(url, x):
    """Open *url* and, for the first page only (``x == 0``), log in via the site's form.

    Args:
        url: Address to load in the shared module-level driver.
        x: Page index from the crawl loop; the login steps run only when it is 0.

    Raises:
        TimeoutException: If a login element does not appear within the wait timeout.
    """
    driver.get(url)
    if x != 0:
        return

    # Each lookup waits for the element, because the previous click may still
    # be loading the next page.
    _wait_for_xpath(ANALYSIS_ENTRY_XPATH).click()   # "security analysis" entry link
    _wait_for_xpath(LOGIN_LINK_XPATH).click()       # login link in the header
    _wait_for_xpath(USERNAME_INPUT_XPATH).send_keys(USERNAME)
    _wait_for_xpath(PASSWORD_INPUT_XPATH).send_keys(PASSWORD)
    _wait_for_xpath(LOGIN_SUBMIT_XPATH).click()     # submit the login form


def md5_data():
    """Visit every sample linked from the current list page and click its download link.

    Returns:
        A list of ``{'md5': href}`` dicts, one per row of the ``#files`` table,
        in page order. It is returned whether or not each download was triggered.
    """
    rows = driver.find_elements(By.XPATH, SAMPLE_ROWS_XPATH)
    # Collect every href before navigating: the row elements belong to the list
    # page and cannot be used once the driver has moved to another page.
    data_list = [
        {'md5': row.find_element(By.XPATH, SAMPLE_LINK_XPATH).get_attribute('href')}
        for row in rows
    ]

    for item in data_list:
        driver.get(item['md5'])
        if '样本下载' not in driver.page_source:
            print('此文件没有下载按钮')
            continue
        try:
            button = wait.until(
                EC.element_to_be_clickable((By.CSS_SELECTOR, DOWNLOAD_LINK_CSS)))
        except TimeoutException:
            # One slow or differently laid-out page must not abort the whole crawl.
            print('等待下载按钮超时,跳过此文件')
            continue
        button.click()
        print('点击按钮成功')
    return data_list


# Disabled helper kept for reference: appends the collected links to md5.txt,
# one JSON object per line.
# def save_data(data_list):
#     with open('md5.txt', 'a+') as f:
#         for data in data_list:
#             str_data = json.dumps(data, ensure_ascii=False) + ',\n'
#             f.write(str_data)
#     print(data_list)


def run(pages=2):
    """Crawl list pages ``0 .. pages-1``, logging in on page 0.

    Args:
        pages: Number of list pages to process. Page 0 is reached through the
            site root (where the login happens); later pages use
            ``/analysis/page/<n>/``.
    """
    for x in range(pages):
        if x == 0:
            url = BASE_URL
        else:
            url = BASE_URL + '/analysis/page/' + str(x) + '/'
        denglu(url, x)
        md5_data()


if __name__ == '__main__':
    try:
        run()
        # Let pending downloads finish; quitting the browser would cut them off.
        _wait_for_downloads()
    finally:
        # Always shut the headless browser down so no Chrome process is left behind.
        driver.quit()