关于需要用selenium自动登录才能下载文件的网站:暂时不知道还有哪些其他登录方式,欢迎大家解惑,提供其他登录方式来下载文件

这是从魔盾网上下载文件的一个爬虫脚本。题主想了很久,没能通过携带cookie登录来下载,最终选择用selenium自动化方式来登录。但是这种方式爬取速率有限,仅作为一个参考,也希望大家能够提供其他解决方式,互相学习:

直接上代码

# -*- coding:utf-8 -*-
from selenium import webdriver
from selenium.webdriver.common.by import By  # 查找元素的方法,此次用于搜索框查找
from selenium.webdriver.support.ui import WebDriverWait  # 显示等待,设置最长等待时间,此次用于打开链接的最长等待时间

from selenium.webdriver.support import expected_conditions as EC  # EC.presence_of_element_lovated()是确认元素是否已经出现了或者可点击等
from selenium.common.exceptions import TimeoutException  # 超出指定的等待时间报异常
import time
import json
import re
from selenium.webdriver.chrome.options import Options

from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.options import Options

# Build a headless Chrome session shared by every function below.
options = webdriver.ChromeOptions()
options.add_argument('--headless')
options.add_argument('--disable-gpu')
# Chrome profile preferences: the popup content setting and the directory
# downloads are written to (E:\test).
prefs = {'profile.default_content_settings.popups': 0, 'download.default_directory': 'E:\\test'}
options.add_experimental_option('prefs', prefs)
driver = webdriver.Chrome(options=options)
# Alternative drivers that were tried and left disabled:
# driver = webdriver.PhantomJS()
# WebDriverdriver = new HtmlUnitDriver()
wait = WebDriverWait(driver, 10)  # shared explicit wait, 10-second timeout

# driver = webdriver.Chrome(executable_path=(r'C:\Python27\chromedriver.exe'),
#                               chrome_options=chrome_options)
# Register a custom "send_command" endpoint on the command executor so a
# DevTools command can be sent, then issue Page.setDownloadBehavior with
# behavior "allow" and the same download directory as in prefs above.
# NOTE(review): presumably required because headless Chrome does not save
# downloads by default - confirm against the Chrome/chromedriver version in use.
# NOTE(review): this writes to the private `_commands` mapping, so it may
# break on a Selenium upgrade - verify.
driver.command_executor._commands["send_command"] = ("POST", '/session/$sessionId/chromium/send_command')
params = {'cmd': 'Page.setDownloadBehavior', 'params': {'behavior': 'allow', 'downloadPath': "E:\\test"}}
command_result = driver.execute("send_command", params)


def denglu(url, x):
    """Open *url* and, for the first page only, log in through the site form.

    The function drives the shared module-level ``driver``. Every element is
    obtained through the shared ``wait`` so that a click or keystroke is only
    sent once the element is actually usable, instead of racing the page load.
    Locators use ``By.XPATH`` because the ``find_element_by_*`` helpers no
    longer exist in current Selenium releases.

    Args:
        url: Address to load in the browser.
        x: Zero-based page index. The login flow runs only when it is 0;
            for any other value the page is just loaded.

    Raises:
        TimeoutException: If one of the navigation or form elements does not
            become usable within the timeout configured on ``wait``.
    """
    driver.get(url)
    if x != 0:
        return

    def click(xpath):
        # Wait until the element can be clicked, then click it.
        wait.until(EC.element_to_be_clickable((By.XPATH, xpath))).click()

    def fill(xpath, text):
        # Wait until the input is visible, then type into it.
        wait.until(EC.visibility_of_element_located((By.XPATH, xpath))).send_keys(text)

    form = '/html/body/div[1]/div/div/div[2]/div/form/fieldset'

    click('//*[@id="navbar-collapse"]/ul/li[1]/a')    # "security analysis" entry in the nav bar
    click('/html/body/header/nav/nav/ul[2]/li[1]/a')  # link that opens the login form
    fill(form + '/div[1]/input', 'xxxx')              # user name (placeholder value)
    fill(form + '/div[3]/input', 'xxxxx')             # password (placeholder value)
    click(form + '/p[1]/button')                      # submit the login form


def md5_data():
    """Visit every sample listed on the current page and click its download link.

    The listing table under ``#files`` is read from the page currently loaded
    in the shared ``driver``. For each row the ``href`` of the link in the
    third cell is collected, then each of those pages is opened in turn. When
    the page source contains the download label, the link in the eleventh
    table row is clicked.

    A page whose download link does not become clickable in time is reported
    and skipped, so one odd page no longer aborts the whole run.

    Returns:
        list[dict]: One ``{'md5': href}`` entry per table row, in page order.
        Despite the key name, the value is the detail-page URL taken from the
        link's ``href`` attribute.
    """
    rows = driver.find_elements(By.XPATH, '//*[@id="files"]/div/table/tbody/tr')
    # Resolve every href before navigating away: the row elements belong to
    # the listing page and cannot be used once another page is loaded.
    data_list = [
        {'md5': row.find_element(By.XPATH, './td[3]/a').get_attribute('href')}
        for row in rows
    ]

    download_link = (By.CSS_SELECTOR, '#file > div > table > tbody > tr:nth-child(11) > td > a')
    for entry in data_list:
        driver.get(entry['md5'])
        if '样本下载' not in driver.page_source:
            print('此文件没有下载按钮')
            continue
        try:
            button = wait.until(EC.element_to_be_clickable(download_link))
        except TimeoutException:
            # The label is present but the link is not where the selector
            # expects it (or never became clickable); skip this sample.
            print('等待下载按钮超时')
            continue
        button.click()
        print('点击按钮成功')
    return data_list


# def save_data(data_list):  # 保存md5数据
#     with open('md5.txt', 'a+') as f:
#         for data in data_list:
#             str_data = json.dumps(data, ensure_ascii=False) + ',\n'
#             f.write(str_data)  # 写入数据
#             print(data_list)  # 打印下载保存的数据


def run(pages=2):
    """Crawl the listing pages in order, logging in on the first one.

    Page 0 is the site's main domain, where ``denglu`` performs the login;
    every later page uses the paginated analysis URL. After each page is
    opened, ``md5_data`` processes the samples listed on it.

    Args:
        pages: Number of pages to visit, counted from 0. Defaults to 2,
            which is the count that used to be hard-coded.
    """
    for x in range(pages):
        if x == 0:
            url = 'https://www.maldun.com'
        else:
            url = 'https://www.maldun.com/analysis/page/' + str(x) + '/'
        denglu(url, x)
        md5_data()


if __name__ == '__main__':
    try:
        run()
    finally:
        # Shut the browser down even when the crawl fails, so the headless
        # Chrome process is not left running after the script exits.
        driver.quit()

猜你喜欢

转载自blog.csdn.net/qq_42707967/article/details/81065779