作业1:
"""Homework 1: automated login to dig.chouti.com with Selenium."""
from selenium import webdriver
import time

# Driver handle for a real Chrome window.
driver = webdriver.Chrome()
try:
    # Open the Chouti front page.
    # BUG FIX: the original URL literal carried a trailing space
    # ('https://dig.chouti.com/ '), which some drivers reject.
    driver.get('https://dig.chouti.com/')

    # Implicit wait: every find_element* call below polls up to 10s
    # for its element before raising.
    driver.implicitly_wait(10)

    # Open the login form.
    send_tag = driver.find_element_by_id('login_btn')
    send_tag.click()

    # Fill in the phone-number field.
    username = driver.find_element_by_class_name('login-phone')
    username.send_keys('*******')
    time.sleep(1)

    # Fill in the password field.
    password = driver.find_element_by_class_name('pwd-password-input')
    password.send_keys('*******')
    time.sleep(1)

    # Several elements share this link text; the second one is the
    # actual submit button.
    login = driver.find_elements_by_link_text('登录')
    login[1].click()
    time.sleep(10)
finally:
    # Always release the browser window, even when a lookup fails.
    driver.close()
作业2:
"""Homework 2: harvest free proxies from kuaidaili, validate each one,
then fetch china.nba.com through a proxy that works."""
import requests
import re
import time

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36',
}


def get_index(url):
    """GET *url* with a 1-second courtesy delay; return the Response."""
    time.sleep(1)
    response1 = requests.get(url, headers=headers)
    return response1


def parse_index(text):
    """Yield 'ip:port' strings scraped from a kuaidaili free-proxy page."""
    ip_list1 = re.findall(
        '<tr>.*?<td data-title="IP">(.*?)</td>.*?<td data-title="PORT">(.*?)</td>',
        text, re.S)
    for ip_port in ip_list1:
        yield ':'.join(ip_port)


def test_ip(ip2):
    """Probe *ip2* as an https proxy; return it when usable, else None."""
    print('测试ip: %s' % ip2)
    try:
        proxies = {'https': ip2}
        # Echo service used to verify the proxy actually relays traffic.
        ip_url1 = 'https://www.ipip.net/'
        response2 = requests.get(ip_url1, headers=headers, proxies=proxies, timeout=1)
        if response2.status_code == 200:
            # BUG FIX: the original returned the module-level ``ip`` here
            # instead of the parameter ``ip2`` — it only worked by accident
            # because __main__ happened to bind a global of that name.
            return ip2
    # Dead or too-slow proxies raise; report and implicitly return None.
    except Exception as e:
        print(e)


def spider_nba(good_ip1):
    """Fetch the NBA China homepage through *good_ip1* and print the result."""
    url = 'https://china.nba.com/'
    proxies = {'https': good_ip1}
    response3 = requests.get(url, headers=headers, proxies=proxies)
    print(response3.status_code)
    print(response3.text)


if __name__ == '__main__':
    base_url = 'https://www.kuaidaili.com/free/inha/{}/'
    for line in range(1, 2905):
        ip_url = base_url.format(line)
        response = get_index(ip_url)
        ip_list = parse_index(response.text)
        for ip in ip_list:
            good_ip = test_ip(ip)
            if good_ip:
                spider_nba(good_ip)
"""Today's topics:
1. requests POST requests
2. advanced requests usage
3. the selenium module
4. generic login automation

Login request captured from the browser dev tools:
  URL:     https://github.com/session   (POST)
  Headers: referer: https://github.com/login, user-agent: Chrome/75
  Body:    commit, utf8, authenticity_token, login, password,
           webauthn-support
"""
# Step 1 — GET https://github.com/login to obtain the CSRF token and the
# Set-Cookie values that must accompany the later POST.
import requests
import re

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.80 Safari/537.36'
}

response = requests.get(url='https://github.com/login', headers=headers)
# print(response.text)

# Scrape the hidden CSRF token out of the login form's HTML.
token_pattern = '<input type="hidden" name="authenticity_token" value="(.*?)" /> '
authenticity_token = re.findall(token_pattern, response.text, re.S)[0]
print(authenticity_token)

# Step 2 — POST the credentials to the session endpoint.
headers2 = {
    'Referer': 'https://github.com/login',
    'User-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.80 Safari/537.36'
}

form_data = {
    'commit': 'Sign in',
    'utf8': '✓',
    'authenticity_token': authenticity_token,
    'login': 'lyj68',
    'password': '1998223689lyj',
    'webauthn-support': 'supported',
}

# Cookies handed out by the login page, as a plain dict.
login_cookies = response.cookies.get_dict()

# Carry the headers, the form body and the step-1 cookies together.
response2 = requests.post(
    url='https://github.com/session',
    data=form_data,
    headers=headers2,
    cookies=login_cookies,
)
print(response2.status_code)
# print(response2.text)

# Dump the response so the login result can be inspected in a browser.
with open('gethub.html', 'w', encoding='utf-8') as f:
    f.write(response2.text)
# ---- requests notes: response object, SSL certificates, proxies ----
#
# Response attributes (demos, left commented out):
#   response.status_code / .url / .encoding / .text / .content
#   response.headers / .history (redirect chain) / .elapsed (request time)
#   response.cookies / .cookies.get_dict() / .cookies.items()
#
# Certificate verification (most sites are https):
#   requests.get(url)                       -> raises on an invalid certificate
#   requests.get(url, verify=False)         -> skips verification, warns, 200
#   urllib3.disable_warnings()              -> silences that warning
#   requests.get(url, cert=('crt', 'key'))  -> explicit client certificate;
#       most sites (zhihu, baidu, ...) work without one, but sites that
#       restrict access to specific users require it.
#
# Proxies (official docs:
#   http://docs.python-requests.org/en/master/user/advanced/#proxies):
# the request goes to the proxy first, which forwards it on our behalf —
# the usual answer to per-IP bans.
import requests

proxies = {
    # A credentialed proxy looks like 'http://user:pass@host:port',
    # e.g. 'http://tank:123@localhost:9527'.
    'https': 'https://localhost:9527'
}

response = requests.get('https://www.12306.cn', proxies=proxies)
print(response.status_code)

# SOCKS proxies are supported too: pip install requests[socks], then use
# proxy URLs of the form 'socks5://user:pass@host:port'.
"""Scrape free proxies from xicidaili, validate each candidate, and fetch
china.nba.com through the proxies that actually work."""
import requests
import re
import time

HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36',
}


def get_index(url):
    """GET *url* with a 1-second courtesy delay; return the Response."""
    time.sleep(1)
    return requests.get(url, headers=HEADERS)


def parse_index(text):
    """Yield 'ip:port' strings from a xicidaili listing page."""
    pattern = '<tr class="odd">.*?<td>(.*?)</td>.*?<td>(.*?)</td>'
    for ip_port in re.findall(pattern, text, re.S):
        yield ':'.join(ip_port)


def test_ip(ip):
    """Probe *ip* as an https proxy; return it when usable, else None."""
    print('测试ip: %s' % ip)
    try:
        # Echo site used purely to check the proxy relays traffic (200 = OK).
        checker = requests.get(
            'https://www.ipip.net/',
            headers=HEADERS,
            proxies={'https': ip},
            timeout=1,
        )
        if checker.status_code == 200:
            return ip
    # Dead or too-slow proxies raise; report and implicitly return None.
    except Exception as e:
        print(e)


def spider_nba(good_ip):
    """Fetch the NBA China homepage through *good_ip* and print it."""
    response = requests.get(
        'https://china.nba.com/',
        headers=HEADERS,
        proxies={'https': good_ip},
    )
    print(response.status_code)
    print(response.text)


if __name__ == '__main__':
    base_url = 'https://www.xicidaili.com/nn/{}'
    for line in range(1, 3677):
        page = get_index(base_url.format(line))
        for ip in parse_index(page.text):
            # print(ip)
            good_ip = test_ip(ip)
            if good_ip:
                # A live proxy — put it to work.
                spider_nba(good_ip)
"""selenium notes:

1. What is selenium?
   Originally an automated-testing tool; it drives a real browser and runs
   scripted actions in the page (execute JS, step past login checks, ...).
2. Why use it?
   (+) Avoids the complex request/flow analysis a pure-requests login needs.
   (-) Slower than requests: the browser loads css/js/images/video.
3. Install:
   pip3 install -i https://pypi.tuna.tsinghua.edu.cn/simple selenium
   driver download: http://npm.taobao.org/mirrors/chromedriver/2.38/
"""
from selenium import webdriver  # drives the browser
from selenium.webdriver import ActionChains  # action chains (slider captchas, drags)
from selenium.webdriver.common.by import By  # lookup strategies: By.ID, By.CSS_SELECTOR, ...
from selenium.webdriver.common.keys import Keys  # keyboard key constants
from selenium.webdriver.support import expected_conditions as EC  # wait conditions
from selenium.webdriver.support.wait import WebDriverWait  # explicit waits
import time

# (Earlier commented-out demo: open baidu, wait for the #kw input, search
# for a term and press ENTER — same explicit-wait pattern as below.)

# Demo: search JD for a product and click the search button.
chrome = webdriver.Chrome()
try:
    # Explicit wait bound to this driver: poll up to 10s per condition.
    wait = WebDriverWait(chrome, 10)

    # 1. Open the JD homepage.
    chrome.get('https://www.jd.com/')

    # 2. Wait until the search box (id="key") is present in the DOM.
    input_tag = wait.until(EC.presence_of_element_located((By.ID, 'key')))

    # 3. Type the search term into it.
    input_tag.send_keys("唐诗三百首")

    # 4. Wait for the search button (class="button"), then click it.
    search_button = wait.until(
        EC.presence_of_element_located((By.CLASS_NAME, 'button')))
    search_button.click()

    time.sleep(3)
finally:
    # Always close the browser window.
    chrome.close()
"""Demo of selenium's find_element_by_* locator family against baidu.com."""
from selenium import webdriver
import time

driver = webdriver.Chrome()
try:
    driver.get('https://www.baidu.com/')
    # Implicit wait: each lookup below polls up to 10s before raising.
    driver.implicitly_wait(10)

    # 1. find_element_by_link_text — exact link text, e.g. the login link:
    # send_tag = driver.find_element_by_link_text('登录')
    # send_tag.click()
    # time.sleep(1)

    # 2. find_element_by_id — lookup by the id attribute.

    # 3. find_element_by_class_name — lookup by ONE class name.
    # BUG FIX: the original passed the compound value
    # 'tang-pass-footerBarULogin pass-link'; class-name lookups must not
    # contain spaces (selenium raises an invalid-selector error). Use a
    # CSS selector requiring both classes instead.
    login_tag = driver.find_element_by_css_selector(
        '.tang-pass-footerBarULogin.pass-link')

    # 4. find_element_by_partial_link_text — substring of the link text.
    # 5. find_element_by_name — lookup by the name attribute.
    # 6. find_element_by_css_selector — arbitrary CSS selector.
    # 7. find_element_by_tag_name — lookup by tag name.
finally:
    driver.close()
"""Walk through every find_element_by_* locator by driving a baidu login.

=============== locator summary ===================
``find_element_*`` returns one tag; ``find_elements_*`` returns them all.
1. find_element_by_link_text          exact link text
2. find_element_by_id                 id attribute
3. find_element_by_class_name         a single class name
4. find_element_by_partial_link_text  substring of the link text
5. find_element_by_name               name attribute
6. find_element_by_css_selector       arbitrary CSS selector
7. find_element_by_tag_name           tag name
"""
from selenium import webdriver  # drives the browser
import time

# (Earlier commented-out demo: open china.nba.com with an implicit wait
# and look up the 'nav-news' element by class name.)

# Browser driver handle.
driver = webdriver.Chrome()
try:
    # Open baidu; every lookup below polls up to 10s (implicit wait).
    driver.get('https://www.baidu.com/')
    driver.implicitly_wait(10)

    # 1. find_element_by_link_text — exact text of the login link:
    # send_tag = driver.find_element_by_link_text('登录')
    # send_tag.click()

    # 2. Partial link text: any <a> whose text contains '登'.
    login_button = driver.find_element_by_partial_link_text('登')
    login_button.click()
    time.sleep(1)

    # 3. Class name: switch the dialog to username/password login.
    login_tag = driver.find_element_by_class_name('tang-pass-footerBarULogin')
    login_tag.click()
    time.sleep(1)

    # 4. Name attribute: the username input.
    username = driver.find_element_by_name('userName')
    username.send_keys('15622792660')
    time.sleep(1)

    # 5. Id attribute: the password input.
    password = driver.find_element_by_id('TANGRAM__PSP_10__password')
    password.send_keys('*******')
    time.sleep(1)

    # 6. CSS selector: the submit button by id
    #    (a class selector like '.pass-button-submit' would work too).
    login_submit = driver.find_element_by_css_selector('#TANGRAM__PSP_10__submit')
    login_submit.click()

    # 7. Tag name: the first <div> on the page.
    div = driver.find_element_by_tag_name('div')
    print(div.tag_name)

    time.sleep(10)
finally:
    # Always close the browser window.
    driver.close()
小结:
今天学到了好多知识,yep!