写一起太过冗长,那就分开写吧
PyQuery库
安装&调用
pip install pyquery
from pyquery import PyQuery
走你
'字符串初始化'
# Build a PyQuery document directly from an HTML string and query it with a CSS selector.
from pyquery import PyQuery as pq
html = '''
<select node-type="changeLanguage" suda-data="key=tblog_home_click&value=language_versions_click">
<option value="zh-cn" selected>中文(简体)</option>
<option value="zh-tw">中文(臺灣)</option>
<option value="zh-hk">中文(香港)</option>
<option value="en">English</option>
</select></p>
'''
temp = pq(html)
print(temp('option')) # print the <option> tags matched by the selector
-->
<option value="zh-cn" selected="selected">中文(简体)</option>
<option value="zh-tw">中文(臺灣)</option>
<option value="zh-hk">中文(香港)</option>
<option value="en">English</option>
'url初始化'
# Build a PyQuery document by passing a URL; the page is fetched and parsed.
from pyquery import PyQuery as pq
url = 'http://www.baidu.com'
temp = pq(url)  # was misspelled "temmp", so the print below never saw the fetched page
print(temp('head'))  # print the <head> element of the fetched page
-->
<head><meta http-equiv="content-type" content="text/html;charset=utf-8"/><meta http-equiv="X-UA-Compatible" content="IE=Edge"/><meta content="always" name="referrer"/><link rel="stylesheet" type="text/css" href="http://s1.bdstatic.com/r/www/cache/bdorz/baidu.min.css"/><title>百度一下,你就知道</title></head>
最基本的选择查找
# Basic selection: combine an id selector and a class selector (descendant lookup).
html = '''
<select id="changeLanguage" suda-data="key=tblog_home_click&value=language_versions_click">
<option class="zh-cn" selected>中文(简体)</option>
<option value="zh-tw">中文(臺灣)</option>
<option value="zh-hk">中文(香港)</option>
<option value="en">English</option>
</select></p>
'''
from pyquery import PyQuery as pq
temp = pq(html)
print(temp('#changeLanguage .zh-cn')) # the class=zh-cn tag nested under the tag with id=changeLanguage
-->
<option class="zh-cn" selected="selected">中文(简体)</option>
查找子元素
# Child lookup, step 1: select the <select> element by its id.
from pyquery import PyQuery as pq
html = '''
<select id="changeLanguage" suda-data="key=tblog_home_click&value=language_versions_click">
<option class="zh-cn" selected>中文(简体)</option>
<option value="zh-tw">中文(臺灣)</option>
<option value="zh-hk">中文(香港)</option>
<option value="en">English</option>
</select></p>
'''
temp = pq(html)
lis1 = temp('#changeLanguage')  # the element whose id is changeLanguage
print(lis1)
-->
<select id="changeLanguage" suda-data="key=tblog_home_click&value=language_versions_click">
<option class="zh-cn" selected="selected">中文(简体)</option>
<option value="zh-tw">中文(臺灣)</option>
<option value="zh-hk">中文(香港)</option>
<option value="en">English</option>
</select>
# Child lookup, step 2: search inside the previously selected element (lis1) for <option> tags.
lis2 = lis1.find('option')
print(lis2)
-->
<option class="zh-cn" selected="selected">中文(简体)</option>
<option value="zh-tw">中文(臺灣)</option>
<option value="zh-hk">中文(香港)</option>
<option value="en">English</option>
父元素查找
# Parent lookup: go from the <option> tags up to their enclosing element.
from pyquery import PyQuery as pq
html = '''
<select id="changeLanguage" suda-data="key=tblog_home_click&value=language_versions_click">
<option class="zh-cn" selected>中文(简体)</option>
<option value="zh-tw">中文(臺灣)</option>
<option value="zh-hk">中文(香港)</option>
<option value="en">English</option>
</select></p>
'''
temp = pq(html)
lis1 = temp('option')
parent = lis1.parent() # lis1.parents() can be used instead to look up all ancestor elements
print(parent)
-->
<select id="changeLanguage" suda-data="key=tblog_home_click&value=language_versions_click">
<option class="zh-cn" selected="selected">中文(简体)</option>
<option value="zh-tw">中文(臺灣)</option>
<option value="zh-hk">中文(香港)</option>
<option value="en">English</option>
</select>
兄弟标签
XXXX.siblings('关键字')
遍历
XXX.items()
获取信息
获取文本
# Text extraction: .text() on a multi-element selection yields the texts joined by spaces.
from pyquery import PyQuery as pq
html = '''
<select id="changeLanguage" suda-data="key=tblog_home_click&value=language_versions_click">
<option class="zh-cn" selected>中文(简体)</option>
<option value="zh-tw">中文(臺灣)</option>
<option value="zh-hk">中文(香港)</option>
<option value="en">English</option>
</select></p>
'''
temp = pq(html)
lis1 = temp('option').text()  # text of every <option>
print(lis1)
-->
中文(简体) 中文(臺灣) 中文(香港) English
# eq(1) narrows the selection to the element at zero-based index 1 (the second <option>).
lis2 = temp('option').eq(1).text()
print(lis2)
-->
中文(臺灣)
# 'option[class]' keeps only the <option> tags that carry a class attribute.
lis3 = temp('option[class]').text()
print(lis3)
-->
中文(简体)
获取属性
'遇到 class 用 . 遇到 id 用 #'
'目标:获取<option value="zh-tw">中文(臺灣)</option>中的value属性'
# Attribute access: first select the <option> tags that have a value attribute.
from pyquery import PyQuery as pq
html = '''
<select id="changeLanguage" suda-data="key=tblog_home_click&value=language_versions_click">
<option class="zh-cn" selected>中文(简体)</option>
<option value="zh-tw">中文(臺灣)</option>
<option value="zh-hk">中文(香港)</option>
<option value="en">English</option>
</select></p>
'''
temp = pq(html)
print(temp('option[value]')) # print the option tags that have a value attribute
-->
<option value="zh-tw">中文(臺灣)</option>
<option value="zh-hk">中文(香港)</option>
<option value="en">English</option>
print(temp('option[value]').eq(0).attr.value) #含有value属性的option标签有三个,eq(0)是选第一个标签,之后用attr选取其中的value属性
-->
zh-tw
print(temp('option[value]').eq(1).attr.value)
-->
zh-hk
DOM操作
增、删 addClass removeClass
# DOM manipulation setup: select the <option> that has a class attribute.
from pyquery import PyQuery as pq
html = '''
<select id="changeLanguage" suda-data="key=tblog_home_click&value=language_versions_click">
<option class="zh-cn" selected>中文(简体)</option>
<option value="zh-tw">中文(臺灣)</option>
<option value="zh-hk">中文(香港)</option>
<option value="en">English</option>
</select></p>
'''
temp = pq(html)
cla = temp('option[class]')
print(cla) # the original markup
-->
<option class="zh-cn" selected="selected">中文(简体)</option>
remcla = cla.removeClass("zh-cn") # remove "zh-cn" from the class attribute
print(remcla)
-->
<option class="" selected="selected">中文(简体)</option>
addcla = cla.addClass("zh-cn") # add "zh-cn" back to the class attribute
print(addcla)
-->
<option class="zh-cn" selected="selected">中文(简体)</option>
增、改 attr css
# Show the element before its attributes are edited.
print(cla)
-->
<option class="zh-cn" selected="selected">中文(简体)</option> #原代码
# attr(name, value) sets an attribute; here it adds a="b" to the element.
att = cla.attr('a','b')
print(att)
-->
<option class="zh-cn" selected="selected" a="b">中文(简体)</option> #加入a,b后代码
att1 = cla.attr('class','b') # overwrite the existing class attribute with "b"
print(att1) # the print was missing, so the output shown below was never produced
-->
<option class="b" selected="selected" a="b">中文(简体)</option> #修改class属性
style = cla.css('lala','hehe') # add a style attribute containing the declaration "lala: hehe"
print(style)
-->
<option class="b" selected="selected" a="b" style="lala: hehe">中文(简体)</option>
删除 remove(前面的修改指令是 removeClass)
# remove() deletes the matched elements from the document itself.
from pyquery import PyQuery as pq
html = '''
<select id="changeLanguage" suda-data="key=tblog_home_click&value=language_versions_click">
<option class="zh-cn" selected>中文(简体)</option>
<option value="zh-tw">中文(臺灣)</option>
<option value="zh-hk">中文(香港)</option>
<option value="en">English</option>
</select></p>
'''
temp = pq(html)
dell = temp.find('option[class]').remove()  # drop the <option> that has a class attribute
print(temp)  # the document no longer contains the removed <option>
-->
<select id="changeLanguage" suda-data="key=tblog_home_click&value=language_versions_click">
#少了一行
<option value="zh-tw">中文(臺灣)</option>
<option value="zh-hk">中文(香港)</option>
<option value="en">English</option>
</select>
伪类选择器
# Pseudo-class selectors: ':first-child' picks the first <option>.
from pyquery import PyQuery as pq
html = '''
<select id="changeLanguage" suda-data="key=tblog_home_click&value=language_versions_click">
<option class="zh-cn" selected>中文(简体)</option>
<option value="zh-tw">中文(臺灣)</option>
<option value="zh-hk">中文(香港)</option>
<option value="en">English</option>
</select></p>
'''
temp = pq(html)
fir = temp('option:first-child')
print(fir)
-->
<option class="zh-cn" selected="selected">中文(简体)</option> #打印第一个option
temp('option:last-child') # the last one
temp('option:nth-child(2)') # the second one
temp('option:gt(2)') # zero-based index greater than 2, i.e. skips the elements at index 0, 1 and 2
-->
<option value="en">English</option>
temp('option:gt(0)') # everything except the element at index 0
-->
<option value="zh-tw">中文(臺灣)</option>
<option value="zh-hk">中文(香港)</option>
<option value="en">English</option>
temp('option:nth-child(2n)') # the even-positioned tags (CSS nth-child positions are 1-based: 2nd, 4th, 6th, ...)
temp('option:contains(关键字)') # the tags whose content contains the keyword
selenium库
自动化测试工具
解决爬虫的js渲染的问题
基本操作
# Basic Selenium workflow: open Baidu, search for "Python", wait for the
# results to load, then dump the URL, cookies and page source.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
browser = webdriver.Chrome()  # driver object controlling a Chrome window
try:
    url = 'https://www.baidu.com'
    browser.get(url)  # navigate to the URL
    # Locate the search box (id "kw"); named search_box to avoid shadowing the builtin input().
    search_box = browser.find_element(By.ID, 'kw')
    search_box.send_keys('Python')  # type the keyword
    search_box.send_keys(Keys.ENTER)  # Keys.ENTER is the Enter key
    wait = WebDriverWait(browser, 10)
    # Block (up to 10 s) until the element with id "content_left" is present.
    wait.until(EC.presence_of_element_located((By.ID, 'content_left')))
    print(browser.current_url)
    print(browser.get_cookies())
    print(browser.page_source)  # page source
finally:
    # Always close the window, even if a lookup or the wait above fails.
    browser.close()
-->
https://www.baidu.com/s?ie=utf-8&f=8&rsv_bp=1&rsv_idx=1&tn=baidu&wd=Python&rsv_pq=a6b5304700084dbf&rsv_t=dd6dDj90wfh%2F0Id0HfaFrnzUALKv%2Bg9e%2BrAiJ%2BZpwh%2BVZfgw%2BJHHqS2kFOw&rqlang=cn&rsv_enter=1&rsv_sug3=6&rsv_sug2=0&inputT=157&rsv_sug4=157 #current_url
[{'domain': '.baidu.com', 'httpOnly': False, 'name': 'H_PS_PSSID', 'path': '/', 'secure': False, 'value': '1444_28777_21098_28775_28724_28839_28585_28604_22160'}, {'domain': '.baidu.com', 'httpOnly': False, 'name': 'delPer', 'path': '/', 'secure': False, 'value': '0'}, {'domain': '.baidu.com', 'expiry': 3703655028.667185, 'httpOnly': False, 'name': 'BAIDUID', 'path': '/', 'secure': False, 'value': '7F845AF3F32D33D833EFABFC88D4D009:FG=1'}, {'domain': '.baidu.com', 'expiry': 3703655028.667236, 'httpOnly': False, 'name': 'BIDUPSID', 'path': '/', 'secure': False, 'value': '7F845AF3F32D33D833EFABFC88D4D009'}, {'domain': '.baidu.com', 'expiry': 3703655028.66726, 'httpOnly': False, 'name': 'PSTM', 'path': '/', 'secure': False, 'value': '1556171380'}, {'domain': 'www.baidu.com', 'httpOnly': False, 'name': 'BD_HOME', 'path': '/', 'secure': False, 'value': '0'}, {'domain': '.baidu.com', 'expiry': 1556257784.182555, 'httpOnly': False, 'name': 'BDORZ', 'path': '/', 'secure': False, 'value': 'B490B5EBF6F3CD402E515D22BCDA1598'}, {'domain': 'www.baidu.com', 'expiry': 1557035382, 'httpOnly': False, 'name': 'BD_UPN', 'path': '/', 'secure': False, 'value': '12314353'}, {'domain': 'www.baidu.com', 'httpOnly': False, 'name': 'BD_CK_SAM', 'path': '/', 'secure': False, 'value': '1'}, {'domain': '.baidu.com', 'httpOnly': False, 'name': 'PSINO', 'path': '/', 'secure': False, 'value': '1'}, {'domain': 'www.baidu.com', 'httpOnly': False, 'name': 'BDSVRTM', 'path': '/', 'secure': False, 'value': '731'}, {'domain': 'www.baidu.com', 'expiry': 1556173976, 'httpOnly': False, 'name': 'H_PS_645EC', 'path': '/', 'secure': False, 'value': '49c1oN5vV6lzznYyhwr%2F6WBMA1K2llgLh7Zv98vKt2Knh478J2E8jbbSaFs'}] #cookies
<!DOCTYPE html><!--STATUS OK--><html xmlns="http://www.w3.org/1999/xhtml"><head><script charset="utf-8" async="" src="https://ss0.bdstatic.com/-0U0bnSm1A5BphGlnYG/tam-ogel/5d4e9b24-dcc5-483a-b6da-be1e9e621891.js"></script>
<meta http-equiv="content-type" content="text/html;charset=utf-8" /><style data-for="result" id="css_result" type="text/css">body{color:#333;background:#fff;padding:6px 0 0;margin:0;position:relative;min-width:900px}body,th,td,.p1,.p2{font-family:arial}p,form,ol,ul,li,dl,dt,dd,h3{margin:0;padding:0;list-style:none}input{padding-top:0;padding-bottom:0;-moz-box-sizing:border-box;-webkit-box-sizing:border-box;box-sizing:border-box}table,img{border:0}td{font-size:9pt;line-height:18px}em{font-style:normal;color:#c00}a em{text-decoration:underline}cite{font-style:normal;color:green}.m,a.m{color:#666}a.m:visited{color:#606}.g,a.g{color:green}.c{color:#77c}.f14{font-size:14px}.f10{font-size:10.5pt}.f16{font-size:16px}.f13{font-size:13px}.bg{background-image:url(https://ss1.bdstatic.com/5eN1bjq8AAUYm2zgoY3K/r/www/cache/static/protocol/https/global/img/icons_5859e57.png);_background-image:url(https://ss1.bdstatic.com/5eN #源码
只复制了部分
声明浏览器对象
# Declaring a browser object: one driver class per browser. Each line below
# rebinds `browser`; in practice pick the one you need.
from selenium import webdriver
browser = webdriver.Chrome() # the various browsers
browser = webdriver.Firefox()
browser = webdriver.Edge()
# NOTE(review): PhantomJS support appears to have been dropped by newer Selenium releases — confirm against the installed version.
browser = webdriver.PhantomJS()
browser = webdriver.Safari()
访问页面
# Visit a page and print its source.
from selenium import webdriver
browser = webdriver.Chrome()
url = 'http://www.taobao.com'
browser.get(url)  # load the page
print(browser.page_source)  # source of the loaded page
browser.close()
--> 部分结果
<!DOCTYPE html><html xmlns="http://www.w3.org/1999/xhtml" lang="zh-CN" class="ks-webkit537 ks-webkit ks-chrome73 ks-chrome"><head><script charset="utf-8" src="https://tce.taobao.com/api/mget.htm?callback=jsonpXctrl107&tce_sid=1947787&tce_vid=0&tid=&tab=&topic=&count=&env=online&cna=undefined" async=""></script><script src="https://ald.taobao.com/recommend2.htm?appId=20140506002%2C20140506001%2C03014&_ksTS=1556172174027_94&callback=jsonp95" async=""></script><script src="https://textlink.simba.taobao.com/?name=tbhs&cna&nn=&count=13&pid=430266_1006&_ksTS=1556172173993_74&callback=jsonp75" async=""></script>
<meta charset="utf-8" />
<meta http-equiv="X-UA-Compatible" co
查找元素
单个元素
# Three ways to look up the same single element (the one with id "q").
# NOTE(review): the find_element_by_* helpers are legacy API — confirm they exist in the installed Selenium version.
from selenium import webdriver
url = 'https://www.taobao.com'
browser = webdriver.Chrome()
browser.get(url)
input_first = browser.find_element_by_id('q') # find_element_by_id: look up the element whose id is q
input_second = browser.find_element_by_css_selector('#q') # css selector: look up the element with id=q
input_third = browser.find_element_by_xpath('//*[@id="q"]') # xpath: look up the element whose id attribute is q
print(input_first,input_second,input_third)
browser.close()
-->
<selenium.webdriver.remote.webelement.WebElement (session="f0cdb89a7f77fa887553eff4ecfbe60a", element="0.5708762229589062-1")> <selenium.webdriver.remote.webelement.WebElement (session="f0cdb89a7f77fa887553eff4ecfbe60a", element="0.5708762229589062-1")> <selenium.webdriver.remote.webelement.WebElement (session="f0cdb89a7f77fa887553eff4ecfbe60a", element="0.5708762229589062-1")>
单个元素的通用查找方式
# Generic single-element lookup: find_element(strategy, value) with a By strategy.
from selenium import webdriver
from selenium.webdriver.common.by import By
url = 'http://www.taobao.com'
browser = webdriver.Chrome()
browser.get(url)
input_first = browser.find_element(By.ID,'q')  # same element as the id lookup, via By.ID
print(input_first)
browser.close()
-->
<selenium.webdriver.remote.webelement.WebElement (session="2cbc003859a694da29b57bc925f5c0a0", element="0.47061304187449315-1")>
查找多个元素
# Multi-element lookup: find_elements returns a list of every match.
from selenium import webdriver
from selenium.webdriver.common.by import By
url = 'http://www.taobao.com'  # was misspelled "tabao.com", which is a different site
browser = webdriver.Chrome()
browser.get(url)
li = browser.find_elements(By.CSS_SELECTOR,'.service-bd li')  # all <li> under class "service-bd"
print(li)
browser.close()
-->部分结果
[<selenium.webdriver.remote.webelement.WebElement (session="4c13a3e387cbaf384f1b903d3adedc23", element="0.38451802051432304-1")>, <selenium.webdriver.remote.webelement.WebElement (session="4c13a3e387cbaf384f1b903d3adedc23", element="0.38451802051432304-2")>, <selenium.webdriver.remote.webelement.WebElement (session="4c13a3e387cbaf384f1b903d3adedc23", element="0.38451802051432304-3")>, <selenium.webdriver.remote.webelement.WebElement (session="4c13a3e387cbaf384f1b903d3adedc23", element="0.38451802051432304-4")>, <selenium.webdriver.remote.webelement.WebElement (session="4c13a3e387cbaf384f1b903d3adedc23", element="0.3845
元素交互操作
# Element interaction: type into the search box, clear it, retype, then click search.
from selenium import webdriver
from selenium.webdriver.common.by import By  # was missing although By.ID is used below
import time
browser = webdriver.Chrome()
browser.get('https://www.taobao.com')
# The search box has id "q"; named search_box to avoid shadowing the builtin input().
search_box = browser.find_element(By.ID, 'q')
search_box.send_keys('iphone')
time.sleep(1)  # pause for 1 s
search_box.clear()  # empty the search box
search_box.send_keys('ipad')
# "btn-search" is the class name of the search button; the By-based lookup
# matches the style used for the search box above.
button = browser.find_element(By.CLASS_NAME, 'btn-search')
button.click()
交互动作
# Action chains: drag one element onto another inside an iframe.
from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.webdriver.common.by import By  # was missing although By.ID is used below
browser = webdriver.Chrome()
browser.get('https://www.runoob.com/try/try.php?filename=jqueryui-api-droppable')
browser.switch_to.frame('iframeResult')  # the demo elements live inside this frame
source = browser.find_element(By.ID,'draggable')
target = browser.find_element(By.ID,'droppable')
actions = ActionChains(browser)  # declare the action chain
actions.drag_and_drop(source,target)  # queue the drag-and-drop
actions.perform()  # execute the queued actions
执行js
实现滚动条下拉(滚动到页面底部)
# Run JavaScript in the page: scroll to the bottom, then show an alert.
from selenium import webdriver
browser = webdriver.Chrome()
browser.get('https://www.zhihu.com')
browser.execute_script('window.scrollTo(0,document.body.scrollHeight)')  # scroll to the full document height
browser.execute_script('alert("To Bottom")')  # pop up an alert dialog
获取元素信息
获取属性
# Read an attribute from an element located by XPath.
from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.webdriver.common.by import By  # was missing although By.XPATH is used below
browser = webdriver.Chrome()
browser.get('https://www.zhihu.com/explore')
logo = browser.find_element(By.XPATH,'//a[@class="zu-top-link-logo"]')
print(logo)
print(logo.get_attribute('id'))  # value of the element's id attribute
-->
<selenium.webdriver.remote.webelement.WebElement (session="329421fa6d6f9596906140f0a924a4a4", element="0.5485149819316133-1")>
zh-top-link-logo
获取文本、ID、位置、标签名、大小
# Print text, internal id, location, tag name and size of each navigation link.
from selenium import webdriver
from selenium.webdriver.common.by import By
browser = webdriver.Chrome()
browser.get('https://www.zhihu.com/explore')
nav_links = browser.find_elements(By.XPATH,'//li/a[@class="zu-top-nav-link"]')
# The loop body was not indented in the original, which is a syntax error.
for link in nav_links:
    print(link.text)
    print(link.id)
    print(link.location)
    print(link.tag_name)
    print(link.size)
-->
首页
0.4477185237510821-1
{'x': 486, 'y': 0}
a
{'height': 45, 'width': 54}
话题
0.4477185237510821-2
{'x': 540, 'y': 0}
a
{'height': 45, 'width': 54}
发现
0.4477185237510821-3
{'x': 594, 'y': 0}
a
{'height': 45, 'width': 54}
元素等待
隐式等待
'如果查找元素的时候,没有立即得到响应,隐式等待会等待一段时间再查找DOM,默认时间为0'
# Implicit wait: element lookups are given up to 10 seconds before failing.
from selenium import webdriver
from selenium.webdriver.common.by import By
browser = webdriver.Chrome()
browser.implicitly_wait(10)  # set before any lookup
browser.get('https://www.baidu.com')
style = browser.find_element(By.XPATH,'//area[@style="outline:none;"]')
print(style)
-->
<selenium.webdriver.remote.webelement.WebElement (session="95a2fbc651f41f0604be1d5b385bc56a", element="0.5739750677363769-1")>
显式等待
# Explicit wait: block until a specific condition holds for a specific element.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
browser = webdriver.Chrome()
browser.get('http://www.taobao.com')
wait = WebDriverWait(browser, 10)  # wait at most 10 s per condition
# The original used typographic quotes around q, which is a syntax error.
# Named search_box to avoid shadowing the builtin input().
search_box = wait.until(EC.presence_of_element_located((By.ID, 'q')))
button = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, '.btn-search')))
print(search_box, button)
-->
<selenium.webdriver.remote.webelement.WebElement (session="b2ff6e85088f6f38025aef18a6a79a17", element="0.7102273460896695-1")> <selenium.webdriver.remote.webelement.WebElement (session="b2ff6e85088f6f38025aef18a6a79a17", element="0.7102273460896695-2")>
前进、后退
time.sleep(1) # wait 1 s
browser.forward() # go forward
browser.back() # go back
cookies
browser.get_cookies()  # read all cookies
# The method is add_cookie (singular) and the dict must carry 'name' and
# 'value' keys; the original called a non-existent add_cookies().
browser.add_cookie({'name':'name','value':'XX'})  # add one cookie given as a dict
browser.delete_all_cookies()  # remove every cookie
通用的切换选项卡
使用js和window.open完成切换
# Tab switching: open extra tabs with window.open() and move between them by handle.
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
browser = webdriver.Chrome()
browser.get('https://www.baidu.com')  # first tab: open Baidu
browser.execute_script('window.open()')  # open a new tab
browser.execute_script('window.open()')  # open one more for practice
print(browser.window_handles)
# switch_to.window replaces the removed switch_to_window helper and matches
# the switch_to.frame usage elsewhere in these notes.
browser.switch_to.window(browser.window_handles[2])  # switch to the third tab
browser.get('https://www.taobao.com')  # URL was missing "//" after the scheme
time.sleep(1)  # was written as sleep.time(1), which raises NameError
browser.switch_to.window(browser.window_handles[0])  # switch back to the first tab
browser.get('https://python.org')  # load python.org in the first tab