在许多网站中,有些页面需要登录后才能看到,或者页面内容要靠浏览器执行静态 JavaScript 渲染之后才能获取。
要抓取这类内容,我们就需要模拟登录,或者把爬虫伪装成浏览器。
requests(比较复杂)
比如模拟登录 GitHub 网站:
import requests
from bs4 import BeautifulSoup


# Simulate a plain HTML form submission (no browser involved).
def main():
    """Log in to GitHub by POSTing the login form with requests.

    Fetches the login page, extracts the hidden anti-CSRF fields and the
    session cookies, then submits the credentials to /session and prints
    the response body. Returns None; exits early on any failure.
    """
    resp = requests.get('https://github.com/login')
    if resp.status_code != 200:
        return
    # The cookies issued with the login page must accompany the POST,
    # otherwise GitHub rejects the session request.
    cookies = resp.cookies.get_dict()
    soup = BeautifulSoup(resp.text, 'lxml')
    # Hidden form fields GitHub uses for CSRF protection; the form is
    # rejected without them. Guard against page-layout changes instead
    # of crashing with AttributeError on a None result.
    utf8_input = soup.select_one('form input[name=utf8]')
    token_input = soup.select_one('form input[name=authenticity_token]')
    if utf8_input is None or token_input is None:
        return
    # Simulated login payload.
    data = {
        'utf8': utf8_input.attrs['value'],
        'authenticity_token': token_input.attrs['value'],
        'login': '登录帐号',
        'password': '密码'
    }
    resp = requests.post('https://github.com/session', data=data,
                         cookies=cookies)
    print(resp.text)


if __name__ == '__main__':
    main()
robobrowser
已经有人把上面这套流程封装成了现成的库,使用前需要先安装:
pip install robobrowser
它以更为简单清晰的方式实现了上面的功能:
import robobrowser


# Submit the login form through RoboBrowser instead of raw requests.
def main():
    """Log in to GitHub via RoboBrowser and print every link on the result page."""
    browser = robobrowser.RoboBrowser()
    browser.open('https://github.com/login')
    # RoboBrowser finds the form and carries its hidden fields for us.
    login_form = browser.get_form(action='/session')
    login_form['login'].value = '帐号'
    login_form['password'].value = '密码'
    browser.submit_form(login_form)
    # Print the href attribute of every <a> tag on the page.
    for anchor in browser.select('a[href]'):
        print(anchor.attrs['href'])


if __name__ == '__main__':
    main()
模拟打开浏览器
这里需要下载对应浏览器的驱动程序,并且要把它放到系统的 PATH 环境变量中才能运行:
pip install selenium
import time

from bs4 import BeautifulSoup
from selenium import webdriver
# Drive a real browser engine so JS-rendered content is available.
from selenium.webdriver.common.keys import Keys


def main():
    """Search Taobao live via a real Chrome browser and print image URLs.

    Requires the matching ChromeDriver binary to be on PATH.
    """
    # Launches Chrome through the downloaded driver.
    driver = webdriver.Chrome()
    try:
        # Open the target page.
        driver.get('https://v.taobao.com/v/content/live?spm=a21xh.11312869.liveList.6.75a8627fSrhNKg&catetype=704')
        # Locate the search box by its placeholder text.
        ele = driver.find_element_by_css_selector('input[placeholder="输入关键词搜索"]')
        # Type the keyword and submit, then collect image URLs from the results.
        ele.send_keys('运动')
        ele.send_keys(Keys.ENTER)
        # Give the page a moment to render the search results before scraping.
        # NOTE(review): a WebDriverWait on a result element would be more robust.
        time.sleep(2)
        # BUG FIX: BeautifulSoup needs the rendered HTML string, not the
        # WebDriver object itself.
        soup = BeautifulSoup(driver.page_source, 'lxml')
        for img_tag in soup.body.select('img[src]'):
            print(img_tag.attrs['src'])
    finally:
        # Always shut down the browser/driver process, even on error.
        driver.quit()


if __name__ == '__main__':
    main()