爬虫【进阶】(套路二)

GitHub爬虫

import requests
from bs4 import BeautifulSoup

def github_login_demo(login='fds', password='fdsa'):
    """Log in to GitHub via the sign-in form and print the 'no collaborated
    repositories' notice from the repository-settings page.

    The credentials default to the values hard-coded in the original script,
    so running this file unchanged behaves as before.

    Raises:
        requests.HTTPError: if the login page cannot be fetched.
        RuntimeError: if an expected element is missing from the HTML
            (markup changed, or the login failed).
    """
    # A Session carries cookies forward automatically — including any set
    # during redirects on the login POST — instead of merging cookie dicts
    # by hand, which loses attributes and redirect-set cookies.
    session = requests.Session()

    # Step 1: fetch the login page to obtain the CSRF token that GitHub
    # embeds as a hidden <input name="authenticity_token"> in the form.
    resp = session.get('https://github.com/login')
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, features='lxml')
    token_tag = soup.find(name='input', attrs={'name': 'authenticity_token'})
    if token_tag is None:
        # Fail with a clear message instead of an AttributeError on .get()
        # when GitHub returns an error page or changes its markup.
        raise RuntimeError('authenticity_token not found on login page')

    form_data = {
        'authenticity_token': token_tag.get('value'),
        'commit': 'Sign in',
        'utf8': '✓',
        'login': login,
        'password': password,
        'webauthn-support': 'supported',
    }

    # Step 2: submit the credentials; the session keeps the new cookies.
    session.post('https://github.com/session', data=form_data)

    # Step 3: request a page that requires an authenticated session.
    resp = session.get('https://github.com/settings/repositories')
    soup2 = BeautifulSoup(resp.text, features='lxml')
    container = soup2.find(name='div', class_='col-9 float-left')
    if container is None:
        raise RuntimeError(
            'repository container not found — login may have failed')

    notice = container.find(name='p', class_='js-collaborated-repos-empty')
    if notice is None:
        raise RuntimeError('collaborated-repos notice not found on page')
    print(notice.text)


if __name__ == '__main__':
    github_login_demo()

在上面的代码中,关键的一步是从网页源代码的 form 表单中获取 authenticity_token(CSRF 令牌),并将其随登录表单一起提交。

猜你喜欢

转载自blog.csdn.net/weixin_43265998/article/details/89163362