# GitHub crawler
import requests
from bs4 import BeautifulSoup
# Log in to GitHub via the HTML form and print the collaborated-repos notice.
# NOTE(review): GitHub's login flow changes over time (and enforces additional
# checks); this scripted form login may no longer succeed — educational only.
with requests.Session() as session:
    # Step 1: fetch the login page. A Session tracks cookies automatically,
    # so no manual get_dict()/update() cookie merging is needed.
    login_page = session.get('https://github.com/login', timeout=10)
    login_page.raise_for_status()
    soup = BeautifulSoup(login_page.text, features='lxml')

    # The CSRF token is a hidden <input name="authenticity_token"> in the form.
    tag = soup.find(name='input', attrs={'name': 'authenticity_token'})
    if tag is None:
        raise RuntimeError('authenticity_token not found on the login page')
    authenticity_token = tag.get('value')

    # Step 2: submit the login form. Credentials below are placeholders —
    # replace 'login' / 'password' with real values before running.
    form_data = {
        'authenticity_token': authenticity_token,
        'commit': 'Sign in',
        'utf8': '✓',
        'login': 'fds',
        'password': 'fdsa',
        'webauthn-support': 'supported',
    }
    session.post(url='https://github.com/session', data=form_data, timeout=10)

    # Step 3: request an authenticated settings page using the session cookies.
    repos_page = session.get(url='https://github.com/settings/repositories',
                             timeout=10)
    soup2 = BeautifulSoup(repos_page.text, features='lxml')
    list_group = soup2.find(name='div', class_='col-9 float-left')
    if list_group is None:
        raise RuntimeError('repository container not found — login may have failed')
    p_list = list_group.find(name='p', class_='js-collaborated-repos-empty')
    # Guard against layout changes: fall back to the container text rather
    # than crashing with AttributeError on None.
    print(p_list.text if p_list is not None else list_group.text)
# In the code above, the key step is extracting the authenticity_token
# (CSRF token) from the login form in the page source; the remaining
# requests reuse it together with the session cookies.