# Website: https://www.genome.jp//kegg-bin/show_organism?org=??
# -*- coding: utf-8 -*-
# time: 2021/8/18
# crawler
from time import sleep
import requests
from fake_useragent import UserAgent
from lxml import etree
import re
# other
from csv import reader
import pandas
def crawler(paramsList, contentContainer):
    """Scrape KEGG organism pages and collect one field->value dict per organism.

    Parameters
    ----------
    paramsList : list
        KEGG organism codes substituted into the query URL (org={}).
    contentContainer : list
        Accumulator that scraped dicts are appended to (mutated in place).

    Returns
    -------
    list
        The same ``contentContainer``, with one dict appended per organism.
    """
    print("start crawler...")
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36',
    }
    baseUrl = 'https://www.genome.jp//kegg-bin/show_organism?org={}'
    # Hoisted out of the loop: strips any HTML tag left after the literal replaces.
    tagPattern = re.compile('<[^>]*>')
    for param in paramsList:
        print('the data for {}...'.format(param))
        # NOTE(review): verify=False skips TLS certificate validation; kept
        # because the site's certificate chain fails validation (see the
        # note at the bottom of this file).
        response = requests.get(url=baseUrl.format(param), headers=headers, verify=False)
        tree = etree.HTML(response.text)
        # Row labels: <td nowrap> cells; strip the 4-NBSP indent padding.
        index = tree.xpath('//tr/td/table//tr//td[@nowrap]//text()')
        index = [label.replace('\xa0\xa0\xa0\xa0', '') for label in index]
        # Row values: second <td> of each row, serialized and de-tagged.
        cells = tree.xpath('//tr/td/table//tr/td[2]')
        value = []
        for cell in cells:
            text = str(etree.tostring(cell), 'utf-8')
            text = text.replace('<td>', '').replace('</td>', '').replace('<br/>', '')
            text = text.replace(' ', ';')
            # NOTE(review): this second replace is a no-op after the one above
            # consumed every plain space — possibly the first was meant to
            # target a double space or NBSP; confirm against live page markup.
            text = text.replace(' ', '')
            value.append(tagPattern.sub('', text))
        # zip() stops at the shorter sequence, so a page with fewer value
        # cells than labels no longer raises IndexError as the original
        # parallel-index loop did.
        dictContent = dict(zip(index, value))
        print(dictContent)
        contentContainer.append(dictContent)
        # sleep(1)
    return contentContainer
def main():
    """Read organism codes from ./kegg.csv, scrape each, and save as CSV.

    The first column of every row in kegg.csv is taken as the organism code.
    Results are written to './data' (path kept as in the original script;
    note it has no .csv extension).
    """
    csvFile = './kegg.csv'
    paramsList = list()
    # newline='' is what the csv module expects for correct quoting/newlines;
    # 'with' closes the file, so no explicit f.close() is needed.
    with open(csvFile, 'rt', newline='') as f:
        for row in reader(f, delimiter=','):
            # Guard against blank lines, which would raise IndexError on row[0].
            if row:
                paramsList.append(row[0])
    contentContainer = list()
    data = crawler(paramsList=paramsList, contentContainer=contentContainer)
    dataCsv = pandas.DataFrame(data)
    dataCsv.to_csv('./data', index=False)
    print("crawler over!")


if __name__ == '__main__':
    main()
# Certificate-verification errors occur here, so verify is set to False.
# Without verify=False the client may raise an error when receiving the response.