1.表格如果已经存在,就在原先的基础上继续添加新的,保持元数据的存在
2.如果表格不存在,就创建再添加元素
目录结构:
代码:
0211_wanfang.py
import re
import time
from bs4 import BeautifulSoup
import requests
from requests import RequestException
import conference_wf
def get_page(url):
    """Download one search-result page and return its decoded text.

    Returns None when the request fails or the server does not answer 200.
    """
    # Present a desktop-browser User-Agent so the site serves normal pages.
    ua = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'
    }
    try:
        resp = requests.get(url, headers=ua)
    except RequestException as err:
        print(err)
        return None
    if resp.status_code != 200:
        return None
    # Let requests sniff the real charset from the body before decoding.
    resp.encoding = resp.apparent_encoding
    return resp.text
def get_url(html):
    """Extract Wanfang document ids from a search-result page and build a
    detail-page URL for each.

    Args:
        html: Raw HTML of one search-result listing page.

    Returns:
        list[str]: One detail URL per id found, in page order; [] when the
        page contains no matches.
    """
    # Ids appear inside inline JS as:  ...(this.id,'<doc id>')...
    pattern = re.compile("this.id,'(.*?)'", re.S)
    # Fixed loop variable shadowing the builtin `id`; build in one pass.
    return ['http://www.wanfangdata.com.cn/details/detail.do?_type=conference&id=' + doc_id
            for doc_id in pattern.findall(html)]
def get_info(url):
    # Delegate fetching/parsing/saving of one detail page to the
    # conference_wf module (defined in the sibling file below).
    conference_wf.main(url)
if __name__ == '__main__':
    # Spoof a desktop browser so the site serves normal result pages.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'
    }
    key_word = input('请输入搜索关键词:')  # search keyword, read interactively
    # First result page to crawl, read interactively.
    start_page = int(input('请输入爬取的起始页:'))
    base_url = 'http://www.wanfangdata.com.cn/search/searchList.do?beetlansyId=aysnsearch&searchType=conference&pageSize=20&page={}&searchWord={}&showType=detail&order=common_sort_time&isHit=&isHitUnit=&firstAuthor=false&navSearchType=conference&rangeParame='
    first_url=base_url.format(start_page,key_word)
    htm1 = requests.get(first_url, headers=headers)
    soup = BeautifulSoup(htm1.text, 'html.parser')
    # Total page count scraped from the page is disabled; the crawl is
    # capped at a fixed 5 pages instead.
    #pagesum = soup.find('span', class_='searchPageWrap_all').get_text()
    pagesum=5
    # NOTE(review): range() excludes `pagesum`, so pages start_page..4 are
    # crawled — confirm the cap is intentional.
    for page in range(int(start_page), int(pagesum)):
        new_url = base_url.format(page, key_word)
        # Fetch the current listing page.
        html = get_page(new_url)
        # Extract every paper's detail-page URL from the listing.
        url_list = get_url(html)
        for url in url_list:
            # Scrape and persist each paper's detail record.
            get_info(url)
            time.sleep(2)  # throttle: 2 s between detail requests
conference_wf.py
import os
import re
import requests
import xlrd
import xlutils.copy
import xlwt
from bs4 import BeautifulSoup
from requests import RequestException
def get_html(url):
    """Fetch one detail page and return its text decoded as UTF-8.

    Returns None when the request raises or the status is not 200.
    """
    # Masquerade as a desktop browser via the User-Agent header.
    browser_headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'
    }
    try:
        resp = requests.get(url, headers=browser_headers)
    except RequestException as err:
        print(err)
        return None
    if resp.status_code != 200:
        return None
    # Detail pages are served as UTF-8; decode accordingly.
    resp.encoding = 'utf-8'
    return resp.text
def parse_html(html, url):
    """Parse a Wanfang conference-paper detail page into a flat record.

    Args:
        html: Raw HTML of the detail page.
        url:  The page URL (stored as the last field of the record).

    Returns:
        list: [title, authors, unit, keywords, abstract, conference,
               date, online_date, url] — all strings. Missing fields are
               '' (the original left empty *lists* behind, which xlwt
               cannot write into a cell, and indexed [0] unguarded,
               raising IndexError on sparse pages).
    """
    soup = BeautifulSoup(html, 'lxml')
    # Title: the bold-styled heading element.
    found = soup.select('[style="font-weight:bold;"]')
    title = found[0].text if found else ''
    # Authors: links that open an author home page, joined with ';'.
    authors = ''
    for author_a in soup.select('[onclick^="authorHome"]'):
        authors = authors + author_a.text + ';'
    # First author's affiliation; '' when absent (was an empty list).
    found = soup.select('[class^="unit_nameType"]')
    unit = found[0].text if found else ''
    # Keywords: "knowledge map" links, joined with ';'.
    keywords = ''
    for word in soup.select(
            '[title="知识脉络分析"][href="#"][onclick^="wfAnalysis"]'):
        keywords = keywords + word.text + ';'
    # Abstract lives in a <textarea> inside the .abstract container.
    found = soup.select('.abstract')
    abstract_node = found[0].textarea if found else None
    abstract = abstract_node.text.strip() if abstract_node else ''
    # Conference name; '' when absent (was an unguarded [0]).
    found = soup.select('[href="#"][onclick^="searchResult"]')
    conference = found[0].text if found else ''
    print(conference)
    # Conference date, scraped with a regex against the raw HTML.
    matches = re.compile('会议时间.*?<div class="info_right">(.*?)</div>', re.S).findall(html)
    date = matches[0].strip() if matches else ''
    # Online publication date; '' when absent (was an empty list).
    matches = re.compile('在线出版日期.*?<div class="info_right author">(.*?)</div>', re.S).findall(html)
    online_date = matches[0].strip() if matches else ''
    paper = [title, authors, unit, keywords, abstract, conference, date,
             online_date, url]
    print(paper)
    return paper
def save_p(paper):
    """Append one paper record to 会议论文.xls.

    Creates the workbook with a header row on first use; later calls keep
    the existing rows (and header) and write below them.
    """
    path = '会议论文.xls'
    if not os.path.exists(path):
        # First run: build the workbook and write the column headers.
        book = xlwt.Workbook()
        header_sheet = book.add_sheet('sheet1')
        columns = ['题目', '作者', '第一作者单位', '关键词', '摘要', '会议名称', '会议时间', '发表时间', '链接']
        for col, text in enumerate(columns):
            header_sheet.write(0, col, text)  # header goes in row 0
        book.save(path)
    # Re-open read-only to find the next free row, then copy into a
    # writable workbook — xlwt cannot edit a file in place.
    readonly = xlrd.open_workbook(path)
    next_row = readonly.sheet_by_index(0).nrows
    print(next_row)
    writable = xlutils.copy.copy(readonly)
    target = writable.get_sheet(0)
    for col, value in enumerate(paper):
        target.write(next_row, col, value)
    writable.save(path)
def main(url):
    """Crawl one paper detail page: fetch it, parse it, persist it."""
    page = get_html(url)            # request + response
    record = parse_html(page, url)  # extract the paper's fields
    save_p(record)                  # append to the .xls spreadsheet