Study Notes: Scraping Wanfang Data

        I've recently been helping a professor with a project that needs a dataset of paper abstracts and keywords. Nothing I found online quite fit, so I wrote a small program to scrape some myself. I'm recording the program here for my own future reference and for anyone who needs it.

# coding:utf-8


import urllib2
import re
import time

# Build the list of result-page URLs for one subject category
def page_link_1(url_1):
    url_list = []
    for i in range(1, 500):  # result pages 1..499; the page number is appended to the base URL
        url_i = url_1 + str(i)
        url_list.append(url_i)
    return url_list
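# For example, if url_1 ends with a page-number parameter (hypothetical value):
#   page_link_1('http://s.g.wanfangdata.com.cn/Paper.aspx?q=...&p=')
#   returns ['...&p=1', '...&p=2', ..., '...&p=499']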

# From each result page, collect the URL of every paper's detail page
# (the page holding its abstract and keywords), keyed by paper title
def page_link_2(url_2_list, user_agent):
    paper_title_dict = {}
    headers = {'User-Agent': user_agent}
    i = 1
    for url_i in url_2_list:
        print 'fetching result page %d' % i
        i += 1
        request = urllib2.Request(url_i, headers=headers)
        html = urllib2.urlopen(request).read()
        title = re.findall(r'<div class="sortby_div">(.*?)<p class="pager_space">', html, re.I|re.S)
        if len(title) != 0:
            title_list = re.findall(r'<li class="title_li">(.*?)</li>', title[0], re.I|re.S)
            for each_title in title_list:
                # Strip the thumbnail anchor, then pull out (detail URL, title text)
                each_title_1 = re.findall(r'class="abs_img"></a>(.*?)</a>', each_title, re.I|re.S)[0]
                each_title_2 = re.findall(r'''<a href='(.*?)' target="_blank">(.*)''', each_title_1, re.I|re.S)[0]
                paper_title_dict[each_title_2[1]] = each_title_2[0]
        else:
            continue
        time.sleep(2)
    return paper_title_dict
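# For reference, the returned mapping looks like (illustrative values only):
#   paper_title_dict = {'<paper title>': '<detail-page URL>', ...}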

# Fetch each paper's detail page and extract its abstract and keywords
def page_link_3(url_3_dict):
    paper_list = []
    for each_title in url_3_dict:
        abstract_and_keywords = []
        request = urllib2.Request(url_3_dict[each_title])
        response = urllib2.urlopen(request).read()
        abstract_1 = re.findall('<div id="completeAbstract" style="display: none">(.*?)<p id="collapse">', response,
                              re.I | re.S)
        if len(abstract_1) == 0:
            continue
        abstract = abstract_1[0].strip()
        abstract_and_keywords.append(abstract)

        # The keyword row is labelled 关键词 ("keywords") in the page markup;
        # guard against pages with no keyword row instead of indexing blindly
        keywords_match = re.findall('<t>关键词</t>(.*?)</tr>', response, re.I | re.S)
        if len(keywords_match) == 0:
            continue
        keywords_list = re.findall('<a href=(.*?)>(.*?)</a>', keywords_match[0], re.I | re.S)
        for each in keywords_list:
            # Keep a keyword only if its anchor text contains no leftover markup
            string = re.findall('<(.*)>', each[1], re.I | re.S)
            if len(string) == 0:
                abstract_and_keywords.append(each[1].strip())
        paper_list.append(abstract_and_keywords)
        time.sleep(2)
    return paper_list
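# For reference, the returned list looks like (illustrative values only):
#   paper_list = [['<abstract>', '<keyword 1>', '<keyword 2>', ...], ...]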


if __name__ == '__main__':
    user_agent = 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:47.0) Gecko/20100101 Firefox/47.0'

    # Change this URL to your own entry point; I used the Wanfang Data link
    # my university library subscribes to
    url_1 = 'http://s.g.wanfangdata.com.cn/Paper.aspx?q=%E4%B8%93%E4%B8%9A%3a%22%E4%B8%AD%E5%9B%BD%E8%AF%AD%E8%A8%'

    url_1_list = page_link_1(url_1)
    paper_title_dict = page_link_2(url_1_list, user_agent)
    paper_list = page_link_3(paper_title_dict)
    print 'the number of papers is %d' % len(paper_list)
    k = 0  # counts the records actually written

    # Write each paper as one line: abstract then keywords, each field followed by '+'
    with open('paper_abstract_keywords_language.txt', 'a') as f:
        for each_paper in paper_list:
            if len(each_paper) != 0:
                for field in each_paper:
                    f.write(field + '+')
                f.write('\n')
                k += 1
    print 'the number of written papers is %d' % k
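
The script above is Python 2 code (urllib2 and print statements). For Python 3, the same request pattern looks roughly like the sketch below; the helper name fetch and the UTF-8 decoding are my own assumptions, and it has not been tested against the current Wanfang site.

import urllib.request

def fetch(url, user_agent):
    # Python 3 counterpart of the urllib2 calls used above;
    # assumes the response body is UTF-8 encoded
    request = urllib.request.Request(url, headers={'User-Agent': user_agent})
    return urllib.request.urlopen(request).read().decode('utf-8')

Reading the output file back is straightforward, since each record is one line with the abstract first and the keywords after it, every field followed by a '+' (this assumes no abstract or keyword itself contains a '+'):

with open('paper_abstract_keywords_language.txt') as f:
    for line in f:
        fields = line.rstrip('\n').split('+')[:-1]  # drop the empty piece after the trailing '+'
        if fields:
            abstract, keywords = fields[0], fields[1:]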

      


Reposted from blog.csdn.net/zhangye_2017/article/details/78835902