Study Notes: Scraping Wanfang Data

        I've recently been helping a professor with a project that needs a dataset of paper abstracts and keywords. Nothing I found online quite fit, so I wrote a small program to scrape some myself. I'm recording the program here for my own future reference and for anyone who needs it.

# coding:utf-8


import urllib2
import re
import time

# Build the list of result-page URLs for one subject category
def page_link_1(url_1):
    url_list = []
    for i in range(1, 500):  # result pages 1..499; the page number is appended to the base URL
        url_i = url_1 + str(i)
        url_list.append(url_i)
    return url_list
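# For example, if url_1 ends with a page-number parameter (hypothetical value):
#   page_link_1('http://s.g.wanfangdata.com.cn/Paper.aspx?q=...&p=')
#   returns ['...&p=1', '...&p=2', ..., '...&p=499']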

# From each result page, collect the URL of every paper's detail page
# (the page holding its abstract and keywords), keyed by paper title
def page_link_2(url_2_list, user_agent):
    paper_title_dict = {}
    headers = {'User-Agent': user_agent}
    i = 1
    for url_i in url_2_list:
        print 'fetching result page %d' % i
        i += 1
        request = urllib2.Request(url_i, headers=headers)
        html = urllib2.urlopen(request).read()
        title = re.findall(r'<div class="sortby_div">(.*?)<p class="pager_space">', html, re.I|re.S)
        if len(title) != 0:
            title_list = re.findall(r'<li class="title_li">(.*?)</li>', title[0], re.I|re.S)
            for each_title in title_list:
                # Strip the thumbnail anchor, then pull out (detail URL, title text)
                each_title_1 = re.findall(r'class="abs_img"></a>(.*?)</a>', each_title, re.I|re.S)[0]
                each_title_2 = re.findall(r'''<a href='(.*?)' target="_blank">(.*)''', each_title_1, re.I|re.S)[0]
                paper_title_dict[each_title_2[1]] = each_title_2[0]
        else:
            continue
        time.sleep(2)
    return paper_title_dict
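# For reference, the returned mapping looks like (illustrative values only):
#   paper_title_dict = {'<paper title>': '<detail-page URL>', ...}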

# Fetch each paper's detail page and extract its abstract and keywords
def page_link_3(url_3_dict):
    paper_list = []
    for each_title in url_3_dict:
        abstract_and_keywords = []
        request = urllib2.Request(url_3_dict[each_title])
        response = urllib2.urlopen(request).read()
        abstract_1 = re.findall('<div id="completeAbstract" style="display: none">(.*?)<p id="collapse">', response,
                              re.I | re.S)
        if len(abstract_1) == 0:
            continue
        abstract = abstract_1[0].strip()
        abstract_and_keywords.append(abstract)

        # The keyword row is labelled 关键词 ("keywords") in the page markup;
        # guard against pages with no keyword row instead of indexing blindly
        keywords_match = re.findall('<t>关键词</t>(.*?)</tr>', response, re.I | re.S)
        if len(keywords_match) == 0:
            continue
        keywords_list = re.findall('<a href=(.*?)>(.*?)</a>', keywords_match[0], re.I | re.S)
        for each in keywords_list:
            # Keep a keyword only if its anchor text contains no leftover markup
            string = re.findall('<(.*)>', each[1], re.I | re.S)
            if len(string) == 0:
                abstract_and_keywords.append(each[1].strip())
        paper_list.append(abstract_and_keywords)
        time.sleep(2)
    return paper_list
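# For reference, the returned list looks like (illustrative values only):
#   paper_list = [['<abstract>', '<keyword 1>', '<keyword 2>', ...], ...]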


if __name__ == '__main__':
    user_agent = 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:47.0) Gecko/20100101 Firefox/47.0'

    # Change this URL to your own entry point; I used the Wanfang Data link
    # my university library subscribes to
    url_1 = 'http://s.g.wanfangdata.com.cn/Paper.aspx?q=%E4%B8%93%E4%B8%9A%3a%22%E4%B8%AD%E5%9B%BD%E8%AF%AD%E8%A8%'

    url_1_list = page_link_1(url_1)
    paper_title_dict = page_link_2(url_1_list, user_agent)
    paper_list = page_link_3(paper_title_dict)
    print 'the number of papers is %d' % len(paper_list)
    k = 0  # counts the records actually written

    # Write each paper as one line: abstract then keywords, each field followed by '+'
    with open('paper_abstract_keywords_language.txt', 'a') as f:
        for each_paper in paper_list:
            if len(each_paper) != 0:
                for field in each_paper:
                    f.write(field + '+')
                f.write('\n')
                k += 1
    print 'the number of written papers is %d' % k
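
The script above is Python 2 code (urllib2 and print statements). For Python 3, the same request pattern looks roughly like the sketch below; the helper name fetch and the UTF-8 decoding are my own assumptions, and it has not been tested against the current Wanfang site.

import urllib.request

def fetch(url, user_agent):
    # Python 3 counterpart of the urllib2 calls used above;
    # assumes the response body is UTF-8 encoded
    request = urllib.request.Request(url, headers={'User-Agent': user_agent})
    return urllib.request.urlopen(request).read().decode('utf-8')

Reading the output file back is straightforward, since each record is one line with the abstract first and the keywords after it, every field followed by a '+' (this assumes no abstract or keyword itself contains a '+'):

with open('paper_abstract_keywords_language.txt') as f:
    for line in f:
        fields = line.rstrip('\n').split('+')[:-1]  # drop the empty piece after the trailing '+'
        if fields:
            abstract, keywords = fields[0], fields[1:]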

      


Reposted from blog.csdn.net/zhangye_2017/article/details/78835902