[静态定向爬虫]远程教育杂志

远程教育杂志链接
http://dej.zjtvu.edu.cn/
2018年第二期第二刊:
http://dej.zjtvu.edu.cn//oa/darticle.aspx?type=view&id=201802002
分析页面…CTRL+U 没啥好分析的
用时间戳来区分不同期刊 像:201X0YZZZ
凑时间戳用了rjust方法,右对齐填充数字
一个静态页面
用BeautifulSoup库就可以完成
比较暴力
唯一的坑就是论文格式不是固定的..比如不是每篇文章都会有DOI 所以只设置了title的try-except 这是原来没有想到的
一开始因为自信没有用traceback调试 导致手动调试甚至用上了二分法调bug2333
然后traceback 3分钟就调完了所有bug orz
另外直接写到文件里,不需要用列表存字典,多余的操作

import traceback
import requests
from bs4 import BeautifulSoup

def getHTMLText(url):
    """Fetch *url* and return the decoded page text, or "" on any network error.

    A timeout is set so a stalled server cannot hang the whole crawl, and
    only requests' own exceptions are caught — a bug elsewhere (e.g. a
    typo in this function) is no longer silently swallowed.
    """
    try:
        r = requests.get(url, timeout=10)
        r.raise_for_status()
        # The site does not always declare its charset; let requests guess.
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        return ""

def parsePage(html, fpath):
    """Parse one article page and append its metadata (as a dict literal
    followed by a newline) to the file at *fpath*.

    Returns "" without writing anything when the title spans are absent,
    which is how a non-existent article id manifests.
    """
    soup = BeautifulSoup(html, 'html.parser')

    def grab(span_id):
        # Safely extract the text of <span id=span_id>.  get_text() always
        # returns a str (unlike .string, which is None for spans with mixed
        # children), so later concatenation can never raise TypeError.
        # Missing optional spans (e.g. not every article has a DOI) yield "".
        tag = soup.find('span', attrs={'id': span_id})
        return tag.get_text() if tag is not None else ''

    # The Chinese/English title spans are the only mandatory fields: when
    # they are missing, the page is not a real article — bail out early.
    title_c = soup.find('span', attrs={'id': 'LbTitleC'})
    title_e = soup.find('span', attrs={'id': 'LbTitleE'})
    if title_c is None or title_e is None:
        return ""

    record = {}  # renamed from `dict` to stop shadowing the builtin
    record['论文名称'] = title_c.get_text()
    record['title'] = title_e.get_text()
    record['作者'] = grab('LbAuthorC') + grab('LbUnitC')
    record['Author(s)'] = grab('LbAuthorE') + grab('LbUnitE')
    record['关键字'] = grab('LbKeyC')
    record['Keywords'] = grab('LbKeyE')
    record['分类号'] = grab('LbFLH')
    record['DOI'] = grab('LbDOI')
    record['文献标志码'] = grab('TbWXBSM')
    record['摘要'] = grab('LbZY')
    record['Abstract'] = grab('LbZYE')
    record['相似文献/references'] = grab('lbxswx')
    record['备注/Memo'] = grab('LbMemory')
    with open(fpath, 'a', encoding='utf-8') as f:
        f.write(str(record) + '\n')

def main():
    """Crawl article ids for years 2013–2018, issues 1–5, articles 001–099.

    The article id format is YYYY 0M NNN (e.g. 201802002: year 2018,
    issue 2, article 2).  Each parsed page is appended to *output_file*.
    """
    output_file = 'D:/PagesDetails.txt'
    start_url = 'http://dej.zjtvu.edu.cn//oa/darticle.aspx?type=view&id='
    for year in range(2013, 2019):
        for issue in range(1, 6):
            for article in range(1, 100):
                # :03d zero-pads the article number (replaces str.rjust).
                url = f'{start_url}{year}0{issue}{article:03d}'
                try:
                    html = getHTMLText(url)
                    parsePage(html, output_file)
                except Exception:
                    # Skip a single broken page but keep the crawl going;
                    # narrowed from a bare except so Ctrl-C still works.
                    continue


# Guard the entry point so importing this module does not start the crawl.
if __name__ == "__main__":
    main()

跑完大概是这样的 字典类型:
这里写图片描述

猜你喜欢

转载自blog.csdn.net/joovo/article/details/79811367