Paper Crawling System | (3) Spider Subclasses

Project GitHub repository

Each conference has its own quirks, so after defining the base class we write a dedicated class per conference to hold its conference-specific logic; all of these subclasses inherit from the base class. Since AAAI and IJCAI papers each come from their own official website, each of those two gets its own class, while all ACL-series conferences are hosted on a single website, so one class covers them all. Each subclass defines how to extract the PDF download link and the paper title from that conference's paper pages.
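For reference, the subclasses below rely on the base class from the previous part providing at least a get_page method (fetch a page's HTML) and a saveFile method (download and store the PDF). The sketch below is reconstructed purely from the calls the subclasses make; it is an assumption about the interface, not the project's actual implementation:

import os
import requests

class BasicSpider:
    # Hypothetical minimal interface, inferred from the subclass code.

    def get_page(self, url):
        # Fetch a page and return its HTML text.
        resp = requests.get(url, timeout=30)
        resp.raise_for_status()
        return resp.text

    def saveFile(self, pdf_url, title, year):
        # Download the PDF and save it under a per-year directory.
        os.makedirs(str(year), exist_ok=True)
        data = requests.get(pdf_url, timeout=60).content
        with open(os.path.join(str(year), title + '.pdf'), 'wb') as f:
            f.write(data)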

 

  • ACLSeries
from bs4 import BeautifulSoup


class ACLSeries(BasicSpider):

    def __init__(self, opt):
        super(ACLSeries, self).__init__()
        self.opt = opt

    def get_content(self, url, year):
        page = self.get_page(url)
        soup = BeautifulSoup(page, 'lxml')

        # On ACL-series paper pages, the link inside the #title element
        # points at the paper's PDF.
        tag = soup.select('#title a')[0]
        pdf_url = tag['href']
        title = tag.get_text().strip()
        print("Paper PDF link: " + str(pdf_url))

        title = title.replace('/', 'or')  # '/' is illegal in file names
        print("Paper title: " + str(title))

        self.saveFile(pdf_url, title, year)
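Usage is the same for every subclass: construct it with the option object and hand get_content a paper page URL plus the year. A minimal, hypothetical example (the URL is an illustrative ACL Anthology-style address and opt is stubbed out, since this snippet does not use it):

# Hypothetical call; the URL and opt=None are illustrative, not from the project.
spider = ACLSeries(opt=None)
spider.get_content('https://www.aclweb.org/anthology/P19-1001/', year=2019)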
  • IJCAI
import re

from bs4 import BeautifulSoup


class IJCAI(BasicSpider):

    def __init__(self, opt):
        super(IJCAI, self).__init__()
        self.opt = opt

    def get_content(self, url, year):
        page = self.get_page(url)
        soup = BeautifulSoup(page, 'lxml')

        # Recent IJCAI pages mark the PDF link with a .btn-download class;
        # older proceedings pages only have a plain "PDF" anchor, which we
        # grab with a regex and turn into an absolute URL.
        pdf_url = soup.select('.btn-download')
        if not pdf_url:
            pattern = re.compile('<p><a href="(.*?)">PDF</a></p>', re.S)
            pdf_url = 'https://www.ijcai.org' + pattern.findall(page)[0]
        else:
            pdf_url = pdf_url[0]['href']

        print("Paper PDF link: " + str(pdf_url))

        # The title is normally in an <h1>; older pages put it in the first
        # <p>, right before the italicized author line.
        pattern = re.compile('<h1>(.*?)</h1>', re.S)
        res = pattern.findall(page)
        if not res:
            pattern = re.compile('<p>(.*?)<br />.*?<i>.*?</i>.*?</p>', re.S)
            res = pattern.findall(page)

        title = res[0].strip()
        title = title.replace('/', 'or')  # '/' is illegal in file names
        print("Paper title: " + str(title))

        self.saveFile(pdf_url, title, year)
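The two regex fallbacks can be checked without touching the network. Below is a self-contained sanity test against a fabricated snippet that mimics the older IJCAI page layout (the HTML is invented for illustration only):

import re

# Invented HTML in the style of an older IJCAI proceedings page.
page = ('<p>Some Paper Title<br /> <i>Some Author</i> ...</p>'
        '<p><a href="/Proceedings/09/Papers/001.pdf">PDF</a></p>')

pdf = re.compile('<p><a href="(.*?)">PDF</a></p>', re.S).findall(page)[0]
print('https://www.ijcai.org' + pdf)  # absolute download link

res = re.compile('<p>(.*?)<br />.*?<i>.*?</i>.*?</p>', re.S).findall(page)
print(res[0].strip())  # "Some Paper Title"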
  • AAAI
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.chrome.options import Options


class AAAI(BasicSpider):

    def __init__(self, opt):
        super(AAAI, self).__init__()
        self.opt = opt

    def get_content(self, url, year):
        if year == 2019:
            # The 2019 proceedings pages can be parsed statically.
            page = self.get_page(url)
            soup = BeautifulSoup(page, 'lxml')
            pdf_url = soup.select('.pdf')[0]['href']
            title = soup.select('.page_title')[0].get_text().strip()
        else:
            # Earlier years: switch from the abstract view to the paper view,
            # then render the page with headless Chrome.
            url = url.replace('view', 'viewPaper')

            while True:
                try:
                    chrome_options = Options()
                    chrome_options.add_argument('--headless')
                    chrome_options.add_argument('--disable-gpu')
                    # Selenium 3 API; Selenium 4 renames chrome_options to
                    # options and find_element_by_css_selector to
                    # find_element(By.CSS_SELECTOR, ...).
                    # browser = webdriver.Chrome(executable_path="E:\\chromedriver", chrome_options=chrome_options)
                    browser = webdriver.Chrome(chrome_options=chrome_options)
                    browser.get(url)
                    browser.implicitly_wait(10)
                    pdf_url = browser.find_element_by_css_selector('#paper a')
                    pdf_url = pdf_url.get_attribute('href')
                    # The actual download URL uses "viewFile" instead of "view".
                    pdf_url = pdf_url.replace('view', 'viewFile')
                    title = browser.find_element_by_css_selector('#title')
                    title = title.text.strip()
                    if browser:
                        browser.close()
                    break
                except NoSuchElementException:
                    if browser:
                        browser.close()
                    print('Selenium failed; retrying.')
                    continue
            '''
            Static-parsing alternative, kept commented out as in the original:
            page = self.get_page(url)
            soup = BeautifulSoup(page, 'lxml')
            pdf_url = soup.select('#paper a')[0]['href']
            pdf_url = pdf_url.replace('view', 'viewFile')
            print(pdf_url)
            title = soup.select('#title')[0].get_text().strip()
            '''

        title = title.replace('/', 'or')  # '/' is illegal in file names
        print("Paper PDF link: " + str(pdf_url))
        print("Paper title: " + str(title))

        self.saveFile(pdf_url, title, year)
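One caveat with the while True loop above: if the target element never appears (for example, after a site redesign), the spider retries forever. A bounded-retry variant is sketched below; fetch_with_retries and max_retries are invented names, not part of the original project:

from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.chrome.options import Options

def fetch_with_retries(url, max_retries=5):
    # Hypothetical bounded-retry version of the loop in AAAI.get_content.
    chrome_options = Options()
    chrome_options.add_argument('--headless')
    chrome_options.add_argument('--disable-gpu')
    for attempt in range(max_retries):
        browser = webdriver.Chrome(chrome_options=chrome_options)
        try:
            browser.get(url)
            browser.implicitly_wait(10)
            pdf_url = browser.find_element_by_css_selector('#paper a').get_attribute('href')
            title = browser.find_element_by_css_selector('#title').text.strip()
            return pdf_url.replace('view', 'viewFile'), title
        except NoSuchElementException:
            print('Selenium failed; attempt %d of %d.' % (attempt + 1, max_retries))
        finally:
            browser.close()
    raise RuntimeError('Could not extract paper info from ' + url)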