博客迁址

新博客地址 http://4ct10n.cn

近期会将新的文章发表到新博客上，如果有什么问题还请大家指正。

QQ:1792034533
Email:[email protected]

PS：附上自己将 CSDN 博客导出为 md 格式的代码

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Date    : 2017-10-21 23:19:58
# @Author  : 4ct10n ([email protected])
# @Link    : http://example.org

import requests
import sys  
from bs4 import BeautifulSoup


def Get_all_page(url):
    """Collect the absolute URLs of every article on a CSDN blog.

    url: the blog's index page, e.g. 'http://blog.csdn.net/qq_31481187'.
    Returns a list of absolute article URLs (may contain duplicates if
    the same article appears on more than one listing page).
    """
    base_url = 'http://blog.csdn.net'
    # Parse the pager (id='papelist') on the first page; the last pager
    # link's href ends in the total number of listing pages.
    res = requests.get(url)
    soup = BeautifulSoup(res.content, 'html.parser', from_encoding='utf-8')
    pager_links = soup.find_all(id='papelist')[0].find_all('a')
    last_href = pager_links[-1]['href']       # e.g. '/qq_xxx/article/list/7'
    num = int(last_href.split('/')[-1])       # total listing-page count
    base = last_href[:-len(str(num))]         # href prefix without the number
    pages = [base + str(i) for i in range(1, num + 1)]
    # Visit each listing page and pull article hrefs out of the
    # 'article_list' div; [::3] keeps one link per article because the
    # markup repeats each href three times -- TODO confirm against
    # CSDN's current page structure.
    lists = []
    for ps in pages:
        res = requests.get(base_url + ps)
        soup = BeautifulSoup(res.content, 'html.parser', from_encoding='utf-8')
        ls = soup.find_all('div', attrs={'id': 'article_list'})
        lists += [i['href'] for i in ls[0].find_all('a')][::3]
    return [base_url + i for i in lists]

def get_content(url,path):

    # url = 'http://blog.csdn.net/qq_31481187/article/details/78163593'
    res = requests.get(url)
    soup = BeautifulSoup(res.content, 'html.parser', from_encoding='utf-8')
    source = '<link rel="stylesheet" type="text/css" href="http://static.blog.csdn.net/css/csdn_blog_detail.min.css">\n'
    de = soup.find_all(attrs={'name':"description"})
    tit = soup.find_all(attrs={'class':"link_title"})
    tim = soup.find_all(attrs={'class':"link_postdate"})
    cate = soup.find_all('div',attrs={'class':"category_r"})
    con = soup.find_all('div',attrs={'class':'markdown_views'})

    title = tit[0].get_text().strip()
    # print '|'+title+'
    description = de[0].attrs['content'].strip(' ')
    time = tim[0].string
    category = cate[0].find_all('span')[0].get_text().split(u'\uff08')[0]

    string = '---\n'
    string += 'title: '+title+'\n'
    string += 'tags: ['+category+']'+'\n'
    string += 'date: '+time+'\n'
    string += '---\n'
    string += description+'\n'
    string += '<!-- more -->'+'\n'
    string += source+str(con[0])
    # content = con[0].find_all('code')[0]
    f = open(path+'/'+title+'.md','w')
    f.write(string)
    print 'export :',title 
    # print str(con[0])

# print Get_all_page('http://blog.csdn.net/qq_31481187')
# print 

if __name__ == '__main__':
    # Python 2 hack: make implicit str/unicode conversions use UTF-8 so
    # the Chinese titles and article bodies survive file writes.
    reload(sys)
    sys.setdefaultencoding('utf-8')
    blog_url = raw_input('url:')            # e.g. 'http://blog.csdn.net/qq_31481187/'
    store_path = raw_input('store_path:')   # e.g. '/tmp/blog/'
    answer = raw_input('export ALL ?yes/no:')
    if answer == 'yes':
        # Crawl every listing page and export each article found.
        for article_url in Get_all_page(blog_url):
            get_content(article_url, store_path)
    elif answer == 'no':
        # Treat the first input as a single article URL.
        get_content(blog_url, store_path)
发布了99 篇原创文章 · 获赞 51 · 访问量 71万+

猜你喜欢

转载自blog.csdn.net/qq_31481187/article/details/78314920