新博客地址 http://4ct10n.cn
近期会将新的文章发表到新博客上，如果有什么问题还请大家纠正
QQ:1792034533
Email:[email protected]
ps:贴上自己的导出csdn的代码成md格式
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Date : 2017-10-21 23:19:58
# @Author : 4ct10n ([email protected])
# @Link : http://example.org
import requests
import sys
from bs4 import BeautifulSoup
def Get_all_page(url):
    """Collect the absolute URLs of every article on a CSDN blog.

    Args:
        url: The blog's front page, e.g. 'http://blog.csdn.net/qq_31481187'.

    Returns:
        A list of absolute article URLs (may contain duplicates if an
        article is linked more than once across pages).
    """
    base_url = 'http://blog.csdn.net'
    lists = []
    res = requests.get(url)
    soup = BeautifulSoup(res.content, 'html.parser', from_encoding='utf-8')
    # The pagination widget (id='papelist') is absent on single-page blogs;
    # the original code crashed with IndexError in that case.
    pager = soup.find_all(id='papelist')
    if pager:
        anchors = pager[0].find_all('a')
        # The last anchor points at the final page; its href ends with the
        # page count, e.g. '/qq_31481187/article/list/7' -> num = 7.
        last_href = anchors[-1]['href']
        num = int(last_href.split('/')[-1])
        base = last_href[:-len(str(num))]
        pages = [base + str(i) for i in range(1, num + 1)]
    else:
        # Single page: harvest articles from the page we already fetched.
        pages = []
        ls_div = soup.find_all('div', attrs={'id': 'article_list'})
        if ls_div:
            # Every third anchor inside article_list is the article link.
            lists += [a['href'] for a in ls_div[0].find_all('a')][::3]
    for ps in pages:
        res = requests.get(base_url + ps)
        soup = BeautifulSoup(res.content, 'html.parser', from_encoding='utf-8')
        ls_div = soup.find_all('div', attrs={'id': 'article_list'})
        lists += [a['href'] for a in ls_div[0].find_all('a')][::3]
    # Hrefs are site-relative; prefix the host to make them absolute.
    return [base_url + href for href in lists]
def get_content(url,path):
# url = 'http://blog.csdn.net/qq_31481187/article/details/78163593'
res = requests.get(url)
soup = BeautifulSoup(res.content, 'html.parser', from_encoding='utf-8')
source = '<link rel="stylesheet" type="text/css" href="http://static.blog.csdn.net/css/csdn_blog_detail.min.css">\n'
de = soup.find_all(attrs={'name':"description"})
tit = soup.find_all(attrs={'class':"link_title"})
tim = soup.find_all(attrs={'class':"link_postdate"})
cate = soup.find_all('div',attrs={'class':"category_r"})
con = soup.find_all('div',attrs={'class':'markdown_views'})
title = tit[0].get_text().strip()
# print '|'+title+'
description = de[0].attrs['content'].strip(' ')
time = tim[0].string
category = cate[0].find_all('span')[0].get_text().split(u'\uff08')[0]
string = '---\n'
string += 'title: '+title+'\n'
string += 'tags: ['+category+']'+'\n'
string += 'date: '+time+'\n'
string += '---\n'
string += description+'\n'
string += '<!-- more -->'+'\n'
string += source+str(con[0])
# content = con[0].find_all('code')[0]
f = open(path+'/'+title+'.md','w')
f.write(string)
print 'export :',title
# print str(con[0])
# print Get_all_page('http://blog.csdn.net/qq_31481187')
# print
if __name__ == '__main__':
reload(sys)
sys.setdefaultencoding('utf-8')
url = raw_input('url:') #'http://blog.csdn.net/qq_31481187/'
path = raw_input('store_path:')#'/tmp/blog/'
ALL = raw_input('export ALL ?yes/no:')
if ALL=='yes':
urls = Get_all_page(url)
for link in urls:
get_content(link,path)
elif ALL=='no':
get_content(url,path)
# for link in urls:
# print link
# get_content(link,path)