Python爬虫-Beautiful Soup-当当图书目录(1)-进阶
思路:
- 1. 把公共的方法部分提取出来:getResponseContent(self,url);
- 2. mylog.py不变,新增myfun.py脚本,修改getBookKindInfo.py脚本;
脚本说明:
- 1. mylog.py:日志
- 2. getBookKindInfo.py:图书目录
- 3. myfun.py:公共方法(getResponseContent)
myfun.py
#! /usr/bin/env python
#-*- coding:utf-8 -*-
'''
Created on 2018-4-12
@author: Administrator
'''
import urllib2
from mylog import MyLog as mylog
class MyFun(object):
    """Shared helper methods used by the crawler scripts."""

    def __init__(self):
        # Logger instance provided by the project's mylog module.
        self.log = mylog()

    def getResponseContent(self, url):
        """Download ``url`` and return the raw response body.

        Args:
            url: Address to fetch. It is UTF-8 encoded before being handed
                to ``urllib2.urlopen``.

        Returns:
            The data read from the response, or ``None`` when opening the
            URL or reading the body failed (the failure is logged).
        """
        try:
            response = urllib2.urlopen(url.encode('utf8'))
            try:
                content = response.read()
            finally:
                # Close explicitly so the connection is released even when
                # read() raises.
                response.close()
        except Exception:
            # ``Exception`` rather than a bare ``except`` so that
            # KeyboardInterrupt / SystemExit still propagate.
            self.log.error(u'python 返回 URL:%s 数据失败' % url)
            return None
        # Success is only reported once the body has actually been read.
        self.log.info(u'Python 返回URL:%s A数据成功' % url)
        return content
getBookKindInfo.py
#! /usr/bin/env python
#-*- coding:utf-8 -*-
'''
Created on 2018-4-10
@author: Administrator
获取当当图书种类, 大类名称+大类url,小类名称+小类url
'''
import re
from bs4 import BeautifulSoup
from myfun import MyFun as myfun
class BookKindItem(object):
    """One book category: its display name and the URL of its page.

    Instances can be created empty and filled in afterwards, or created
    directly with ``BookKindItem(name, url)``.
    """

    # Class-level defaults are kept so ``BookKindItem.name`` and
    # ``BookKindItem.url`` still resolve to None.
    name = None  # category name
    url = None   # category URL

    def __init__(self, name=None, url=None):
        self.name = name
        self.url = url

    def __repr__(self):
        # Readable form for debugging and logging.
        return '%s(name=%r, url=%r)' % (
            type(self).__name__, self.name, self.url)
class GetBookKindItem(object):
    """Collect the book categories listed on the Dangdang category page."""

    def __init__(self):
        # Not read by getUrls(); kept in case external callers rely on it.
        self.urls = []

    def getUrls(self):
        """Fetch the category page and extract category names and links.

        Returns:
            A ``(DL, XL)`` tuple of lists of ``BookKindItem``:
            ``DL`` holds the top-level categories and ``XL`` the
            sub-categories. Both lists are empty when the page could not
            be downloaded or the expected container ``div`` is missing.
        """
        URL = r'http://category.dangdang.com/?ref=www-0-C'
        DL = []  # top-level categories
        XL = []  # sub-categories

        htmlContent = myfun().getResponseContent(URL)
        if not htmlContent:
            # The download failed; there is nothing to parse.
            return DL, XL

        # Original author's note: switching from_encoding to utf8 fails to
        # pick up the complete list of book categories, so gbk is kept.
        soup = BeautifulSoup(htmlContent, 'lxml', from_encoding='gbk')

        container = soup.find("div", class_="classify_books", id="floor_1")
        if container is None:
            # Page layout differs from what this scraper expects.
            return DL, XL

        # outsideDiv: one "classify_kind" div per top-level category.
        for outsideDiv in container.find_all("div", class_="classify_kind"):
            # Top-level category: the first link inside the first child div.
            header = outsideDiv.div
            header_link = header.a if header is not None else None
            if header_link is not None:
                item_dl = BookKindItem()
                item_dl.name = header_link.string
                item_dl.url = header_link.get("href")
                DL.append(item_dl)

            # Sub-categories: one link per <li> of the first <ul>.
            sub_list = outsideDiv.find("ul")
            if sub_list is None:
                continue
            for _li in sub_list.find_all("li"):
                link = _li.a
                if link is None:
                    continue
                # Unicode literal is required: on Python 2 a byte-string
                # "更多" never compares equal to bs4's unicode text, so the
                # filter would silently let these entries through.
                # NOTE(review): "更多" is presumably a "show more" link
                # rather than a real category - confirm against the page.
                if link.string == u"更多":
                    continue
                item_xl = BookKindItem()
                item_xl.name = link.string
                item_xl.url = link.get("href")
                XL.append(item_xl)

        return DL, XL
if __name__ == '__main__':
    # Scrape both category levels, then print each list under its heading
    # as "name----url" lines.
    top_level, second_level = GetBookKindItem().getUrls()
    sections = (
        (' ## 图书大类', top_level),
        (' ## 图书小类', second_level),
    )
    for heading, kinds in sections:
        print(heading)
        for kind in kinds:
            print(' %s----%s' % (kind.name, kind.url))