Python爬虫-Beautiful Soup-当当图书目录(1)
第一次用python + Beautiful Soup爬些数据,用当当图书目录作为练习了。
思路:
- 1. 获取当当的图书类别:类别名称 + 链接 url
效果:
脚本说明:
- 1. mylog.py:日志
- 2. getBookKindInfo.py:图书目录
mylog.py
# !/usr/bin/env python
#-*- coding:utf-8 -*-
'''
Created on 2018-4-10
@author: Administrator
'''
import logging
import getpass
import sys
#### 定义MyLog类
class MyLog(object):
    """Small convenience wrapper around the stdlib ``logging`` module.

    Everything (DEBUG and up) is echoed to the console; only ERROR and
    above are also written to a file named after the running script
    (``<script>.log``).
    """
    def __init__(self):
        self.user = getpass.getuser()
        self.logger = logging.getLogger(self.user)
        self.logger.setLevel(logging.DEBUG)
        # Log file name: the running script's path with ".py" replaced by ".log".
        self.logFile = sys.argv[0][0:-3] + '.log'
        self.formatter = logging.Formatter('%(asctime)-12s %(levelname)-8s %(name)-10s %(message)-12s')
        # File handler: only ERROR and above are persisted to the log file.
        logHand = logging.FileHandler(self.logFile)
        logHand.setFormatter(self.formatter)
        logHand.setLevel(logging.ERROR)
        # Stream handler: every level is echoed to the console.
        logHandSt = logging.StreamHandler()
        logHandSt.setFormatter(self.formatter)
        # Guard against duplicate handlers: instantiating MyLog twice for
        # the same user would otherwise emit every message multiple times.
        if not self.logger.handlers:
            self.logger.addHandler(logHand)
            self.logger.addHandler(logHandSt)
    # Thin pass-throughs for the five standard severity levels.
    def debug(self, msg):
        self.logger.debug(msg)
    def info(self, msg):
        self.logger.info(msg)
    def warn(self, msg):
        # Logger.warn() is deprecated; delegate to warning() instead
        # while keeping this wrapper's public name unchanged.
        self.logger.warning(msg)
    def error(self, msg):
        self.logger.error(msg)
    def critical(self, msg):
        self.logger.critical(msg)
if __name__ == '__main__':
    # Smoke test: emit one message at each of the five severity levels.
    logger = MyLog()
    for level in ('debug', 'info', 'warn', 'error', 'critical'):
        getattr(logger, level)("我是一个" + level)
getBookKindInfo.py
#! /usr/bin/env python
#-*- coding:utf-8 -*-
'''
Created on 2018-4-10
@author: Administrator
获取当当图书种类, 大类名称+大类url,小类名称+小类url
'''
import re
from bs4 import BeautifulSoup
import urllib2
from mylog import MyLog as mylog
class BookKindItem(object):
    """A single book category scraped from dangdang.com.

    Holds the category's display name and its page URL.  Both default
    to None and may be assigned after construction (as the existing
    scraper does) or passed directly to the constructor.
    """
    # Class-level defaults kept for backward compatibility with code
    # that reads the attributes before assigning them.
    name = None  # category display name
    url = None   # category page URL

    def __init__(self, name=None, url=None):
        # Backward compatible: BookKindItem() still works exactly as
        # before, but values can now also be supplied up front.
        self.name = name
        self.url = url
class GetBookKindItem(object):
    """Scrape dangdang.com's category page for book categories.

    ``getUrls()`` returns two lists of BookKindItem: the major
    categories and their sub-categories (name + url each).
    """
    def __init__(self):
        self.urls = []
        self.log = mylog()

    def getResponseContent(self, url):
        """Fetch *url* and return the raw response body.

        Returns None when the request fails; the failure is logged
        rather than raised so the caller can decide how to proceed.
        """
        try:
            response = urllib2.urlopen(url.encode('utf8'))
        except Exception:
            # Was a bare ``except:`` — narrowed so SystemExit and
            # KeyboardInterrupt are no longer swallowed.
            self.log.error(u'python 返回 URL:%s 数据失败' % url)
            return None
        self.log.info(u'Python 返回URL:%s A数据成功' % url)
        return response.read()

    def getUrls(self):
        """Return ``(majors, minors)`` — two lists of BookKindItem."""
        URL = r'http://category.dangdang.com/?ref=www-0-C'
        htmlContent = self.getResponseContent(URL)
        # The page is GBK-encoded; decoding it as utf8 silently drops
        # part of the category list.
        soup = BeautifulSoup(htmlContent, 'lxml', from_encoding='gbk')
        majors = []  # top-level categories
        minors = []  # sub-categories
        for kindDiv in soup.find("div", class_="classify_books", id="floor_1").find_all("div", class_="classify_kind"):
            # Major category: the heading div's first link.
            major = BookKindItem()
            major.name = kindDiv.div.a.string
            major.url = kindDiv.div.a.get("href")
            majors.append(major)
            # Sub-categories: every <li> link except the "more" link.
            for li in kindDiv.find("ul").find_all("li"):
                if li.a.string == "更多":
                    continue
                minor = BookKindItem()
                minor.name = li.a.string
                minor.url = li.a.get("href")
                minors.append(minor)
        return majors, minors
if __name__ == '__main__':
    # Fetch the category tree, then print majors and minors in turn.
    scraper = GetBookKindItem()
    majors, minors = scraper.getUrls()
    print (' ## 图书大类' )
    for kind in majors:
        print (' %s----%s' % (kind.name, kind.url))
    print (' ## 图书小类' )
    for kind in minors:
        print (' %s----%s' % (kind.name, kind.url))