python—爬取中国彩票网的双色球数据,保存txt与xls格式。object has no attribute ‘pipelines’
一、保存txt格式的源代码文件:
1、源码文件 getWinningNum.py
root@kali:~/python/zhcw# ls
getWinningNum.log getWinningNum.py mylog.py mylog.pyc
root@kali:~/python/zhcw# cat getWinningNum.py
#!/usr/bin/python
# --*-- coding:utf-8 --*--
import requests
import re
from bs4 import BeautifulSoup
import urllib2
from mylog import MyLog as mylog
class DoubleColorBallItem(object):
date = None #开奖日期
order = None #当年的顺序
red1 = None #第一个红球号码
red2 = None #第二个红球号码
red3 = None #第三个红球号码
red4 = None #第四个红球号码
red5 = None #第五个红球号码
red6 = None #第六个红球号码
blue = None #蓝色球号码
money = None #彩池金额
firstPrize = None #一等奖中奖人数
secondPrize = None #二等奖中奖人数
class GetDoubleColorBallNumber(object):#用于获取双色球中奖号码,返回一个txt文件
def __init__(self):
self.urls = []
self.log = mylog()
self.getUrls()
self.items = self.spider(self.urls)
self.pipelines(self.items)
def getUrls(self):#获取数据来源网页
URL = r'http://kaijiang.zhcw.com/zhcw/html/ssq/list_1.html'
htmlContent = self.getResponseContent(URL)
soup = BeautifulSoup(htmlContent,'lxml')
tag = soup.find_all(re.compile('p'))[-1]
pages = tag.strong.get_text()
for i in xrange(1,int(pages)+1):
url = r'http://kaijiang.zhcw.com/zhcw/html/ssq/list_'+str(i)+'.html'
self.urls.append(url)
self.log.info(u'添加URL:%s到URLS \r\n' %url)
def getResponseContent(self,url):#单独一个函数返回,后期代理抓包使用
try:
response = urllib2.urlopen(url.encode('utf8'))
except:
self.log.error(u'Python 返回URL:%s 数据失败\r\n' %url)
else:
self.log.info(u'Python 返回URL:%s 数据失败\r\n' %url)
return response.read()
def spider(self, urls):#从获取的数据中过滤得到中奖信息
items = []
for url in urls:
htmlContent = self.getResponseContent(url)
soup = BeautifulSoup(htmlContent,'lxml')
tags = soup.find_all('tr',attrs={})
for tag in tags:
if tag.find('em'):
item = DoubleColorBallItem()
tagTd = tag.find_all('td')
item.date = tagTd[0].get_text()
item.order = tagTd[1].get_text()
tagEm = tagTd[2].find_all('em')
item.red1 = tagEm[0].get_text()
item.red2 = tagEm[1].get_text()
item.red3 = tagEm[2].get_text()
item.red4 = tagEm[3].get_text()
item.red5 = tagEm[4].get_text()
item.red6 = tagEm[5].get_text()
item.blue = tagEm[6].get_text()
item.money = tagTd[3].find("strong").get_text()
item.firstPrize = tagTd[4].find("strong").get_text()
item.secondPrize = tagTd[5].find("strong").get_text()
items.append(item)
self.log.info(u'获取日期为:%s 的数据成功' %(item.date))
return items
def pipeliens(self, items):
fileName = u'双色球.txt'.encode('GBK')
with open(fileName,'w') as fp:
for item in items:
fp.write('%s %s \t %s %s %s %s %s %s %s \t %s \t %s %s \n' %(item.date,item.order,item.red1,item.red2,item.red3,item.red4,item.red5,item.red6,item.blue,item.money,item.firstPrize,item.secondPrize))
self.log.info(u'将日期为:%s的数据存入"%s"...' %(item.data,filename.decode('GBK')))
if __name__ == "__main__":
GDCBN = GetDoubleColorBallNumber()
2、源码文件 mylog.py
root@kali:~/python/zhcw# ls
getWinningNum.log getWinningNum.py mylog.py mylog.pyc
root@kali:~/python/zhcw# cat mylog.py
#!/usr/bin/python
# --*-- coding:utf-8 --*--
import logging
import getpass
import sys
class MyLog(object):#类MyLog的构造函数
def __init__(self):
self.user = getpass.getuser()
self.logger = logging.getLogger(self.user)
self.logger.setLevel(logging.DEBUG)
#日志文件名
self.logFile = sys.argv[0][0:-3] + '.log'
self.formatter = logging.Formatter('%(asctime)-12s %(levelname)-8s %(name)-10s %(message)-12s\r\n')
#日志显示到屏幕上并输出到日志文件内
self.logHand = logging.FileHandler(self.logFile,encoding='utf8')
self.logHand.setFormatter(self.formatter)
self.logHand.setLevel(logging.DEBUG)
self.logHandSt = logging.StreamHandler()
self.logHandSt.setFormatter(self.formatter)
self.logHandSt.setLevel(logging.DEBUG)
self.logger.addHandler(self.logHand)
self.logger.addHandler(self.logHandSt)
#日志的5个级别对应以下的5个函数
def debug(self,msg):
self.logger.debug(msg)
def info(self,msg):
self.logger.info(msg)
def warn(self,msg):
self.logger.warn(msg)
def error(self,msg):
self.logger.error(msg)
def critical(self,msg):
self.logger.critical(msg)
if __name__ == "__mian__":
mylog = MyLog()
mylog.debug(u"I 'm debug 测试中文")
mylog.info("I 'm info")
mylog.warn("I 'm warn")
mylog.info(u"I 'm error 测试中文")
mylog.critical("I 'm critical")
3、保存txt格式的脚本运行情况:
.....................
.........................
...........................
2018-01-12 00:13:19,570 INFO root 添加URL:http://kaijiang.zhcw.com/zhcw/html/ssq/list_108.html到URLS
2018-01-12 00:13:19,570 INFO root 添加URL:http://kaijiang.zhcw.com/zhcw/html/ssq/list_109.html到URLS
2018-01-12 00:13:19,571 INFO root 添加URL:http://kaijiang.zhcw.com/zhcw/html/ssq/list_110.html到URLS
2018-01-12 00:13:19,571 INFO root 添加URL:http://kaijiang.zhcw.com/zhcw/html/ssq/list_111.html到URLS
2018-01-12 00:13:20,795 INFO root Python 返回URL:http://kaijiang.zhcw.com/zhcw/html/ssq/list_1.html 数据失败
2018-01-12 00:13:22,577 INFO root 获取日期为:2018-01-11 的数据成功
2018-01-12 00:13:22,579 INFO root 获取日期为:2018-01-09 的数据成功
2018-01-12 00:13:22,581 INFO root 获取日期为:2018-01-07 的数据成功
2018-01-12 00:13:22,582 INFO root 获取日期为:2018-01-04 的数据成功
2018-01-12 00:13:22,583 INFO root 获取日期为:2018-01-02 的数据成功
2018-01-12 00:13:22,584 INFO root 获取日期为:2017-12-31 的数据成功
2018-01-12 00:13:22,586 INFO root 获取日期为:2017-12-28 的数据成功
2018-01-12 00:13:22,587 INFO root 获取日期为:2017-12-26 的数据成功
2018-01-12 00:13:22,588 INFO root 获取日期为:2017-12-24 的数据成功
2018-01-12 00:13:22,589 INFO root 获取日期为:2017-12-21 的数据成功
2018-01-12 00:13:22,591 INFO root 获取日期为:2017-12-19 的数据成功
2018-01-12 00:13:22,592 INFO root 获取日期为:2017-12-17 的数据成功
2018-01-12 00:13:22,592 INFO root 获取日期为:2017-12-14 的数据成功
2018-01-12 00:13:22,593 INFO root 获取日期为:2017-12-12 的数据成功
2018-01-12 00:13:22,594 INFO root 获取日期为:2017-12-10 的数据成功
2018-01-12 00:13:22,595 INFO root 获取日期为:2017-12-07 的数据成功
2018-01-12 00:13:22,595 INFO root 获取日期为:2017-12-05 的数据成功
2018-01-12 00:13:22,596 INFO root 获取日期为:2017-12-03 的数据成功
2018-01-12 00:13:22,597 INFO root 获取日期为:2017-11-30 的数据成功
2018-01-12 00:13:22,598 INFO root 获取日期为:2017-11-28 的数据成功
Traceback (most recent call last):
File "getWinningNum.py", line 87, in <module>
GDCBN = GetDoubleColorBallNumber()
File "getWinningNum.py", line 30, in __init__
self.pipelines(self.items)
AttributeError: 'GetDoubleColorBallNumber' object has no attribute 'pipelines'
root@kali:~/python/zhcw#
二、保存xls格式的源代码文件
1、源码文件 getWinningNum_excel.py
#!/usr/bin/python
# --*-- coding:utf-8 --*--
import requests
import re
from bs4 import BeautifulSoup
import urllib2
from mylog import MyLog as mylog
from SaveExcel import SaveBallDate
class DoubleColorBallItem(object):
date = None #开奖日期
order = None #当年的顺序
red1 = None #第一个红球号码
red2 = None #第二个红球号码
red3 = None #第三个红球号码
red4 = None #第四个红球号码
red5 = None #第五个红球号码
red6 = None #第六个红球号码
blue = None #蓝色球号码
money = None #彩池金额
firstPrize = None #一等奖中奖人数
secondPrize = None #二等奖中奖人数
class GetDoubleColorBallNumber(object):#用于获取双色球中奖号码,返回一个txt文件
def __init__(self):
self.urls = []
self.log = mylog()
self.getUrls()
self.items = self.spider(self.urls)
self.pipelines(self.items)
self.log.info('beging save data to excel \r\n')
SaveBallData(self.items)
self.log.info('save data to excel end ...\r\n')
def getUrls(self):#获取数据来源网页
URL = r'http://kaijiang.zhcw.com/zhcw/html/ssq/list_1.html'
htmlContent = self.getResponseContent(URL)
soup = BeautifulSoup(htmlContent,'lxml')
tag = soup.find_all(re.compile('p'))[-1]
pages = tag.strong.get_text()
for i in xrange(1,int(pages)+1):
url = r'http://kaijiang.zhcw.com/zhcw/html/ssq/list_'+str(i)+'.html'
self.urls.append(url)
self.log.info(u'添加URL:%s到URLS \r\n' %url)
def getResponseContent(self,url):#单独一个函数返回,后期代理抓包使用
try:
response = urllib2.urlopen(url.encode('utf8'))
except:
self.log.error(u'Python 返回URL:%s 数据失败\r\n' %url)
else:
self.log.info(u'Python 返回URL:%s 数据失败\r\n' %url)
return response.read()
def spider(self, urls):#从获取的数据中过滤得到中奖信息
items = []
for url in urls:
htmlContent = self.getResponseContent(url)
soup = BeautifulSoup(htmlContent,'lxml')
tags = soup.find_all('tr',attrs={})
for tag in tags:
if tag.find('em'):
item = DoubleColorBallItem()
tagTd = tag.find_all('td')
item.date = tagTd[0].get_text()
item.order = tagTd[1].get_text()
tagEm = tagTd[2].find_all('em')
item.red1 = tagEm[0].get_text()
item.red2 = tagEm[1].get_text()
item.red3 = tagEm[2].get_text()
item.red4 = tagEm[3].get_text()
item.red5 = tagEm[4].get_text()
item.red6 = tagEm[5].get_text()
item.blue = tagEm[6].get_text()
item.money = tagTd[3].find("strong").get_text()
item.firstPrize = tagTd[4].find("strong").get_text()
item.secondPrize = tagTd[5].find("strong").get_text()
items.append(item)
self.log.info(u'获取日期为:%s 的数据成功' %(item.date))
return items
def pipeliens(self, items):
fileName = u'双色球.txt'.encode('GBK')
with open(fileName,'w') as fp:
for item in items:
fp.write('%s %s \t %s %s %s %s %s %s %s \t %s \t %s %s \n' %(item.date,item.order,item.red1,item.red2,item.red3,item.red4,item.red5,item.red6,item.blue,item.money,item.firstPrize,item.secondPrize))
self.log.info(u'将日期为:%s的数据存入"%s"...' %(item.data,filename.decode('GBK')))
if __name__ == "__main__":
GDCBN = GetDoubleColorBallNumber()
2、源码文件 SaveExcel.py
# -*- coding: utf-8 -*-
import xlwt
class SaveBallDate(object):
def __inti__(self,items):
self.items = items
self.run(self.items)
def run(self,items):
fileName = u'双色球.xls'.encode('GBK')
book = xlwt.Workbook(encodeing='utf8')
sheet = book.add_sheet('ball',cell_overwrite_ok = True)
sheet.write(0,0,u'开奖日期'.encode('utf8'))
sheet.write(0,1,u'期号'.encode('utf8'))
sheet.write(0,2,u'红1'.encode('utf8'))
sheet.write(0,3,u'红2'.encode('utf8'))
sheet.write(0,4,u'红3'.encode('utf8'))
sheet.write(0,5,u'红4'.encode('utf8'))
sheet.write(0,6,u'红5'.encode('utf8'))
sheet.write(0,7,u'红6'.encode('utf8'))
sheet.write(0,8,u'蓝'.encode('utf8'))
sheet.write(0,9,u'销售金额'.encode('utf8'))
sheet.write(0,10,u'一等奖'.encode('utf8'))
sheet.write(0,12,u'二等奖'.encode('utf8'))
i = 1
while i <= len(items):
item = items[-1]
sheet.write(i,0,item.date)
sheet.write(i,1,item.order)
sheet.write(i,2,item.red1)
sheet.write(i,3,item.red2)
sheet.write(i,4,item.red3)
sheet.write(i,5,item.red4)
sheet.write(i,6,item.red5)
sheet.write(i,7,item.red6)
sheet.write(i,8,item.blue)
sheet.write(i,9,item.money)
sheet.write(i,10,item.firstPrize)
sheet.write(i,11,item.secondPrize)
i += 1
book.save(fileName)
if __name__ == "__main__":
pass
3、源码文件 mylog.py
#!/usr/bin/python
# --*-- coding:utf-8 --*--
import logging
import getpass
import sys
class MyLog(object):#类MyLog的构造函数
def __init__(self):
self.user = getpass.getuser()
self.logger = logging.getLogger(self.user)
self.logger.setLevel(logging.DEBUG)
#日志文件名
self.logFile = sys.argv[0][0:-3] + '.log'
self.formatter = logging.Formatter('%(asctime)-12s %(levelname)-8s %(name)-10s %(message)-12s\r\n')
#日志显示到屏幕上并输出到日志文件内
self.logHand = logging.FileHandler(self.logFile,encoding='utf8')
self.logHand.setFormatter(self.formatter)
self.logHand.setLevel(logging.DEBUG)
self.logHandSt = logging.StreamHandler()
self.logHandSt.setFormatter(self.formatter)
self.logHandSt.setLevel(logging.DEBUG)
self.logger.addHandler(self.logHand)
self.logger.addHandler(self.logHandSt)
#日志的5个级别对应以下的5个函数
def debug(self,msg):
self.logger.debug(msg)
def info(self,msg):
self.logger.info(msg)
def warn(self,msg):
self.logger.warn(msg)
def error(self,msg):
self.logger.error(msg)
def critical(self,msg):
self.logger.critical(msg)
if __name__ == "__mian__":
mylog = MyLog()
mylog.debug(u"I 'm debug 测试中文")
mylog.info("I 'm info")
mylog.warn("I 'm warn")
mylog.info(u"I 'm error 测试中文")
mylog.critical("I 'm critical")
4、运行情况
2018-01-13 20:19:38,615 INFO Administrator 获取日期为:2003-08-31 的数据成功
2018-01-13 20:19:38,617 INFO Administrator 获取日期为:2003-08-28 的数据成功
...........................................................................
...........................................................................
...........................................................................
2018-01-13 20:19:39,444 INFO Administrator 获取日期为:2003-03-27 的数据成功
2018-01-13 20:19:39,447 INFO Administrator 获取日期为:2003-03-23 的数据成功
2018-01-13 20:19:39,448 INFO Administrator 获取日期为:2003-03-20 的数据成功
2018-01-13 20:19:39,450 INFO Administrator 获取日期为:2003-03-16 的数据成功
2018-01-13 20:19:39,453 INFO Administrator 获取日期为:2003-03-13 的数据成功
2018-01-13 20:19:39,454 INFO Administrator 获取日期为:2003-03-09 的数据成功
2018-01-13 20:19:39,457 INFO Administrator 获取日期为:2003-03-06 的数据成功
2018-01-13 20:19:39,459 INFO Administrator 获取日期为:2003-03-02 的数据成功
2018-01-13 20:19:39,460 INFO Administrator 获取日期为:2003-02-27 的数据成功
2018-01-13 20:19:39,461 INFO Administrator 获取日期为:2003-02-23 的数据成功
Traceback (most recent call last):
File "<ipython-input-4-3a22468d5a14>", line 1, in <module>
runfile('F:/SOFT/pythonpro/getWinningNum_excel.py', wdir='F:/SOFT/pythonpro')
File "C:\ProgramData\Anaconda2\lib\site-packages\spyder\utils\site\sitecustomize.py", line 710, in runfile
execfile(filename, namespace)
File "C:\ProgramData\Anaconda2\lib\site-packages\spyder\utils\site\sitecustomize.py", line 86, in execfile
exec(compile(scripttext, filename, 'exec'), glob, loc)
File "F:/SOFT/pythonpro/getWinningNum_excel.py", line 91, in <module>
GDCBN = GetDoubleColorBallNumber()
File "F:/SOFT/pythonpro/getWinningNum_excel.py", line 31, in __init__
self.pipelines(self.items)
AttributeError: 'GetDoubleColorBallNumber' object has no attribute 'pipelines'