#!/usr/bin/python
# -*- coding: UTF-8 -*-
# @Author : Anic.Mo
# @Time   : 2018/6/18 12:51
# @File   : scrapyballs.py
# 彩票双色球数据 — scrapes China welfare lottery "double color ball" draw results
import re
import threading
import time
from queue import Empty, Queue

import requests
from requests.exceptions import ReadTimeout
import xlwt
from xlrd import open_workbook
from xlutils.copy import copy
def create_data_sheet():
    """Create the output workbook with a header row and save it to `datafile`.

    Writes a single sheet named 'doubleball' whose first row labels the
    columns: date, draw number, the six red balls, and the blue ball.
    Relies on the module-level `datafile` path set in the main guard.
    """
    book = xlwt.Workbook(encoding='utf-8')  # avoid shadowing builtin `file`
    sheet = book.add_sheet(u'doubleball', cell_overwrite_ok=True)
    # Column headers (Chinese): date, draw number, red balls 1-6, blue ball.
    headers = (u"日期", u"期数", u"第一个红球", u"第二个红球", u"第三个红球",
               u"第四个红球", u"第五个红球", u"第六个红球", u"蓝球")
    for col, title in enumerate(headers):
        sheet.write(0, col, title)
    book.save(datafile)
def update_data_sheet(file, num=None):
    """Append the scraped draw rows in `num` to the .xls workbook at `file`.

    Parameters
    ----------
    file : str
        Path of an existing workbook created by create_data_sheet().
    num : list of 9-tuples, optional
        One tuple per draw: (date, draw number, red1..red6, blue).
        Defaults to no rows (the original used a mutable default `[]`,
        which is a classic Python pitfall — replaced with None).
    """
    rows = num if num is not None else []
    current_file = open_workbook(file, formatting_info=True)
    current_rows = current_file.sheets()[0].nrows
    # xlrd workbooks are read-only; copy into a writable xlwt workbook.
    new_file = copy(current_file)
    sheet = new_file.get_sheet(0)
    for offset, record in enumerate(rows):
        for col in range(9):
            sheet.write(current_rows + offset, col, record[col])
    # NOTE(review): the original slept 1 second per appended row while the
    # caller held the writer lock — pure overhead with no I/O in the loop,
    # so it was removed.
    # Save to the path we were given; the original saved the global
    # `datafile`, which only worked because the sole call site passes it.
    new_file.save(file)
def get_Total_Page():
    """Return the total number of result pages as a string, or None on failure.

    Scrapes the pagination widget of the listing page at `base_url`; the
    page count is the first <strong> following class="pg".
    """
    # Regex flags must be combined with `|` (bitwise OR); the original's `+`
    # only worked by accident because the flag bits are distinct.
    com = re.compile(r"class=\"pg\".*?<strong>(.*?)</strong>",
                     re.DOTALL | re.IGNORECASE | re.MULTILINE)
    try:
        response = requests.get(base_url, headers=header, timeout=10)
        response.encoding = "utf-8"
        match = com.search(response.text)
        if match is None:
            # Original indexed findall()[0] and let the IndexError be
            # swallowed by the broad except; report the real cause instead.
            print("Error: page-count pattern not found in response")
            return None
        last_page = match.group(1)
        print("=" * 60)
        print("Total Page: %s " % last_page)
        print("=" * 60)
        return last_page
    except Exception as e:
        # Network/timeout problems: report and return None (caller must check).
        print("Error: %s " % e)
        return None
def get_current_page(page):
    """Fetch result page `page`, parse all draw rows, append to the sheet.

    The spreadsheet update is serialized with the module-level `queueLock`
    because xlrd/xlutils read-modify-write the whole file; concurrent
    writers would corrupt it.
    """
    print("\n It`s going to load the %s page data." % page)
    url = "http://kaijiang.zhcw.com/zhcw/html/ssq/list_" + str(page) + ".html"
    # One capture tuple per draw: (date, draw number, red1..red6, blue).
    rule = r"<tr>.*?<td align=\"center\">(.*?)</td>.*?<td align=\"center\">(.*?)</td>.*?<td align=\"center\" style=\"padding-left:10px;\">.*?<em class=\"rr\">(.*?)</em>.*?<em class=\"rr\">(.*?)</em>.*?<em class=\"rr\">(.*?)</em>.*?<em class=\"rr\">(.*?)</em>.*?<em class=\"rr\">(.*?)</em>.*?<em class=\"rr\">(.*?)</em>.*?<em>(.*?)</em></td>"
    try:
        response = requests.get(url, headers=header, timeout=10)
        time.sleep(2)  # throttle: be polite to the server between fetches
        response.encoding = "utf-8"
        num = re.findall(rule, response.text, re.S | re.M)
        # `with` releases the lock even if update_data_sheet raises; the
        # original acquire/release pair sat inside this try, so an exception
        # was caught but the lock was never released — deadlocking every
        # other worker thread.
        with queueLock:
            update_data_sheet(datafile, num)
    except Exception as e:
        print("Error: %s " % e)
class myThread(threading.Thread):
    """Worker thread that drains the shared page queue, scraping each page."""

    def __init__(self, q):
        threading.Thread.__init__(self)
        self.__q = q  # shared Queue of page numbers still to fetch

    def run(self):
        # get_nowait() + Empty replaces the original's check-then-get race:
        # q.empty() followed by a blocking q.get() can hang forever when
        # another worker grabs the last item in between the two calls.
        while True:
            try:
                page = self.__q.get_nowait()
            except Empty:
                break
            get_current_page(page)
            time.sleep(1)
if __name__ == '__main__':  # original `if name == 'main'` raised NameError
    # Module-level shared state read by the functions above (a `global`
    # statement at module scope is a no-op and was dropped).
    datafile = "doubleball.xls"
    base_url = "http://kaijiang.zhcw.com/zhcw/inc/ssq/ssq_wqhg.jsp"
    header = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/40.0.2214.115 Safari/537.36'}

    total_page = get_Total_Page()
    if total_page is None:
        # The original called int(None) here and died with a TypeError.
        raise SystemExit("Could not determine the total page count; aborting.")
    total = int(total_page)

    create_data_sheet()
    queueLock = threading.Lock()  # serializes spreadsheet writes
    workQueue = Queue()
    # Queue is thread-safe and no worker has started yet, so no lock is
    # needed around the puts (the original wrapped them in queueLock).
    for page in range(1, total + 1):
        workQueue.put(page)

    # Original `range(1, thread_count)` spawned only 9 of the intended 10.
    thread_count = 10
    threads = [myThread(workQueue) for _ in range(thread_count)]
    for t in threads:
        t.start()
    for t in threads:
        t.join()