# 版权声明:仅限学习使用 (Copyright notice: for study use only) https://blog.csdn.net/u014590889/article/details/84926880
import re
import requests
import time
# Module-level configuration and shared state ("macro definitions").
# Earlier value of title, left commented out:
#title = 'https://8*8*5*r*i*.com'
# Base URL: fetched first by the script, and prepended to every relative link it follows.
title = 'http://www.gaoqing.la/'
# Path prefix of the generated .txt files; writeToTxt appends a timestamp and a flag to it.
txtRoute = 'D:\\MySeGF\\'
# Lines collected for the result file.
contextGF = []
# Values recorded by IsWriteLog while logPrintDebug is 0.
logList = []
# Count of exceptions swallowed by serchChildHtml and writeToTxt.
failFlag = 0
# Mode read by IsWriteLog: 0 prints and records, 1 only prints, any other value does neither.
logPrintDebug = 0
# Function definitions
def getTime():
    """Return the current local time formatted as 'YYYY-MM-DD HH:MM:SS'.

    The formatted value is also handed to IsWriteLog under the label
    'curTime:' before it is returned.
    """
    now = time.localtime()
    stamp = time.strftime('%Y-%m-%d %H:%M:%S', now)
    IsWriteLog('curTime:', stamp)
    return stamp
def getFileTime():
    """Return the current local time as 'YYYYMMDD_HHMMSS'.

    This is the timestamp form writeToTxt embeds in file names. The value
    is also handed to IsWriteLog under the label 'curTime:'.
    """
    stamp = time.strftime('%Y%m%d_%H%M%S', time.localtime())
    IsWriteLog('curTime:', stamp)
    return stamp
def IsWriteLog(first, *args):
    """Print a label followed by values, optionally recording them in logList.

    The module-level logPrintDebug selects the behaviour:
      * 0 - print every item and append each one to logList;
      * 1 - print only;
      * anything else - do nothing.

    first is printed with a trailing space instead of a newline; every
    item of args is printed on its own line.
    """
    mode = logPrintDebug
    if mode not in (0, 1):
        return
    keep = mode == 0

    print(first, end=' ')
    if keep:
        logList.append(first)

    for extra in args:
        print(extra)
        if keep:
            logList.append(extra)
def serchChildHtml(secTitle):
    """Fetch one section page and collect a download entry for every item it links to.

    The page at secTitle is downloaded and its <title> text is appended to
    contextGF as '<title>:<secTitle>'. Every href that follows a
    class="text-overflow" attribute is then fetched (prefixed with the
    module-level title), and a numbered entry holding the extracted file
    name and download URL is appended to contextGF.

    Args:
        secTitle: Absolute URL of the section page to scan.

    Any exception raised while fetching or parsing stops the scan of this
    section; it is counted in the global failFlag and reported through
    IsWriteLog instead of being propagated.
    """
    global failFlag
    try:
        response = requests.get(secTitle)
        response.encoding = 'utf-8'
        IsWriteLog('respose.status_code:', response.status_code)  # HTTP status code
        titleB = re.findall(r'<title>(.*?)</title>', response.text, re.S)[0]
        IsWriteLog('titleB:', titleB)
        contextGF.append(titleB + ':' + secTitle)
        # re.S lets '.' match newlines, so a match may span several lines.
        urls = re.findall(r'class="text-overflow".*?href="(.*?)"', response.text, re.S)
        IsWriteLog('urls:', urls)
        for index, cont in enumerate(urls, start=1):
            url = title + cont
            IsWriteLog('url:', url)
            result = requests.get(url)
            result.encoding = 'utf-8'
            # Report the status of this item's own response, not the section page's.
            IsWriteLog('result.status_code:', result.status_code)
            mp4_url = re.findall(r'.*?download.*?"(.*?)"', result.text, re.S)[0]
            IsWriteLog('mp4_url:', mp4_url)
            fileName = re.findall(r'.*?var downurls.*?"(.*?)高清下载', result.text, re.S)[0]
            IsWriteLog(fileName)
            onelist = str(index).rjust(3, ' ') + ' : ' + fileName.ljust(10, ' ') + ' ' + '\n' + mp4_url
            getTime()
            IsWriteLog('onelist:', onelist)
            contextGF.append(onelist)
    except Exception as exc:
        # Best effort: count the failure and keep the script running, but
        # leave a trace of what went wrong. KeyboardInterrupt and SystemExit
        # are deliberately not caught so the run can still be aborted.
        failFlag = failFlag + 1
        IsWriteLog('serchChildHtml failed:', repr(exc))
def writeToTxt(context, flag):
    """Write every entry of context to a new timestamped text file, one per line.

    The file name is txtRoute + getFileTime() + '_' + flag + '.txt' and is
    reported through IsWriteLog before the file is opened.

    Args:
        context: Iterable of entries to write. Entries that are not
            strings (for example status codes or lists held in logList)
            are converted with str().
        flag: Suffix placed after the timestamp in the file name.

    I/O and encoding errors are not propagated: they are counted in the
    global failFlag and reported through IsWriteLog.
    """
    global failFlag
    TetDown = txtRoute + str(getFileTime()) + '_' + str(flag) + '.txt'
    IsWriteLog(TetDown)
    try:
        # The context manager closes the file even when a write fails.
        # UTF-8 is used because the scraped pages are decoded as UTF-8 and
        # may contain characters the platform default codec cannot encode.
        with open(TetDown, mode='w', encoding='utf-8') as file_handle:
            file_handle.writelines(str(entry) + '\n' for entry in context)
    except (OSError, UnicodeError) as exc:
        failFlag = failFlag + 1
        IsWriteLog('writeToTxt failed:', repr(exc))
def writeToLocal():
    """Write the collected results to disk, plus the log when a failure was counted.

    contextGF is always written, with the current failFlag value as the
    file-name flag. failFlag is read again afterwards; when it is non-zero
    logList is written as well, with the flag 'log'.
    """
    writeToTxt(contextGF, str(failFlag))
    if failFlag == 0:
        return
    writeToTxt(logList, 'log')
if __name__ == "__main__":
    # Script entry point: scan the start page, visit every section it
    # links to, then write the collected output to disk.
    contextGF.append(getTime())
    IsWriteLog(contextGF)

    beginRes = requests.get(title)
    contextGF.append(title)
    IsWriteLog('beginRes.status_code:', beginRes.status_code)  # HTTP status code

    # re.S lets '.' match newlines, so a single link may span several lines.
    titleL = re.findall(r'class=""><a.*?href=.*?"(.*?)".*?target="_blank"', beginRes.text, re.S)
    IsWriteLog('titleL:', titleL)

    for shortUrl in titleL:
        # Follow only links that start with /html, skipping /html/news.
        if not shortUrl.startswith('/html') or shortUrl.startswith('/html/news'):
            continue
        IsWriteLog(shortUrl)
        secTitle = title + shortUrl
        IsWriteLog('secTitle:', secTitle)
        contextGF.append(secTitle)
        serchChildHtml(secTitle)
        time.sleep(5.5)  # pause 5.5 seconds before the next section

    writeToLocal()