# -*-coding:UTF-8-*-
import sys
import re
import urllib
import requests
import threading
import os
from PyQt4 import QtCore, QtGui
# Directory that downloaded images are written to; created here if missing.
filepath = 'D:\\youminimge'
if not os.path.exists(filepath):
    os.mkdir(filepath)
def clear():
    """Remove every entry found in the download directory ``filepath``."""
    # The directory is listed via its decoded (unicode) path, as before.
    names = os.listdir(filepath.decode("utf-8"))
    for name in names:
        os.remove(os.path.join(filepath, name))
# Empty the download directory before anything else happens.
clear()

# Filled by getbaseurl(): titlelist[k] is the title that goes with urllist[k].
titlelist = []
urllist = []

# Pattern sources, kept as module-level names next to their compiled forms.
src = '<img class="picact".*?src="(.*?)"'  # captures the src attribute of a picture tag
detail1 = '<img class="picact".*?<br>(.*?)</p>'  # captures the text between <br> and </p>
member = '<p align="center"><.*?</p>'  # one centered paragraph
img, detail, mem = map(re.compile, (src, detail1, member))
def gethtml(url, timeout=30):
    """Fetch *url* and return the response body with newlines removed.

    The module's regexes are compiled without ``re.DOTALL``, so newlines are
    stripped to let ``.*?`` span what were separate lines of the page.

    :param url: address of the page to download.
    :param timeout: seconds to wait for the server; without a timeout a
        stalled connection would block the calling thread indefinitely.
    :return: ``r.content`` with every newline character removed.
    :raises requests.exceptions.RequestException: on connection failure or
        timeout.
    """
    r = requests.get(url, timeout=timeout)
    return r.content.replace('\n', "")
def getimg(html, s):
    """Download every picture found in *html* into the download directory.

    Each paragraph matched by ``mem`` is scanned for image URLs (``img``) and
    a caption (``detail``).  Files are saved as ``<s>--<caption><ext>`` where
    ``<ext>`` is the last four characters of the image URL.  A paragraph
    without a caption gets a label derived from the running image count on
    this page: "A".."E" for the first five, the numeric count afterwards.

    :param html: page source, as produced by ``gethtml``.
    :param s: page number, used as the file-name prefix.
    :return: None.  An image that cannot be saved is reported on stderr and
        skipped; the remaining images are still attempted.
    """
    fallback = "ABCDE"
    inde = 0  # running count of images seen on this page
    for block in re.findall(mem, html):
        imglist = re.findall(img, block)
        # NOTE(review): both replace() arguments appear as a plain space here.
        # Stripping plain spaces also turns `<img class=` into `<imgclass=`,
        # which `detail` cannot match.  The literals were presumably other
        # whitespace (e.g. a non-breaking or full-width space) before the
        # source was copied -- confirm against the original script.
        titles = re.findall(detail, block.replace(' ', '').replace(' ', ''))
        for link in imglist:
            if titles:
                tit = titles[0]
            elif inde < len(fallback):
                tit = fallback[inde]
            else:
                # The fixed five-letter table used to raise IndexError here.
                tit = str(inde)
            inde += 1
            try:
                name = tit.decode("utf-8")
                # Characters Windows forbids in file names made the save fail.
                name = re.sub(u'[\\\\/:*?"<>|]', u'_', name)
                target = os.path.join(filepath, str(s) + "--" + name + link[-4:])
                urllib.urlretrieve(link, target)
            except (IOError, UnicodeError) as err:
                # Best effort: report the failure and move on to the next image.
                sys.stderr.write("getimg: skipped %s (%s)\n" % (link, err))
class getmy(threading.Thread):
    """Worker thread that downloads a contiguous range of pages of one article.

    Page 1 is the article URL itself; page N (N > 1) is the same URL with
    ``_N`` inserted before its last six characters (presumably the
    ``.shtml`` suffix).
    """

    def __init__(self, url, begin, end):
        """Store the article URL and the inclusive page range to fetch.

        :param url: URL of the article's first page.
        :param begin: first page number to fetch (inclusive).
        :param end: last page number to fetch (inclusive).
        """
        threading.Thread.__init__(self)
        self.url = url
        self.begin = begin
        self.end = end

    def run(self):
        """Fetch each page in ``begin..end`` and hand it to ``getimg``.

        Every page is guarded on its own, so a failure on one page is
        reported on stderr and the remaining pages are still attempted.
        """
        for page in range(self.begin, self.end + 1):
            try:
                suffix = '' if page == 1 else '_' + str(page)
                murl = self.url[:-6] + suffix + self.url[-6:]
                print(murl)
                getimg(gethtml(murl), page)
            except Exception as err:
                # Thread boundary: keep the worker alive, but say what failed.
                sys.stderr.write("getmy: page %s failed: %r\n" % (page, err))
def getbaseurl(timeout=30):
    """Collect article links and titles from the Gamersky index page.

    Appends to the module-level ``urllist`` and ``titlelist`` together, so
    ``titlelist[k]`` always belongs to ``urllist[k]``.  Anchor blocks that
    lack either a link or a title are skipped.

    :param timeout: seconds to wait for the index page before giving up.
    :return: None.
    :raises requests.exceptions.RequestException: if the index page cannot
        be fetched.
    """
    url = 'http://www.gamersky.com/ent/'  # Gamersky daily picture index page
    s = requests.get(url, timeout=timeout).content
    # HTML regions that contain the links to the picture articles.
    urlhtm1 = '<a class="img1 countHit" target="_blank" .*?</a>'
    urlhtm2 = '<a class="img2 countHit" target="_blank" .*?</a>'
    herfhtm = 'https:.*?shtml'  # article URL inside one of those regions
    title = '<div class="txt">(.*?)</div>'
    urllist1 = re.findall(urlhtm1, s)  # all newest article anchors
    urllist2 = re.findall(urlhtm2, s)
    print(urllist1)
    for i in urllist1 + urllist2:
        link = re.search(herfhtm, i)
        caption = re.search(title, i)
        if link is None or caption is None:
            # Previously an AttributeError; skipping also keeps both lists
            # the same length.
            continue
        text = caption.group(1).decode("utf-8")
        urllist.append(link.group())
        titlelist.append(text)
class HelloPyQt(QtGui.QWidget):
    """Main window: a list of article titles above a "clear" button."""

    def __init__(self, fruit, parent=None):
        """Create the widgets, lay them out vertically and connect signals.

        :param fruit: titles to show in the list widget.
        :param parent: optional parent widget.
        """
        super(HelloPyQt, self).__init__(parent)
        self.setWindowTitle("游民星空".decode("utf-8"))
        self.fruit = fruit

        # Title list; double-clicking a row triggers the download.
        self.fruits = QtGui.QListWidget()
        self.fruits.addItems(fruit)
        self.fruits.itemDoubleClicked.connect(self.item_double_clcik)

        # Button that empties the download directory.
        self.btnPress = QtGui.QPushButton("clear")
        self.btnPress.clicked.connect(self.btnPress_Clicked)

        column = QtGui.QVBoxLayout()
        for widget in (self.fruits, self.btnPress):
            column.addWidget(widget)
        self.setLayout(column)

    def btnPress_Clicked(self):
        """Slot for the "clear" button: delete the downloaded files."""
        clear()

    def item_double_clcik(self, item):
        """Slot for a double-click: download 50 pages of the chosen article.

        :param item: the list item that was double-clicked; its row selects
            the matching entry of ``urllist``.
        """
        targeturl = urllist[self.fruits.row(item)]
        print(targeturl)
        # Ten workers with five consecutive pages each: 1-5, 6-10, ..., 46-50.
        workers = [getmy(targeturl, first, first + 4) for first in range(1, 51, 5)]
        for worker in workers:
            worker.start()
        # NOTE(review): the joins run inside the slot handler, so the window
        # presumably stays unresponsive until every worker has finished.
        for worker in workers:
            worker.join()
if __name__ == '__main__':
    # Scrape the index page first so titlelist is filled before the window
    # is built from it.
    getbaseurl()
    app = QtGui.QApplication(sys.argv)
    mainWindow = HelloPyQt(titlelist)
    mainWindow.show()
    exit_code = app.exec_()
    sys.exit(exit_code)
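# Python: scraping Gamersky's daily funny pictures (2019 edition)
# Reposted from blog.csdn.net/andylou_1/article/details/88233035
# (Web-page navigation residue: "You may also like", "Today's picks", "Weekly ranking".)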