# 版权声明:本文为博主原创文章,未经博主允许不得转载。 https://blog.csdn.net/XBR_2014/article/details/81609865
# coding:utf-8
import threading
import urllib
import re,sys
import time
import hashlib
import os
from urllib import pathname2url
def crawl_data(Chinese,English):
user_agent ='"Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 \
(KHTML, like Gecko) Chrome/38.0.2125.122 Safari/537.36"'
headers = { 'User-Agent' : user_agent }
md5 = ''
while True:
print Chinese
url = 'http://www.pm25.in/' + pathname2url(Chinese).encode('gbk') # 爬虫目标网址,确保IP地址没被封
print url
html = urllib.urlopen(url)
text = html.read()
contents = re.findall('<td(?:.*?)?>(.*?)</td>',text,re.S)#正则pm2.5等污染物数据
data_time = re.findall("\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2}",text,re.S) #正则寻找当前时间 例如,2016-04-13 20:10:00
print data_time
md52 = hashlib.md5()
md52.update(data_time[0])
if md52.hexdigest() == md5:
time.sleep(3600) # 自动休眠,每一个小时爬一次数据
continue
md5 = md52.hexdigest()
file_path = 'D:\\PM25\\' + English
print file_path
mkdir(file_path) #调用函数
tempdata = open(file_path+'\\' + English + '-' + \
data_time[0].split(' ')[0] + '-' + \
data_time[0].split(' ')[1][0:2] + \
'.txt','a')
for index in range(0,len(contents),11):
tempdata.write(','.join(contents[index:index+11])+ '\n')
tempdata.close()
# print 'PM2.5更新时间:'
# print data_time[0]
# print "当前时间:"
# print time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(time.time()))# 显示当前时间
time.sleep(3600)
def mkdir(path):
    """Create directory *path* (and any missing parents) if it does not exist.

    Prints the path it was asked for; prints a notice instead of failing
    when the directory is already there.
    """
    print(path)
    if not os.path.exists(path):
        # makedirs also creates any missing intermediate directories.
        os.makedirs(path)
    else:
        # fix: the original had a bare string statement here -- a no-op;
        # actually emit the intended "folder already exists" message.
        print("文件夹已存在!")
if __name__ == "__main__":
    # Each non-empty line of cities.txt is "EnglishName,ChineseName";
    # one crawler thread is spawned per city.
    with open('D:/crawl_PM25/cities.txt') as city_file:  # fix: original never closed the file
        # fix: skip blank lines -- the original crashed on split(",")[1] for ""
        cities = [line.strip() for line in city_file if line.strip()]
    threads = []
    for city in cities:
        English = city.split(",")[0]
        Chinese = city.split(",")[1]
        threads.append(threading.Thread(target=crawl_data, args=(Chinese, English)))
    for thread in threads:
        thread.start()
        # fix: the original slept between Thread() constructions, which staggers
        # nothing; stagger the actual starts so requests don't fire all at once.
        time.sleep(0.5)
    for thread in threads:
        thread.join()