# 版权声明:本文为博主原创文章,未经博主允许不得转载。 https://blog.csdn.net/XBR_2014/article/details/81609865
# coding:utf-8
import threading
import urllib
import re,sys
import time
import hashlib
import os
from urllib import pathname2url
def crawl_data(Chinese,English):
user_agent ='"Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 \
(KHTML, like Gecko) Chrome/38.0.2125.122 Safari/537.36"'
headers = { 'User-Agent' : user_agent }
md5 = ''
while True:
print Chinese
url = 'http://www.pm25.in/' + pathname2url(Chinese).encode('gbk') # 爬虫目标网址,确保IP地址没被封
print url
html = urllib.urlopen(url)
text = html.read()
contents = re.findall('<td(?:.*?)?>(.*?)</td>',text,re.S)#正则pm2.5等污染物数据
data_time = re.findall("\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2}",text,re.S) #正则寻找当前时间 例如,2016-04-13 20:10:00
print data_time
md52 = hashlib.md5()
md52.update(data_time[0])
if md52.hexdigest() == md5:
time.sleep(3600) # 自动休眠,每一个小时爬一次数据
continue
md5 = md52.hexdigest()
file_path = 'D:\\PM25\\' + English
print file_path
mkdir(file_path) #调用函数
tempdata = open(file_path+'\\' + English + '-' + \
data_time[0].split(' ')[0] + '-' + \
data_time[0].split(' ')[1][0:2] + \
'.txt','a')
for index in range(0,len(contents),11):
tempdata.write(','.join(contents[index:index+11])+ '\n')
tempdata.close()
# print 'PM2.5更新时间:'
# print data_time[0]
# print "当前时间:"
# print time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(time.time()))# 显示当前时间
time.sleep(3600)
def mkdir(path):
    """Create directory *path* (and any missing parents) if it does not exist.

    Prints the path it was asked for; prints a notice instead of failing
    when the directory is already there.
    """
    print(path)
    if not os.path.exists(path):
        # makedirs also creates any missing intermediate directories.
        os.makedirs(path)
    else:
        # fix: the original had a bare string statement here -- a no-op;
        # actually emit the intended "folder already exists" message.
        print("文件夹已存在!")
if __name__ == "__main__":
    # Each non-empty line of cities.txt is "EnglishName,ChineseName";
    # one crawler thread is spawned per city.
    with open('D:/crawl_PM25/cities.txt') as city_file:  # fix: original never closed the file
        # fix: skip blank lines -- the original crashed on split(",")[1] for ""
        cities = [line.strip() for line in city_file if line.strip()]
    threads = []
    for city in cities:
        English = city.split(",")[0]
        Chinese = city.split(",")[1]
        threads.append(threading.Thread(target=crawl_data, args=(Chinese, English)))
    for thread in threads:
        thread.start()
        # fix: the original slept between Thread() constructions, which staggers
        # nothing; stagger the actual starts so requests don't fire all at once.
        time.sleep(0.5)
    for thread in threads:
        thread.join()