要求:
监管信息
主要字段:来源(如证监会)、类型(如:证监会要闻/行政处罚)、标题、时间、文章地址、内容(非必须,正文)
抓取频率:每日9点、12点、16点(下午4点)
# -*- coding: utf-8 -*-
"""
Created on Wed May 02 16:43:10 2018

@author: TY

Scraper for the SAC (Securities Association of China) site
http://www.sac.net.cn/hyfw/ — walks two list sections ("hydt" industry
news and "cxjs/gongshi" integrity-notice pages), follows each article
link and saves the article body as a .txt file under C://pchomework/.
A simple scheduler re-runs the crawl daily at 09:00, 12:00 and 16:00.
"""
import requests
import pandas as pd
from bs4 import BeautifulSoup
import os
import codecs
import time
import datetime
import re
import json
# import cx_Oracle
from sqlalchemy import create_engine
import sys


def pachong():
    """Crawl both list sections and write each article's text to disk.

    Side effects: creates C://pchomework/<section>/ directories and one
    UTF-8 .txt file per article. Returns None.
    """
    print(u'这个程序要开始疯狂的运转啦')
    # The two section sub-paths to crawl (also used as output folder names).
    f_name = ['hydt/', 'cxjs/gongshi/']
    for t in range(2):
        url = 'http://www.sac.net.cn/hyfw/' + f_name[t]
        print(url)
        print(f_name[t])
        # Up to 50 list pages per section: index.html, index_1.html, ...
        for i in range(50):
            if i == 0:
                url1 = url + 'index.html'
            else:
                url1 = url + 'index_' + str(i) + '.html'
            print(url1)
            headers = {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'}
            # timeout added so a stalled connection cannot hang the crawl forever
            response = requests.get(url1, headers=headers, timeout=30)
            # The site serves UTF-8; tell requests how to decode it.
            response.encoding = 'utf-8'
            bs = BeautifulSoup(response.text, 'html.parser')
            out_dir = 'C://pchomework/' + f_name[t]
            # makedirs creates intermediate directories too; exist_ok avoids
            # the exists()+mkdir() race of the original.
            os.makedirs(out_dir, exist_ok=True)
            print(url[0:21])
            table = bs.find_all('table')
            for a in table:
                b = a.find_all('td', attrs={'class': 'pad_le30 hei_000'})
                for c in b:
                    d = c.find_all('a')
                    if not d:
                        # Cell without a link — original would IndexError here.
                        continue
                    href = d[0].get('href')
                    if not href:
                        continue
                    url2 = url + href
                    print(d[0].string)
                    print(url2)
                    # The publish date sits at a fixed offset inside the URL;
                    # the offset differs between the two sections.
                    # (Renamed from `time`, which shadowed the time module.)
                    if t == 0:
                        pub_date = url2[42:50]
                    else:
                        pub_date = url2[50:58]
                    print(pub_date)
                    headers1 = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11'}
                    response1 = requests.get(url2, headers=headers1, timeout=30)
                    response1.encoding = 'utf-8'
                    bs1 = BeautifulSoup(response1.content, 'html.parser')
                    # Article body lives in <div class="hei14"> — may be absent.
                    code_div = bs1.find('div', attrs={'class': 'hei14'})
                    k = ''
                    if code_div is not None:
                        k = code_div.get_text()
                    # BUG FIX: original appended u'txt' with no dot, producing
                    # files like "标题txt". Fall back to the date when the link
                    # has no string (avoids concatenating None).
                    title_text = d[0].string or pub_date
                    news_title = u'C://pchomework/' + f_name[t] + title_text + u'.txt'
                    # `with` guarantees the file is closed even on write errors.
                    with codecs.open(news_title, 'w', u'utf-8') as fp:
                        fp.write(k)
                    print('================================================================================================')
    print('程序结束')
    print('爬虫已经工作完毕!')


def main(h, m):
    """Run the crawler when (h, m) hits a scheduled slot; otherwise wait.

    h, m: current hour and minute (ints, as from datetime.now()).
    Scheduled slots are 09:00, 12:00 and 16:00 local time.
    """
    # Loop instead of the original main->sleep->nowtime->main mutual
    # recursion, which grew the call stack by one frame per check and
    # would exceed the recursion limit within a day.
    while True:
        if (h, m) in ((9, 0), (12, 0), (16, 0)):
            pachong()
            break
        print('主人稍安勿燥,爬虫正在等待时间。。。')
        # BUG FIX: the original slept 3600 s between checks, so the
        # one-minute m == 0 window was almost always missed; the original
        # comment already said "check every 60 seconds".
        time.sleep(60)
        now = datetime.datetime.now()
        h, m = now.hour, now.minute
    print('程序结束!')


def nowtime():
    """Read the current local time and hand it to main() for scheduling."""
    now = datetime.datetime.now()
    print(now.hour, now.minute)
    main(now.hour, now.minute)


if __name__ == '__main__':
    nowtime()