要求:
监管信息
主要字段:来源(如证监会)、类型(如:证监会要闻/行政处罚)、标题、时间、文章地址、内容(非必须,正文)
抓取频率:每日9点、12点、16点(下午4点)
# -*- coding: utf-8 -*-
"""
Created on Wed May 02 16:43:10 2018

@author: TY

Scraper for the SAC (Securities Association of China) site
http://www.sac.net.cn/hyfw/ — walks two list sections ("hydt" industry
news and "cxjs/gongshi" integrity-notice pages), follows each article
link and saves the article body as a .txt file under C://pchomework/.
A simple scheduler re-runs the crawl daily at 09:00, 12:00 and 16:00.
"""
import requests
import pandas as pd
from bs4 import BeautifulSoup
import os
import codecs
import time
import datetime
import re
import json
# import cx_Oracle
from sqlalchemy import create_engine
import sys


def pachong():
    """Crawl both list sections and write each article's text to disk.

    Side effects: creates C://pchomework/<section>/ directories and one
    UTF-8 .txt file per article. Returns None.
    """
    print(u'这个程序要开始疯狂的运转啦')
    # The two section sub-paths to crawl (also used as output folder names).
    f_name = ['hydt/', 'cxjs/gongshi/']
    for t in range(2):
        url = 'http://www.sac.net.cn/hyfw/' + f_name[t]
        print(url)
        print(f_name[t])
        # Up to 50 list pages per section: index.html, index_1.html, ...
        for i in range(50):
            if i == 0:
                url1 = url + 'index.html'
            else:
                url1 = url + 'index_' + str(i) + '.html'
            print(url1)
            headers = {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'}
            # timeout added so a stalled connection cannot hang the crawl forever
            response = requests.get(url1, headers=headers, timeout=30)
            # The site serves UTF-8; tell requests how to decode it.
            response.encoding = 'utf-8'
            bs = BeautifulSoup(response.text, 'html.parser')
            out_dir = 'C://pchomework/' + f_name[t]
            # makedirs creates intermediate directories too; exist_ok avoids
            # the exists()+mkdir() race of the original.
            os.makedirs(out_dir, exist_ok=True)
            print(url[0:21])
            table = bs.find_all('table')
            for a in table:
                b = a.find_all('td', attrs={'class': 'pad_le30 hei_000'})
                for c in b:
                    d = c.find_all('a')
                    if not d:
                        # Cell without a link — original would IndexError here.
                        continue
                    href = d[0].get('href')
                    if not href:
                        continue
                    url2 = url + href
                    print(d[0].string)
                    print(url2)
                    # The publish date sits at a fixed offset inside the URL;
                    # the offset differs between the two sections.
                    # (Renamed from `time`, which shadowed the time module.)
                    if t == 0:
                        pub_date = url2[42:50]
                    else:
                        pub_date = url2[50:58]
                    print(pub_date)
                    headers1 = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11'}
                    response1 = requests.get(url2, headers=headers1, timeout=30)
                    response1.encoding = 'utf-8'
                    bs1 = BeautifulSoup(response1.content, 'html.parser')
                    # Article body lives in <div class="hei14"> — may be absent.
                    code_div = bs1.find('div', attrs={'class': 'hei14'})
                    k = ''
                    if code_div is not None:
                        k = code_div.get_text()
                    # BUG FIX: original appended u'txt' with no dot, producing
                    # files like "标题txt". Fall back to the date when the link
                    # has no string (avoids concatenating None).
                    title_text = d[0].string or pub_date
                    news_title = u'C://pchomework/' + f_name[t] + title_text + u'.txt'
                    # `with` guarantees the file is closed even on write errors.
                    with codecs.open(news_title, 'w', u'utf-8') as fp:
                        fp.write(k)
                    print('================================================================================================')
    print('程序结束')
    print('爬虫已经工作完毕!')


def main(h, m):
    """Run the crawler when (h, m) hits a scheduled slot; otherwise wait.

    h, m: current hour and minute (ints, as from datetime.now()).
    Scheduled slots are 09:00, 12:00 and 16:00 local time.
    """
    # Loop instead of the original main->sleep->nowtime->main mutual
    # recursion, which grew the call stack by one frame per check and
    # would exceed the recursion limit within a day.
    while True:
        if (h, m) in ((9, 0), (12, 0), (16, 0)):
            pachong()
            break
        print('主人稍安勿燥,爬虫正在等待时间。。。')
        # BUG FIX: the original slept 3600 s between checks, so the
        # one-minute m == 0 window was almost always missed; the original
        # comment already said "check every 60 seconds".
        time.sleep(60)
        now = datetime.datetime.now()
        h, m = now.hour, now.minute
    print('程序结束!')


def nowtime():
    """Read the current local time and hand it to main() for scheduling."""
    now = datetime.datetime.now()
    print(now.hour, now.minute)
    main(now.hour, now.minute)


if __name__ == '__main__':
    nowtime()