Python爬虫学习篇-爬取建筑市场监管公共服务平台

Python爬虫学习篇-爬取建筑市场监管公共服务平台

代码仅供学习交流,请勿用于非法用途,如有侵权,请点击这里联系作者删除

代码仅供学习交流,请勿用于非法用途,如有侵权,请点击这里联系作者删除

代码仅供学习交流,请勿用于非法用途,如有侵权,请点击这里联系作者删除

代码中涉及接口响应的解密,以及每天企业 id、人员 id 的加解密算法(即代码中从 decrypter 模块导入的 decrypt、encryptId、decryptId);本文代码仅供学习交流,这部分实现就不放出来了。

近期比较忙,开发学习方面的分享更新得比较少;如有开发学习方面的问题,可以点击这里通过 QQ 与我探讨哦。

一、代码实现

# -*- coding:utf-8 -*-
import requests
import re
import hashlib
from decrypter import decrypt, encryptId, decryptId
import configparser
import MySQLdb
import time
import random


# Parser holding the settings from config.ini (looked up relative to the
# current working directory).
cf = configparser.ConfigParser()
try:
    # ConfigParser.read() silently skips files it cannot open and returns the
    # list of files it actually parsed, so a missing config.ini shows up as an
    # empty list, not as an exception. exit() raises SystemExit, which is not
    # an Exception subclass and therefore passes through the handler below.
    if not cf.read("config.ini"):
        print("程序目录下不存在config.ini配置文件~")
        exit(0)
except Exception as e:
    # Reached when the file exists but cannot be parsed or decoded.
    print("程序目录下不存在config.ini配置文件~")
    exit(0)


def getConf(sec, key):
    '''
    Look up one option in the loaded configuration.

    :param sec: section name
    :param key: option name inside that section
    :return: the value returned by ``cf.get(sec, key)``; if the lookup fails
             for any reason the missing section/key pair is printed and the
             process is terminated through ``exit(0)``
    '''
    try:
        value = cf.get(sec, key)
    except Exception:
        print("未得到以下配置:" + sec + " - " + key)
        exit(0)
    return value

# -------------------------------------------------
# Runtime settings. Every getConf() call below terminates the process when the
# requested option is missing, so the lookups keep a fixed order.

# Paging window spliced into the "limit <offset>,<limit>" clause of the tid
# queries; kept as strings because they are concatenated into SQL text.
offset, limit = (str(getConf("app-sys", name)) for name in ("offset", "limit"))

# MySQL account, password, schema name, host and port.
mysql_user, mysql_password, mysql_database, mysql_host, mysql_port = (
    getConf("Mysql-Database", name)
    for name in ("user", "password", "database", "host", "port")
)

# Access token sent in the "accessToken" request header.
token = getConf("web-param", "token")

# Bounds of the random pause taken after a request, in milliseconds
# (getSleepTime divides the drawn value by 1000 before sleeping).
min_sleep, max_sleep = (int(getConf("app-sys", name)) for name in ("min_sleep", "max_sleep"))

# Timeout handed to requests.get, and the number of attempts getHtml makes.
timeout, retry = 20, 3

# Headers attached to every API request.
headers = {
    "Referer": "http://jzsc.mohurd.gov.cn/data/company",
    "timeout": "30000",
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; WOW64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/65.0.3314.0 Safari/537.36 SE 2.X MetaSr 1.0"
    ),
    "accessToken": token,
}


def getSleepTime():
    '''
    Draw a random pause length.

    :return: a float number of seconds, obtained by picking a whole number of
             milliseconds between ``min_sleep`` and ``max_sleep`` (both
             inclusive) and dividing it by 1000
    '''
    millis = random.randint(min_sleep, max_sleep)
    seconds = millis / 1000
    return float(seconds)


def execSql(sql, params=None):
    '''
    Execute one write statement on a fresh MySQL connection and commit it.

    :param sql: SQL text to execute
    :param params: optional values handed to ``cursor.execute`` so the driver
                   binds them to the placeholders in ``sql``; when omitted the
                   statement is executed exactly as given
    :return: True when the statement ran and was committed, False on any error
    '''
    conn = None
    try:
        conn = MySQLdb.connect(user=mysql_user, password=mysql_password, host=mysql_host,
                               port=int(mysql_port), database=mysql_database, charset='utf8')
        cursor = conn.cursor()
        cursor.execute(sql, params)
        conn.commit()
        return True
    except Exception as e:
        # Callers only look at the boolean result, so report the cause here
        # instead of dropping it.
        print("SQL执行失败: %s" % e)
        return False
    finally:
        # One connection is opened per call; always release it, otherwise every
        # statement leaves a connection open on the server.
        if conn is not None:
            try:
                conn.close()
            except Exception:
                pass


def querySql(sql):
    '''
    Run one query on a fresh MySQL connection and return all rows.

    :param sql: SQL text to execute
    :return: the result of ``cursor.fetchall()``, or None when connecting or
             executing fails
    '''
    conn = None
    try:
        conn = MySQLdb.connect(user=mysql_user, password=mysql_password, host=mysql_host,
                               port=int(mysql_port), database=mysql_database, charset='utf8')
        cursor = conn.cursor()
        cursor.execute(sql)
        return cursor.fetchall()
    except Exception as e:
        # Callers treat None as "no data"; print the cause so a database
        # problem is not mistaken for an empty table.
        print("SQL查询失败: %s" % e)
        return None
    finally:
        # One connection is opened per call; always release it.
        if conn is not None:
            try:
                conn.close()
            except Exception:
                pass


def getHash(s):
    '''
    Compute the MD5 digest of a string.

    :param s: text to hash; it is encoded as UTF-8 first
    :return: the digest as a lowercase hexadecimal string
    '''
    encoded = s.encode("utf-8")
    return hashlib.md5(encoded).hexdigest()


def updateToken(url):
    '''
    Block until a token re-read from config.ini is accepted for ``url``.

    Each round re-reads config.ini, stores its token in the module-level
    ``token`` and ``headers``, waits five seconds and then requests ``url``.
    The function returns only once the decrypted response carries code 200, so
    it waits for as long as it takes the operator to put a valid token into
    the file.

    :param url: URL used to probe whether the current token is accepted
    :return: None
    '''
    global token
    global headers
    while True:
        try:
            # NOTE: ConfigParser.read() ignores files it cannot open, so this
            # handler only fires for a file that exists but cannot be parsed.
            cf.read("config.ini")
        except Exception as e:
            print("程序目录下不存在config.ini配置文件~")
            exit(0)
        token = getConf("web-param", "token")
        print("token失效,请更换~\n目前token: %s" % token)
        time.sleep(5)
        # Same headers as before with only the token swapped; rebinding the
        # global makes later requests pick the new token up.
        headers = dict(headers, accessToken=token)
        try:
            resp = requests.get(url, headers=headers, timeout=timeout)
            content = decrypt(resp.content.decode("utf-8"))
            if str(content['code']) == "200":
                print("token更新成功~")
                return
        except Exception as e:
            # Keep polling, but show why the probe failed instead of hiding it.
            print("token校验请求失败: %s" % e)


def _fetchDecrypted(url):
    # Issue one GET with the current module-level headers and return the
    # decrypted response object.
    resp = requests.get(url, headers=headers, timeout=timeout)
    return decrypt(resp.content.decode("utf-8"))


def getHtml(url):
    '''
    Fetch ``url`` and return the decrypted response, trying up to ``retry`` times.

    A response with code 408 triggers ``updateToken``; the request is then
    repeated within the same attempt, so a token refresh on the last attempt
    no longer ends in an empty result. After a usable response the function
    sleeps for ``getSleepTime()`` seconds before returning.

    :param url: address to request
    :return: the decrypted response, or None when every attempt failed
    '''
    for attempt in range(retry):
        try:
            content = _fetchDecrypted(url)
            if str(content['code']) == "408":
                updateToken(url)
                content = _fetchDecrypted(url)
                if str(content['code']) == "408":
                    # Still rejected right after a refresh: spend the next attempt.
                    continue
            time.sleep(getSleepTime())
            return content
        except Exception as e:
            # Failed attempts are retried immediately; report the cause
            # instead of hiding it.
            print("请求失败(第%d次): %s - %s" % (attempt + 1, url, e))
    return None


def getCompanyList(page):
    '''
    Fetch one page of the company listing.

    :param page: page index placed in the ``pg`` query parameter
    :return: the value found at ``['data']['list']`` of the decrypted response,
             or None when the request yields nothing or that path is missing
    '''
    endpoint = "http://jzsc.mohurd.gov.cn/api/webApi/dataservice/query/comp/list"
    payload = getHtml(endpoint + "?pg=" + str(page) + "&pgsz=15&total=450")
    if not payload:
        return None
    try:
        companies = payload['data']['list']
    except Exception:
        return None
    return companies


def getAreas(s):
    '''
    Split a "left-right" string at its last hyphen.

    :param s: text such as "province-city"
    :return: a ``(left, right)`` tuple taken from the first match of the
             pattern, or None when ``s`` holds no hyphen or is not a string
    '''
    pattern = re.compile("(.*)-(.*)")
    try:
        # search() yields the same first match that findall()[0] would; with no
        # match it returns None, and .groups() on None raises into the handler.
        return pattern.search(s).groups()
    except Exception:
        return None


def tsToDate(ts):
    '''
    Convert a millisecond Unix timestamp to a local-time date string.

    The value is converted arithmetically rather than by inserting a decimal
    point into its text form, so floats and very short values work as well.

    :param ts: milliseconds since the epoch, as an int, a float or a numeric string
    :return: the date in the local time zone, formatted as "YYYY-MM-DD"
    :raises ValueError, TypeError: when ``ts`` cannot be converted to a number
    '''
    seconds = float(ts) / 1000
    return time.strftime("%Y-%m-%d", time.localtime(seconds))


def getCompanyDetail(id, tid):
    '''
    Fetch one company's basic information and store it.

    The company is first registered through ``addCompany``; only when that
    succeeds is a row written to ``jianshe_qiyes`` and the ``qiye_jiben`` flag set.

    NOTE(review): the scraped values are interpolated directly into the SQL
    text, so a value containing a quote breaks the statement.

    :param id: encrypted company id, appended to the request URL
    :param tid: numeric company id used as the key of the stored rows
    :return: True when the company row was inserted, otherwise False
    '''
    def pick(container, key):
        # container[key], or "" when the lookup fails for any reason.
        try:
            return container[key]
        except Exception:
            return ""

    payload = getHtml("http://jzsc.mohurd.gov.cn/api/webApi/dataservice/query/comp/compDetail?compId=" + id)
    print("获取企业基本信息~")
    if payload:
        try:
            compMap = payload['data']['compMap']
            print("企业名称:%s " % compMap['QY_NAME'])
            if addCompany(tid, compMap['QY_NAME']):
                # The region arrives as one "left-right" string; getAreas
                # splits it and gives None when there is no hyphen.
                try:
                    region = getAreas(compMap['QY_REGION_NAME'])
                except Exception:
                    region = ""
                company = [
                    tid,
                    pick(compMap, 'QY_ORG_CODE'),
                    pick(compMap, 'QY_FR_NAME'),
                    pick(region, 0),
                    pick(region, 1),
                    pick(compMap, 'QY_ADDR'),
                    pick(compMap, 'QY_NAME'),
                    pick(compMap, 'QY_GSZCLX_NAME'),
                ]
                print("企业基本信息:%s ~" % str(company))
                values = tuple([int(company[0])] + company[1:])
                insertSql = "insert jianshe_qiyes(tid, hao, ren, diqu, city, qiye_dizhi, qiye_ming, qiye_leixing) values(%d, '%s', '%s', '%s', '%s', '%s', '%s', '%s')" % values
                if execSql(insertSql):
                    updateCompany(tid, "qiye_jiben")
                    return True
                return False
        except Exception:
            pass
    print("不存在该企业~")
    return False


def getCaDetailList(id, tid):
    '''
    Fetch a company's qualification certificates and store every one of them.

    The original loop returned after the first certificate; all entries of the
    returned page are now inserted, and the ``qiye_zizhi`` flag is set once
    when at least one insert succeeded.

    NOTE(review): the scraped values are interpolated directly into the SQL
    text, so a value containing a quote makes that insert fail.

    :param id: encrypted company id, appended to the request URL
    :param tid: numeric company id used as the key of the stored rows
    :return: True when at least one certificate was stored, False when
             certificates were found but none could be stored, None when no
             certificate data was available
    '''
    def field(record, key):
        # record[key], or "" when the lookup fails for any reason.
        try:
            return record[key]
        except Exception:
            return ""

    def dateField(record, key):
        # The timestamp under key formatted as a date, or "" when it is
        # missing or cannot be converted.
        try:
            return tsToDate(record[key])
        except Exception:
            return ""

    url = "http://jzsc.mohurd.gov.cn/api/webApi/dataservice/query/comp/caDetailList?qyId=" + id + "&pg=0&pgsz=15"
    html = getHtml(url)
    print("获取企业资质证证书信息~")
    if html:
        try:
            certs = list(html['data']['pageList']['list'])
            tidValue = int(tid)
        except Exception as e:
            print("无企业资质信息~")
            return
        saved = 0
        for cert in certs:
            try:
                nameHash = getHash(str(tid) + cert['APT_NAME'])
            except Exception:
                nameHash = ""
            row = (
                tidValue,
                nameHash,
                field(cert, 'APT_CERTNO'),
                field(cert, 'APT_NAME'),
                field(cert, 'APT_TYPE_NAME'),
                dateField(cert, 'APT_GET_DATE'),
                dateField(cert, 'APT_EDATE'),
                field(cert, 'APT_GRANT_UNIT'),
            )
            sql = "insert jianshe_zizhis(tid, `hash`, hao, ming, leibie, riqi, youxiaoqi, jiguan) values(%d, '%s', '%s', '%s', '%s', '%s', '%s', '%s')" % row
            if execSql(sql):
                saved += 1
        if saved:
            updateCompany(tid, "qiye_zizhi")
            return True
        if certs:
            return False
    print("无企业资质信息~")
    return


def getRegStaffList(id, tid):
    '''
    Fetch a company's registered staff and store every person on the page.

    The original loop stopped after the first stored person, and a person whose
    id could not be decrypted aborted the whole list. Each person is now
    handled on their own: the row is inserted, then ``getStaff`` and
    ``getRegCert`` are called for that person. The ``qiye_renyuan`` flag is set
    once when at least one person was stored.

    NOTE(review): the scraped values are interpolated directly into the SQL
    text, so a value containing a quote makes that insert fail.

    :param id: encrypted company id, appended to the request URL
    :param tid: numeric company id used as the key of the stored rows
    :return: None
    '''
    def field(record, key):
        # record[key], or "" when the lookup fails for any reason.
        try:
            return record[key]
        except Exception:
            return ""

    url = "http://jzsc.mohurd.gov.cn/api/webApi/dataservice/query/comp/regStaffList?qyId=" + str(id) + "&pg=0&pgsz=15"
    html = getHtml(url)
    print("获取企业人员新信息~")
    if html:
        try:
            people = list(html['data']['pageList']['list'])
        except Exception as e:
            return
        saved = 0
        for person in people:
            try:
                # Without a numeric person id the row cannot be keyed and the
                # follow-up lookups have nothing to attach to, so skip it.
                try:
                    rid = int(decryptId(person['RY_ID']))
                except Exception as e:
                    print("人员id解密失败,跳过该人员: %s" % e)
                    continue
                reg = [
                    tid,
                    rid,
                    field(person, 'REG_SEAL_CODE'),
                    field(person, 'RY_NAME'),
                    field(person, 'IDCARD'),
                    field(person, 'REG_TYPE_NAME'),
                    field(person, 'REG_PROF_NAME'),
                ]
                print("企业人员信息:%s ~" % str(reg))
                sql = "insert jianshe_qiyes_renyuans(tid, rid, hao, xingming, shenfengzheng, leibie, zhuanye) values(%d, %d, '%s', '%s', '%s', '%s', '%s')" % tuple(reg)
                if not execSql(sql):
                    continue
                saved += 1
                # Basic information of the person.
                getStaff(person['RY_ID'], rid, tid)
                # Professional registration of the person.
                getRegCert(person['RY_ID'], rid, tid)
            except Exception as e:
                # One bad record must not stop the remaining people.
                print("企业人员信息处理失败: %s" % e)
        if saved:
            updateCompany(tid, "qiye_renyuan")
            return
    print("无企业人员信息~")
    return


def staffPerformanceListSys(id, rid, tid):
    '''
    Fetch a person's project records and store every record on the page.

    The original loop returned after the first stored record; all records are
    now inserted.

    NOTE(review): the scraped values are interpolated directly into the SQL
    text, so a value containing a quote makes that insert fail.

    :param id: encrypted person id, appended to the request URL
    :param rid: numeric person id used as the key of the stored rows
    :param tid: company id; not used by this function
    :return: True when at least one record was stored, False when nothing was
             stored, None when the response lacks the expected list
    '''
    def field(record, key):
        # record[key], or "" when the lookup fails for any reason.
        try:
            return record[key]
        except Exception:
            return ""

    url = "http://jzsc.mohurd.gov.cn/api/webApi/dataservice/query/staff/staffPerformanceListSys?staffId=" + str(id) + "&pg=0"
    resp = getHtml(url)
    if not resp:
        return False
    try:
        data = resp['data']['pageList']['list']
    except Exception as e:
        return
    if not data:
        return False
    saved = False
    for item in data:
        try:
            location = item['PROVINCE'] + item['CITY']
        except Exception:
            location = ""
        row = (
            int(rid),
            str(field(item, 'PRJNUM')),
            str(field(item, 'PRJNAME')),
            str(location),
            str(field(item, 'PRJTYPENUM')),
            str(field(item, 'BUILDCORPNAME')),
        )
        sql = "insert jianshe_renyuans_yejis(rid, xiangmu_bianhao, xiangmu_mingcheng, xiangmu_shudi, xiangmu_leibie, jianshe_danwei) values(%d, '%s', '%s',  '%s', '%s', '%s')" % row
        if execSql(sql):
            saved = True
    return saved


def getRegCert(staffId, rid, tid):
    '''
    Fetch a person's professional registration and store its first entry.

    Only the first element of ``regCertList`` is written; on success the
    ``renyuan_zizhi`` flag is set for the company.

    NOTE(review): the scraped values are interpolated directly into the SQL
    text, so a value containing a quote makes the insert fail.

    :param staffId: encrypted person id, appended to the request URL
    :param rid: numeric person id used as the key of the stored row
    :param tid: company id whose progress flag is updated
    :return: True when the row was stored, False when nothing was fetched or
             the insert failed, None when the response has no registration entry
    '''
    def pick(record, key):
        # record[key], or "" when the lookup fails for any reason.
        try:
            return record[key]
        except Exception:
            return ""

    payload = getHtml("http://jzsc.mohurd.gov.cn/api/webApi/dataservice/query/staff/staffDetail?staffId=" + str(staffId))
    print("获取人员职业注册信息~")
    if payload:
        try:
            regCert = payload['data']['regCertList'][0]
        except Exception:
            return
        try:
            expiry = tsToDate(regCert['REG_EDATE'])
        except Exception:
            expiry = ""
        staff = [rid]
        for key in ('REG_TYPE_NAME', 'REG_PROF_NAME', 'QY_NAME', 'REG_CERTNO', 'CERT_REG_NO'):
            staff.append(pick(regCert, key))
        staff.append(expiry)
        print("人员职业注册信息:%s ~" % str(staff))
        values = tuple([int(staff[0])] + [str(item) for item in staff[1:]])
        insertSql = "insert jianshe_renyuans_zhiyes(rid, zhuce_leibie, zhuce_zhuanye, zhuce_danwei, zhengshu_bianhao, zhiye_yinzhanghao, youxiaoqi) values(%d, '%s', '%s', '%s', '%s', '%s', '%s')" % values
        if execSql(insertSql):
            updateCompany(tid, "renyuan_zizhi")
            return True
    print("无人员职业注册信息~")
    return False


def getStaff(staffId, rid, tid):
    '''
    Fetch a person's basic information and store it.

    On success the ``renyuan_jiben`` flag is set for the company.

    NOTE(review): the scraped values are interpolated directly into the SQL
    text, so a value containing a quote makes the insert fail.

    :param staffId: encrypted person id, appended to the request URL
    :param rid: numeric person id used as the key of the stored row
    :param tid: company id whose progress flag is updated
    :return: True when the row was stored, False when nothing was fetched or
             the insert failed, None when the response has no ``staffMap``
    '''
    def pick(record, key):
        # record[key], or "" when the lookup fails for any reason.
        try:
            return record[key]
        except Exception:
            return ""

    payload = getHtml("http://jzsc.mohurd.gov.cn/api/webApi/dataservice/query/staff/staffDetail?staffId=" + str(staffId))
    print("获取人员基本信息~")
    if payload:
        try:
            staffMap = payload['data']['staffMap']
        except Exception:
            return
        staff = [rid] + [
            pick(staffMap, key)
            for key in ('RY_NAME', 'RY_SEX_NAME', 'RY_CARDTYPE_NAME', 'IDCARD')
        ]
        print("人员基本信息: %s ~" % str(staff))
        values = tuple([int(staff[0])] + staff[1:])
        insertSql = "insert jianshe_renyuans_basics(rid, xingming, xingbie, zhengjian_leixing, zhengjian_haoma) values(%d, '%s', '%s', '%s', '%s')" % values
        if execSql(insertSql):
            updateCompany(tid, "renyuan_jiben")
            return True
    print("无人员基本信息~")
    return False


def getRandCompanyId():
    '''
    Return the ``QY_ID`` of the first company on page 0 of the listing.

    :return: that id, or None when the listing is unavailable, empty, or its
             first entry has no ``QY_ID``
    '''
    firstPage = getCompanyList(0)
    try:
        firstCompany = firstPage[0]
        return firstCompany['QY_ID']
    except Exception:
        return None


def qyueryMaxId():
    '''
    Read the first ``tid`` of ``jianshe_qiyes`` in descending order, within the
    configured ``offset``/``limit`` window.

    :return: that ``tid``, or None when the query fails or returns no row
    '''
    conn = None
    try:
        conn = MySQLdb.connect(user=mysql_user, password=mysql_password, database=mysql_database,
                               host=mysql_host, port=int(mysql_port), charset='utf8')
        cursor = conn.cursor()
        cursor.execute("select tid from jianshe_qiyes order by tid desc limit " + offset + "," + limit)
        rows = cursor.fetchall()
        if not rows:
            return None
        return rows[0][0]
    except Exception as e:
        # Report the cause so a database problem is not mistaken for an empty table.
        print("SQL查询失败: %s" % e)
        return None
    finally:
        # Always release the connection opened above.
        if conn is not None:
            try:
                conn.close()
            except Exception:
                pass


def _clusterIds(ids, maxGap=100000):
    # Group consecutive values whose distance stays within maxGap; only groups
    # with at least two values are kept, each de-duplicated and sorted ascending.
    clusters = []
    current = []
    for value in ids:
        if current and abs(value - current[-1]) > maxGap:
            if len(current) >= 2:
                clusters.append(sorted(set(current)))
            current = []
        # The value that opens a new group belongs to it; the original code
        # dropped it, which hid the gap between it and its successor.
        current.append(value)
    if len(current) >= 2:
        clusters.append(sorted(set(current)))
    return clusters


def getRangeId():
    '''
    List the company ids that are missing between already recorded ones.

    The ``tid`` values of ``jianshe_xinxi_bulu`` (descending, limited by the
    configured ``offset``/``limit``) are grouped into clusters: a jump of more
    than 100000 between neighbouring values starts a new cluster. Inside each
    cluster every integer lying strictly between two neighbouring recorded ids
    is reported.

    The process exits via ``exit(0)`` when the query returns nothing or no
    cluster with at least two ids exists.

    :return: list of missing ids; clusters appear in query order, ids within a
             cluster in ascending order
    '''
    sql = "select tid from jianshe_xinxi_bulu order by tid desc limit " + offset + "," + limit
    res = querySql(sql)
    if not res:
        print("数据库未查询到匹配范围数据~")
        exit(0)
    clusters = _clusterIds([int(r[0]) for r in res])
    if not clusters:
        print("数据库未查询到匹配范围数据~")
        exit(0)
    rangeId = []
    for cluster in clusters:
        for low, high in zip(cluster, cluster[1:]):
            # range() is empty for adjacent ids, so only real gaps contribute.
            rangeId.extend(range(low + 1, high))
    return rangeId


def addCompany(tid, name):
    '''
    Register a company in ``jianshe_xinxi_bulu`` with every progress flag at 0.

    NOTE(review): ``name`` is interpolated directly into the SQL text, so a
    name containing a quote makes the insert fail.

    :param tid: numeric company id
    :param name: company name
    :return: the result of ``execSql`` for the insert
    '''
    template = (
        "insert jianshe_xinxi_bulu(tid, qiye_ming, qiye_jiben, qiye_gongshang, qiye_lianxi, "
        "qiye_zizhi, qiye_renyuan, qiye_xiangmu, qiye_anxu, renyuan_jiben, renyuan_zizhi, "
        "renyuan_xiangmu, xiangmu_jiben, xiangmu_zhaobiao, xiangmu_shigongtu, xiangmu_hetong, "
        "xiangmu_shigongxuke, xiangmu_jungongyashou) "
        "values(%d, '%s', 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)"
    )
    statement = template % (int(tid), name)
    return execSql(statement)


def updateCompany(id, con):
    '''
    Set one progress flag of a company to 1 in ``jianshe_xinxi_bulu``.

    :param id: ``tid`` of the company row
    :param con: name of the flag column to set
    :return: the result of ``execSql`` for the update
    '''
    statement = "update jianshe_xinxi_bulu set {}=1 where tid = {}".format(str(con), str(id))
    return execSql(statement)


def main():
    '''
    Crawl every company id that is missing from the database.

    For each id from ``getRangeId`` the plain id ("00" followed by the number)
    is passed to ``encryptId`` together with the id obtained from
    ``getRandCompanyId``. The company's basic information is fetched first;
    certificates and staff follow only when that succeeded.
    '''
    divider = "================================"
    seedId = getRandCompanyId()
    for missingId in getRangeId():
        print(divider)
        plainId = "00" + str(missingId)
        QY_ID = encryptId(plainId, seedId)
        print("明文id:%s 密文id:%s ~" % (plainId, QY_ID))
        tid = int(decryptId(QY_ID))
        if getCompanyDetail(QY_ID, tid):
            # Qualification certificates of the company.
            getCaDetailList(QY_ID, tid)
            # Registered staff of the company.
            getRegStaffList(QY_ID, tid)
        print(divider)


if __name__ == '__main__':
    main()

代码中涉及接口响应的解密,以及每天企业 id、人员 id 的加解密算法(即代码中从 decrypter 模块导入的 decrypt、encryptId、decryptId);本文代码仅供学习交流,这部分实现就不放出来了。

近期比较忙,开发学习方面的分享更新得比较少;如有开发学习方面的问题,可以点击这里通过 QQ 与我探讨哦。

微信请扫描下方二维码

在这里插入图片描述

发布了51 篇原创文章 · 获赞 38 · 访问量 4万+

猜你喜欢

转载自blog.csdn.net/qq_41287993/article/details/104387506