Python爬虫学习篇-爬取建筑市场监管公共服务平台
代码仅供学习交流,请勿用于非法用途,如有侵权,请点击这里联系作者删除
代码仅供学习交流,请勿用于非法用途,如有侵权,请点击这里联系作者删除
代码仅供学习交流,请勿用于非法用途,如有侵权,请点击这里联系作者删除
代码中涉及到响应解密,还有每天企业id、人员id的解密算法,这里代码仅仅提供学习交流,就不放出来了。
近期比较忙,更新开发学习分享比较少,有开发学习问题可以 QQ点击这里 与我探讨哦
一、代码实现
# -*- coding:utf-8 -*-
import requests
import re
import hashlib
from decrypter import decrypt, encryptId, decryptId
import configparser
import MySQLdb
import time
import random
# Shared parser for config.ini; getConf() and updateToken() read from it.
cf = configparser.ConfigParser()
try:
    # ConfigParser.read() does not raise when the file is absent: it skips the
    # file and returns the list of files it actually parsed. An empty list is
    # therefore the only signal that config.ini is missing.
    if not cf.read("config.ini"):
        print("程序目录下不存在config.ini配置文件~")
        exit(0)
except (configparser.Error, UnicodeDecodeError) as e:
    # The file exists but could not be decoded or parsed.
    print("config.ini配置文件读取失败:%s ~" % e)
    exit(0)
def getConf(sec, key):
    '''
    Return the string stored under option ``key`` of section ``sec`` in the
    module-level parser ``cf``.

    When the section or option is missing, or the value cannot be
    interpolated, the missing pair is printed and the process exits with
    status 0; callers never receive a fallback value.

    :param sec: section name
    :param key: option name inside that section
    :return: the option value as a string
    '''
    try:
        return cf.get(sec, key)
    except configparser.Error:
        # NoSectionError, NoOptionError and the interpolation errors all derive
        # from configparser.Error. Anything else is a programming error and
        # should surface with its traceback instead of being reported as a
        # missing setting.
        print("未得到以下配置:" + sec + " - " + key)
        exit(0)
# -------------------------------------------------
# Paging window; both values are concatenated as text into the
# "limit <offset>,<limit>" clause of the id queries, so they stay strings.
offset = getConf("app-sys", "offset")
limit = getConf("app-sys", "limit")

# MySQL credentials and address (user, password, schema name, host, port).
mysql_user, mysql_password, mysql_database, mysql_host, mysql_port = [
    getConf("Mysql-Database", option)
    for option in ("user", "password", "database", "host", "port")
]

# Access token sent with every API request.
token = getConf("web-param", "token")

# Bounds of the random pause between requests, in milliseconds
# (getSleepTime divides the drawn value by 1000).
min_sleep, max_sleep = [
    int(getConf("app-sys", option))
    for option in ("min_sleep", "max_sleep")
]

# Per-request timeout handed to requests.get, and the number of attempts
# getHtml makes for one URL.
timeout = 20
retry = 3

# Headers attached to every request. Note that "timeout" here is sent as a
# plain HTTP header; it is not the client-side timeout above.
headers = {
    "Referer": "http://jzsc.mohurd.gov.cn/data/company",
    "timeout": "30000",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3314.0 Safari/537.36 SE 2.X MetaSr 1.0",
    "accessToken": token
}
def getSleepTime(low=None, high=None):
    '''
    Draw a random pause length, in seconds, for throttling requests.

    :param low: lower bound in milliseconds; defaults to the configured
        ``min_sleep``
    :param high: upper bound in milliseconds; defaults to the configured
        ``max_sleep``
    :return: a non-negative float number of seconds
    '''
    if low is None:
        low = min_sleep
    if high is None:
        high = max_sleep
    # random.randint raises ValueError when the lower bound exceeds the upper
    # one, so a config with the two values swapped is tolerated by ordering
    # them first.
    low, high = sorted((int(low), int(high)))
    # time.sleep rejects negative durations, so clamp at zero.
    return max(0.0, random.randint(low, high) / 1000.0)
def execSql(sql, params=None):
    '''
    Execute one write statement on a fresh connection and commit it.

    :param sql: statement text
    :param params: optional values bound by the driver to the placeholders
        in ``sql``; when omitted, ``sql`` is executed exactly as given
    :return: True when the statement ran and was committed, False otherwise
    '''
    conn = None
    try:
        conn = MySQLdb.connect(user=mysql_user, password=mysql_password, host=mysql_host, port=int(mysql_port),
                               database=mysql_database, charset='utf8')
        cursor = conn.cursor()
        cursor.execute(sql, params)
        conn.commit()
        return True
    except Exception as e:
        # Callers rely on the boolean result, so every failure is reported
        # and turned into False rather than propagated.
        print("SQL执行失败:%s ~" % e)
        return False
    finally:
        # A connection is opened per call; without this close each call
        # would leave one open.
        if conn is not None:
            try:
                conn.close()
            except Exception:
                # Nothing useful can be done if closing itself fails.
                pass
def querySql(sql, params=None):
    '''
    Execute one read statement on a fresh connection and return all rows.

    :param sql: statement text
    :param params: optional values bound by the driver to the placeholders
        in ``sql``; when omitted, ``sql`` is executed exactly as given
    :return: the rows from ``cursor.fetchall()``, or None when the query
        failed
    '''
    conn = None
    try:
        conn = MySQLdb.connect(user=mysql_user, password=mysql_password, host=mysql_host, port=int(mysql_port),
                               database=mysql_database, charset='utf8')
        cursor = conn.cursor()
        cursor.execute(sql, params)
        return cursor.fetchall()
    except Exception as e:
        # Callers treat None as "no data", so failures are reported and
        # mapped to None rather than propagated.
        print("SQL查询失败:%s ~" % e)
        return None
    finally:
        # A connection is opened per call; without this close each call
        # would leave one open.
        if conn is not None:
            try:
                conn.close()
            except Exception:
                # Nothing useful can be done if closing itself fails.
                pass
def getHash(s):
    '''
    Return the hexadecimal MD5 digest of ``s`` encoded as UTF-8.

    :param s: text to fingerprint
    :return: 32-character lowercase hex string
    '''
    return hashlib.md5(s.encode("utf-8")).hexdigest()
def updateToken(url):
    '''
    Block until config.ini provides an access token that the server accepts.

    Every round re-reads config.ini, stores the token in the module-level
    ``token`` and ``headers``, waits five seconds and requests ``url`` again.
    The function returns once the decrypted response carries code 200; it
    never gives up on its own.

    :param url: URL used to probe whether the current token works
    :return: None
    '''
    global token
    global headers
    while True:
        try:
            # read() returns the list of files it parsed and does not raise
            # for a missing file, so the result has to be checked.
            if not cf.read("config.ini"):
                print("程序目录下不存在config.ini配置文件~")
                exit(0)
        except (configparser.Error, UnicodeDecodeError) as e:
            print("config.ini配置文件读取失败:%s ~" % e)
            exit(0)
        token = getConf("web-param", "token")
        print("token失效,请更换~\n目前token: %s" % token)
        time.sleep(5)
        # Same header set as before with only the token replaced; rebinding
        # the global makes later getHtml calls pick it up.
        headers = dict(headers, accessToken=token)
        try:
            resp = requests.get(url, headers=headers, timeout=timeout)
            content = decrypt(resp.content.decode("utf-8"))
            if str(content['code']) == "200":
                print("token更新成功~")
                return
        except Exception as e:
            # Network and decryption errors are expected while waiting for a
            # new token; report them and try again on the next round.
            print("token校验请求失败:%s ~" % e)
def getHtml(url):
    '''
    Request ``url`` and return the decrypted response.

    Up to ``retry`` attempts are made. A response whose code is 408 triggers
    updateToken() and uses up one attempt. After a usable response the
    function pauses for getSleepTime() seconds before returning it.

    :param url: API URL to request
    :return: the value produced by decrypt(), or None when every attempt
        failed
    '''
    for attempt in range(retry):
        try:
            resp = requests.get(url, headers=headers, timeout=timeout)
            content = decrypt(resp.content.decode("utf-8"))
            if str(content['code']) == "408":
                updateToken(url)
                continue
        except Exception as e:
            # decrypt() is defined outside this file and may fail in ways
            # not visible here, so any error counts as a failed attempt;
            # it is reported instead of being dropped silently.
            print("请求失败(第%d次):%s ~" % (attempt + 1, e))
            continue
        # The pause sits outside the try above so that a bad sleep setting
        # cannot discard a response that was fetched successfully.
        try:
            time.sleep(getSleepTime())
        except ValueError as e:
            print("请求间隔配置无效,已跳过等待:%s ~" % e)
        return content
    return None
def getCompanyList(page):
    '''
    Fetch one page of the company listing.

    :param page: page index placed in the ``pg`` query parameter
    :return: the value at ``['data']['list']`` of the response, or None when
        the request failed or the response lacks that path
    '''
    endpoint = "http://jzsc.mohurd.gov.cn/api/webApi/dataservice/query/comp/list?pg="
    payload = getHtml(endpoint + str(page) + "&pgsz=15&total=450")
    if not payload:
        return None
    try:
        companies = payload['data']['list']
    except Exception:
        companies = None
    return companies
def getAreas(s):
    '''
    Split a "<left>-<right>" string at its last hyphen.

    :param s: text such as a region name with two hyphen-separated parts
    :return: a ``(left, right)`` tuple for the first line that contains a
        hyphen, or None when there is none or ``s`` is not a string
    '''
    pattern = re.compile("(.*)-(.*)")
    try:
        found = pattern.search(s)
    except Exception:
        return None
    if found is None:
        return None
    return found.groups()
def tsToDate(ts):
    '''
    Convert an epoch timestamp in milliseconds to a local-time date string.

    The value is converted arithmetically, so ints, floats and digit strings
    are all accepted. Inserting a decimal point into the text form, as was
    done before, fails for float input because it yields two decimal points.

    :param ts: milliseconds since the epoch (int, float or string of digits)
    :return: the date formatted as ``YYYY-MM-DD`` in the local time zone
    :raises ValueError: when ``ts`` is a string that is not an integer
    :raises TypeError: when ``ts`` is not a number or string (e.g. None)
    '''
    seconds = int(ts) / 1000.0
    return time.strftime("%Y-%m-%d", time.localtime(seconds))
def getCompanyDetail(id, tid):
    '''
    Fetch one company's basic record and store it in jianshe_qiyes.

    The company is first registered through addCompany(); only when that
    succeeds is the detail row inserted and the "qiye_jiben" flag set.

    :param id: company id in the encrypted form the compDetail endpoint
        expects
    :param tid: numeric company id used as the key of the local rows
    :return: True when the jianshe_qiyes row was inserted, otherwise False
    '''
    url = "http://jzsc.mohurd.gov.cn/api/webApi/dataservice/query/comp/compDetail?compId=" + id
    html = getHtml(url)
    print("获取企业基本信息~")
    try:
        compMap = html['data']['compMap']
        name = compMap['QY_NAME']
    except (LookupError, TypeError):
        # Covers a failed request (None) as well as a response without a
        # company record or without a company name.
        print("不存在该企业~")
        return False

    def field(key):
        # A missing key and a JSON null both become "", so the text 'None'
        # is never written into the table.
        try:
            value = compMap[key]
        except (LookupError, TypeError):
            return ""
        return "" if value is None else value

    try:
        print("企业名称:%s " % name)
        if not addCompany(tid, name):
            return False
        # getAreas returns None when the region has no "-" separator; both
        # region columns are then left blank.
        region = getAreas(field('QY_REGION_NAME')) or ("", "")
        company = [
            tid,
            field('QY_ORG_CODE'),
            field('QY_FR_NAME'),
            region[0],
            region[1],
            field('QY_ADDR'),
            field('QY_NAME'),
            field('QY_GSZCLX_NAME'),
        ]
        print("企业基本信息:%s ~" % str(company))
        # NOTE(review): the values are interpolated into the SQL text, so a
        # quote character in the scraped data breaks the statement; driver
        # side parameter binding would avoid that.
        sql = "insert jianshe_qiyes(tid, hao, ren, diqu, city, qiye_dizhi, qiye_ming, qiye_leixing) values(%d, '%s', '%s', '%s', '%s', '%s', '%s', '%s')" % (int(company[0]), company[1], company[2], company[3], company[4], company[5], company[6], company[7])
        if execSql(sql):
            updateCompany(tid, "qiye_jiben")
            return True
        return False
    except Exception as e:
        # Keep the crawl going, but say what went wrong instead of reporting
        # the company as nonexistent.
        print("企业基本信息处理失败:%s ~" % e)
        return False
def getCaDetailList(id, tid):
    '''
    Fetch the company's qualification certificates and store each one in
    jianshe_zizhis.

    Every row of the response is processed; the loop no longer stops after
    the first row that was stored. Only page 0 (page size 15) is requested.

    :param id: company id in the encrypted form the endpoint expects
    :param tid: numeric company id used as the key of the local rows
    :return: True when at least one certificate was stored, False when none
        was, None when the response carries no certificate list
    '''
    url = "http://jzsc.mohurd.gov.cn/api/webApi/dataservice/query/comp/caDetailList?qyId=" + id + "&pg=0&pgsz=15"
    html = getHtml(url)
    print("获取企业资质证证书信息~")
    try:
        rows = list(html['data']['pageList']['list'])
    except (LookupError, TypeError):
        # Failed request, missing path, or a null list.
        print("无企业资质信息~")
        return

    def field(row, key):
        # A missing key and a JSON null both become "", so the text 'None'
        # is never written into the table.
        try:
            value = row[key]
        except (LookupError, TypeError):
            return ""
        return "" if value is None else value

    def dateField(row, key):
        # Absent or unparseable timestamps are stored as "".
        try:
            return tsToDate(row[key])
        except (LookupError, TypeError, ValueError, OverflowError, OSError):
            return ""

    saved = False
    for row in rows:
        try:
            try:
                rowHash = getHash(str(tid) + row['APT_NAME'])
            except (LookupError, TypeError):
                rowHash = ""
            ca = [
                tid,
                rowHash,
                field(row, 'APT_CERTNO'),
                field(row, 'APT_NAME'),
                field(row, 'APT_TYPE_NAME'),
                dateField(row, 'APT_GET_DATE'),
                dateField(row, 'APT_EDATE'),
                field(row, 'APT_GRANT_UNIT'),
            ]
            # NOTE(review): the values are interpolated into the SQL text, so
            # a quote character in the scraped data breaks the statement.
            sql = "insert jianshe_zizhis(tid, `hash`, hao, ming, leibie, riqi, youxiaoqi, jiguan) values(%d, '%s', '%s', '%s', '%s', '%s', '%s', '%s')" % (int(ca[0]), ca[1], ca[2], ca[3], ca[4], ca[5], ca[6], ca[7])
            if execSql(sql):
                updateCompany(tid, "qiye_zizhi")
                saved = True
        except Exception as e:
            # One bad row must not stop the remaining certificates.
            print("企业资质信息处理失败:%s ~" % e)
    return saved
def getRegStaffList(id, tid):
    '''
    Fetch the company's registered staff, store each person in
    jianshe_qiyes_renyuans and then fetch that person's own records.

    Rows are handled independently: a row whose RY_ID cannot be decrypted,
    or that fails for any other reason, is reported and skipped instead of
    ending the whole loop. Only page 0 (page size 15) is requested.

    :param id: company id in the encrypted form the endpoint expects
    :param tid: numeric company id used as the key of the local rows
    :return: None
    '''
    url = "http://jzsc.mohurd.gov.cn/api/webApi/dataservice/query/comp/regStaffList?qyId=" + str(id) + "&pg=0&pgsz=15"
    html = getHtml(url)
    print("获取企业人员新信息~")
    if not html:
        print("无企业人员信息~")
        return
    try:
        data = html['data']['pageList']['list']
    except (LookupError, TypeError):
        return
    try:
        rows = list(data)
    except TypeError:
        # The list field is present but null / not iterable.
        print("无企业人员信息~")
        return

    def field(row, key):
        # A missing key and a JSON null both become "", so the text 'None'
        # is never written into the table.
        try:
            value = row[key]
        except (LookupError, TypeError):
            return ""
        return "" if value is None else value

    for row in rows:
        try:
            staffId = row['RY_ID']
            # The numeric person id is required by the %d placeholder below
            # and by the per-person tables, so a row without it is skipped.
            rid = int(decryptId(staffId))
            reg = [
                tid,
                rid,
                field(row, 'REG_SEAL_CODE'),
                field(row, 'RY_NAME'),
                field(row, 'IDCARD'),
                field(row, 'REG_TYPE_NAME'),
                field(row, 'REG_PROF_NAME'),
            ]
            print("企业人员信息:%s ~" % str(reg))
            # NOTE(review): the values are interpolated into the SQL text, so
            # a quote character in the scraped data breaks the statement.
            sql = "insert jianshe_qiyes_renyuans(tid, rid, hao, xingming, shenfengzheng, leibie, zhuanye) values(%d, %d, '%s', '%s', '%s', '%s', '%s')" % (reg[0], reg[1], reg[2], reg[3], reg[4], reg[5], reg[6])
            if execSql(sql):
                updateCompany(tid, "qiye_renyuan")
                # Basic personal record
                getStaff(staffId, rid, tid)
                # Professional registration record
                getRegCert(staffId, rid, tid)
        except Exception as e:
            print("企业人员信息处理失败,已跳过:%s ~" % e)
    return
def staffPerformanceListSys(id, rid, tid):
    '''
    Fetch a person's project records and store each one in
    jianshe_renyuans_yejis.

    Every row of the response is processed; the loop no longer stops after
    the first row that was stored.

    :param id: person id in the form the endpoint expects
    :param rid: numeric person id used as the key of the stored rows
    :param tid: accepted for symmetry with the sibling functions; not used
    :return: True when at least one record was stored, False when none was,
        None when the response carries no record list
    :raises ValueError: when ``rid`` cannot be converted to int
    '''
    url = "http://jzsc.mohurd.gov.cn/api/webApi/dataservice/query/staff/staffPerformanceListSys?staffId=" + str(id) + "&pg=0"
    resp = getHtml(url)
    if not resp:
        return False
    try:
        data = resp['data']['pageList']['list']
    except (LookupError, TypeError):
        return
    if not data:
        return False

    def field(row, key):
        # A missing key and a JSON null both become "", so the text 'None'
        # is never written into the table.
        try:
            value = row[key]
        except (LookupError, TypeError):
            return ""
        return "" if value is None else value

    ridNumber = int(rid)
    saved = False
    for row in data:
        try:
            location = row['PROVINCE'] + row['CITY']
        except (LookupError, TypeError):
            # Either part missing or null leaves the location blank.
            location = ""
        per = [
            ridNumber,
            field(row, 'PRJNUM'),
            field(row, 'PRJNAME'),
            location,
            field(row, 'PRJTYPENUM'),
            field(row, 'BUILDCORPNAME'),
        ]
        # NOTE(review): the values are interpolated into the SQL text, so a
        # quote character in the scraped data breaks the statement.
        sql = "insert jianshe_renyuans_yejis(rid, xiangmu_bianhao, xiangmu_mingcheng, xiangmu_shudi, xiangmu_leibie, jianshe_danwei) values(%d, '%s', '%s', '%s', '%s', '%s')" % (per[0], str(per[1]), str(per[2]), str(per[3]), str(per[4]), str(per[5]))
        if execSql(sql):
            saved = True
    return saved
def getRegCert(staffId, rid, tid):
    '''
    Fetch a person's professional registration and store it in
    jianshe_renyuans_zhiyes.

    Only the first entry of ``regCertList`` is stored.

    :param staffId: person id in the form the staffDetail endpoint expects
    :param rid: numeric person id used as the key of the stored row
    :param tid: numeric company id whose "renyuan_zizhi" flag is set on
        success
    :return: True when the row was stored, False when the request or the
        insert failed, None when the response has no registration entry
    :raises ValueError: when ``rid`` cannot be converted to int
    '''
    url = "http://jzsc.mohurd.gov.cn/api/webApi/dataservice/query/staff/staffDetail?staffId=" + str(staffId)
    html = getHtml(url)
    print("获取人员职业注册信息~")
    if html:
        try:
            regCert = html['data']['regCertList'][0]
        except (LookupError, TypeError):
            return

        def field(key):
            # A missing key and a JSON null both become "", so the text
            # 'None' is never written into the table.
            try:
                value = regCert[key]
            except (LookupError, TypeError):
                return ""
            return "" if value is None else value

        try:
            expiry = tsToDate(regCert['REG_EDATE'])
        except (LookupError, TypeError, ValueError, OverflowError, OSError):
            # Absent or unparseable timestamps are stored as "".
            expiry = ""
        staff = [
            rid,
            field('REG_TYPE_NAME'),
            field('REG_PROF_NAME'),
            field('QY_NAME'),
            field('REG_CERTNO'),
            field('CERT_REG_NO'),
            expiry,
        ]
        print("人员职业注册信息:%s ~" % str(staff))
        # NOTE(review): the values are interpolated into the SQL text, so a
        # quote character in the scraped data breaks the statement.
        sql = "insert jianshe_renyuans_zhiyes(rid, zhuce_leibie, zhuce_zhuanye, zhuce_danwei, zhengshu_bianhao, zhiye_yinzhanghao, youxiaoqi) values(%d, '%s', '%s', '%s', '%s', '%s', '%s')" % (int(staff[0]), str(staff[1]), str(staff[2]), str(staff[3]), str(staff[4]), str(staff[5]), str(staff[6]))
        if execSql(sql):
            updateCompany(tid, "renyuan_zizhi")
            return True
    print("无人员职业注册信息~")
    return False
def getStaff(staffId, rid, tid):
    '''
    Fetch a person's basic record and store it in jianshe_renyuans_basics.

    :param staffId: person id in the form the staffDetail endpoint expects
    :param rid: numeric person id used as the key of the stored row
    :param tid: numeric company id whose "renyuan_jiben" flag is set on
        success
    :return: True when the row was stored, False when the request or the
        insert failed, None when the response has no ``staffMap`` entry
    :raises ValueError: when ``rid`` cannot be converted to int
    '''
    url = "http://jzsc.mohurd.gov.cn/api/webApi/dataservice/query/staff/staffDetail?staffId=" + str(staffId)
    html = getHtml(url)
    print("获取人员基本信息~")
    if html:
        try:
            staffMap = html['data']['staffMap']
        except (LookupError, TypeError):
            return

        def field(key):
            # A missing key and a JSON null both become "", so the text
            # 'None' is never written into the table.
            try:
                value = staffMap[key]
            except (LookupError, TypeError):
                return ""
            return "" if value is None else value

        staff = [
            rid,
            field('RY_NAME'),
            field('RY_SEX_NAME'),
            field('RY_CARDTYPE_NAME'),
            field('IDCARD'),
        ]
        print("人员基本信息: %s ~" % str(staff))
        # NOTE(review): the values are interpolated into the SQL text, so a
        # quote character in the scraped data breaks the statement.
        sql = "insert jianshe_renyuans_basics(rid, xingming, xingbie, zhengjian_leixing, zhengjian_haoma) values(%d, '%s', '%s', '%s', '%s')" % (int(staff[0]), staff[1], staff[2], staff[3], staff[4])
        if execSql(sql):
            updateCompany(tid, "renyuan_jiben")
            return True
    print("无人员基本信息~")
    return False
def getRandCompanyId():
    '''
    Return the ``QY_ID`` of the first company on listing page 0.

    :return: that id, or None when the listing is unavailable or empty
    '''
    firstPage = getCompanyList(0)
    try:
        firstCompany = firstPage[0]
        companyId = firstCompany['QY_ID']
    except Exception:
        return None
    return companyId
def qyueryMaxId():
    '''
    Return the first tid of jianshe_qiyes, ordered by tid descending, inside
    the configured ``offset``/``limit`` window.

    :return: that tid, or None when the query fails or yields no row
    '''
    conn = None
    try:
        conn = MySQLdb.connect(user=mysql_user, password=mysql_password, database=mysql_database, host=mysql_host, port=int(mysql_port),
                               charset='utf8')
        cursor = conn.cursor()
        cursor.execute("select tid from jianshe_qiyes order by tid desc limit " + offset + "," + limit)
        rows = cursor.fetchall()
        return rows[0][0] if rows else None
    except Exception as e:
        # Callers get None for any failure; the cause is reported rather
        # than dropped.
        print("查询tid失败:%s ~" % e)
        return None
    finally:
        # The connection is opened per call and must be released here.
        if conn is not None:
            try:
                conn.close()
            except Exception:
                # Nothing useful can be done if closing itself fails.
                pass
def _groupNearbyIds(ids, maxGap=100000):
    '''Split ``ids`` (kept in the given order) into runs in which neighbouring values differ by at most ``maxGap``.'''
    runs = []
    current = []
    for value in ids:
        if current and abs(value - current[-1]) > maxGap:
            runs.append(current)
            current = []
        # The value that opens a new run belongs to that run; it used to be
        # discarded when the split was detected.
        current.append(value)
    if current:
        runs.append(current)
    return runs


def _gapIds(sortedIds):
    '''Return every integer lying strictly between consecutive members of the ascending list ``sortedIds``.'''
    missing = []
    for low, high in zip(sortedIds, sortedIds[1:]):
        missing.extend(range(low + 1, high))
    return missing


def getRangeId():
    '''
    Compute the ids that are absent from jianshe_xinxi_bulu but lie between
    stored ids that are close to each other.

    The stored tids inside the configured ``offset``/``limit`` window are
    split into runs wherever two neighbours differ by more than 100000.
    Runs with fewer than two members are ignored. For each remaining run,
    every integer between consecutive stored ids is returned.

    Prints a message and exits with status 0 when the query returns nothing
    or no run has at least two members.

    :return: list of missing ids, run by run, ascending inside each run
    '''
    sql = "select tid from jianshe_xinxi_bulu order by tid desc limit " + offset + "," + limit
    res = querySql(sql)
    if not res:
        print("数据库未查询到匹配范围数据~")
        exit(0)
    storedIds = [int(row[0]) for row in res]
    # The length check is applied before de-duplication.
    dateRangeList = [sorted(set(run)) for run in _groupNearbyIds(storedIds) if len(run) >= 2]
    if not dateRangeList:
        print("数据库未查询到匹配范围数据~")
        exit(0)
    rangeId = []
    for dateRange in dateRangeList:
        rangeId.extend(_gapIds(dateRange))
    return rangeId
def addCompany(tid, name):
    '''
    Insert a row for the company into jianshe_xinxi_bulu with all sixteen
    flag columns set to 0.

    NOTE(review): ``name`` is interpolated into the SQL text, so a quote
    character in it makes the statement fail.

    :param tid: numeric company id (converted with int())
    :param name: company name
    :return: the result of execSql for the insert
    '''
    template = "insert jianshe_xinxi_bulu(tid, qiye_ming, qiye_jiben, qiye_gongshang, qiye_lianxi, qiye_zizhi, qiye_renyuan, qiye_xiangmu, qiye_anxu, renyuan_jiben, renyuan_zizhi, renyuan_xiangmu, xiangmu_jiben, xiangmu_zhaobiao, xiangmu_shigongtu, xiangmu_hetong, xiangmu_shigongxuke, xiangmu_jungongyashou) values(%d, '%s', 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)"
    statement = template % (int(tid), name)
    return execSql(statement)
def updateCompany(id, con):
    '''
    Set column ``con`` to 1 on the jianshe_xinxi_bulu row whose tid is ``id``.

    :param id: tid of the row to update
    :param con: name of the flag column
    :return: the result of execSql for the update
    '''
    pieces = ["update jianshe_xinxi_bulu set ", str(con), "=1 where tid = ", str(id)]
    return execSql("".join(pieces))
def main():
    '''
    Crawl every id returned by getRangeId().

    Each id is prefixed with "00" and encrypted with the id of a company
    taken from the live listing. The company's basic record is fetched
    first; its certificates and staff are fetched only when that record was
    stored.
    '''
    seedCompanyId = getRandCompanyId()
    separator = "================================"
    for plainId in getRangeId():
        print(separator)
        plainText = "00" + str(plainId)
        QY_ID = encryptId(plainText, seedCompanyId)
        print("明文id:%s 密文id:%s ~" % (plainText, QY_ID))
        tid = int(decryptId(QY_ID))
        if getCompanyDetail(QY_ID, tid):
            # Company qualification certificates
            getCaDetailList(QY_ID, tid)
            # Company staff
            getRegStaffList(QY_ID, tid)
        print(separator)


if __name__ == '__main__':
    main()