import requests
from lxml import etree
import re
import json
from pyquery import PyQuery
import time
# Scraper for the tender announcements listed under
# http://www.hbbidcloud.cn/suizhou/jyxx/004002/ .
#
# The script walks the first five list pages, opens every announcement and
# pulls a handful of fields out of the announcement body with regular
# expressions.  All text parsing lives in pure functions (no network access)
# so that it can be exercised on plain strings; only ``fetch_document`` and
# ``iter_tender_records`` touch the network.

BASE_URL = 'http://www.hbbidcloud.cn'
FIRST_LIST_URL = BASE_URL + '/suizhou/jyxx/004002/about.html'
LIST_URL_TEMPLATE = BASE_URL + '/suizhou/jyxx/004002/{}.html'
LAST_LIST_PAGE = 5       # list pages crawled: about.html, 2.html ... 5.html
REQUEST_TIMEOUT = 30     # seconds; without a timeout a stalled server hangs the crawl

LIST_LINK_SELECTOR = '#main > div > div.ewb-info-bd > ul > li > div > a'
LIST_DATE_SELECTOR = '#main > div > div.ewb-info-bd > ul > li > span'
DETAIL_SELECTOR = 'body > div.ewb-container > div.ewb-article > div > div.ewb-article-info'

# Punctuation classes accept both the ASCII and the full-width form of each
# mark.  The full-width forms are written as escapes so they survive any
# re-encoding or normalisation of this source file.
COLON = '[:\uff1a]'
COMMA = '[,\uff0c]'
CLAUSE_BREAK = '[:\uff1a,\uff0c。.]'
CLAUSE_END = '[;\uff1b,\uff0c。]'

# Tried in order; the first pattern that matches wins.
DEADLINE_PATTERNS = (
    r'截止时间.*?(20\d\d\D{1,5}\d{1,2}\D{1,5}\d{1,2})',
    r'应于(20\d\d\D{1,5}\d{1,2}\D{1,5}\d{1,2})日.{0,10}前',
    r'至(20\d\d\D{1,5}\d{1,2}\D{1,5}\d{1,2})',
)

# The amount is whatever follows the last of these separators before the unit.
SCALE_SEPARATORS = '[:\uff1a,\uff0c;\uff1b]|投资|概算|约为|约'

# (unit, filler words removed from the amount, apply the 4 < len < 12 check)
# Order matters: '万元' contains '元', so it has to be tested first.
SCALE_UNITS = (
    ('万元', ('额为', '额'), True),
    ('元', (), True),
    ('亿', ('额',), False),
)

# Headings that introduce the bidder-qualification section, in priority order.
QUALIFICATION_MARKERS = (
    '人资质类别和等级' + COLON,
    '投标人资[质格][要条][求件]',
    '投标企业资[质格][要条][求件]' + COLON,
    '投标人须具[有备]',
    '投标人[条要][求件]' + COLON,
    '投标人的资格要求' + COLON,
    '申请人资格要求',
)

# Clauses naming a qualification grade, in priority order.
LEVEL_PATTERNS = (
    CLAUSE_BREAK + '(.*?[一二三]级)(.*?)' + CLAUSE_BREAK,
    '(.*?[一二三]级)(.*?)' + CLAUSE_BREAK,
    CLAUSE_BREAK + '(.*?[壹贰弎叁甲乙丙]级)(.*?)' + CLAUSE_BREAK,
)

LEADER_MARKERS = '总监理工程师|项目经理|项目负责人|项目总监|负责人'


def iter_list_urls():
    """Yield the list-page URLs: ``about.html`` first, then ``2.html`` .. ``5.html``."""
    yield FIRST_LIST_URL
    for page in range(2, LAST_LIST_PAGE + 1):
        yield LIST_URL_TEMPLATE.format(page)


def normalize_text(raw):
    """Return *raw* with every whitespace character removed.

    ``\\s`` covers Unicode whitespace for ``str`` patterns, so ordinary
    spaces, newlines, no-break spaces and ideographic spaces all go.
    """
    return re.sub(r'\s+', '', raw or '')


def first_match(pattern, text, default=''):
    """Return group 1 of the first match of *pattern* in *text*, else *default*.

    Replaces the ``re.findall(...)[0]`` idiom, which raises ``IndexError``
    as soon as an announcement lacks the expected label.
    """
    found = re.search(pattern, text)
    return found.group(1) if found else default


def parse_register_num(tender_name):
    """Extract the ``HBSU...`` registration number from an announcement title.

    Returns ``'HBSU'`` plus everything up to the next closing parenthesis
    (ASCII or full-width), or ``''`` when the title is empty/``None`` or
    carries no such number.
    """
    if not tender_name or 'HBSU' not in tender_name:
        return ''
    tail = tender_name.split('HBSU')[1]
    return 'HBSU' + re.split('[)\uff09]', tail, 1)[0]


def extract_deadline(text):
    """Return the bid deadline as ``'<year..day>日'``, or ``''`` if none is found."""
    for pattern in DEADLINE_PATTERNS:
        found = re.search(pattern, text)
        if found:
            # The patterns stop right after the day number, so re-append '日'.
            return found.group(1) + '日'
    return ''


def extract_tender_scale(text):
    """Return the amount that precedes the first '万元', '元' or '亿' in *text*.

    The amount is the fragment between the last separator (colon, comma,
    semicolon, '投资', '概算', '约为', '约') and the unit.  For '万元' and '元'
    the fragment must be 5-11 characters long, otherwise ``''`` is returned.
    Only the first unit present in *text* (in the order above) is considered.
    """
    for unit, fillers, length_checked in SCALE_UNITS:
        if unit not in text:
            continue
        head = text.split(unit, 1)[0]
        amount = re.split(SCALE_SEPARATORS, head)[-1]
        if length_checked and not 4 < len(amount) < 12:
            return ''
        for filler in fillers:
            amount = amount.replace(filler, '')
        return amount + unit
    return ''


def extract_qualification_text(text):
    """Return the text following the first qualification heading, or ``''``.

    Each heading is matched with the very pattern used to split on it, so a
    heading that is present but lacks its colon falls through to the next
    candidate instead of raising ``IndexError``.
    """
    for marker in QUALIFICATION_MARKERS:
        parts = re.split(marker, text, 1)
        if len(parts) > 1:
            return parts[1]
    return ''


def summarize_qualification(qualified):
    """Condense the qualification section to its grade clause.

    Prefers the first clause naming a grade (e.g. '二级', '甲级'); without one,
    the text up to the first full stop or comma is used.  A result longer than
    255 characters is cut at its first comma.  The patterns are applied to
    *qualified* itself, so a grade mentioned elsewhere on the page cannot
    trigger a lookup that then finds nothing.
    """
    summary = None
    for pattern in LEVEL_PATTERNS:
        found = re.search(pattern, qualified)
        if found:
            summary = found.group(1) + found.group(2)
            break
    if summary is None:
        summary = re.split('[。,\uff0c]', qualified)[0]
    if len(summary) > 255:
        summary = re.split(COMMA, summary)[0]
    # NOTE(review): the digit '3' has always been stripped here; it looks like
    # a stray footnote/list marker on the site -- confirm before changing.
    return summary.replace('3', '')


def extract_leader_qualification(text):
    """Return the clause that follows the first project-leader title, or ``''``.

    The result is also ``''`` when the text after the title contains
    '存在控股' anywhere.  Colons, '或' and '、' are removed from the clause.
    """
    parts = re.split(LEADER_MARKERS, text, 1)
    if len(parts) < 2 or '存在控股' in parts[1]:
        return ''
    clause = re.split(CLAUSE_END, parts[1])[0]
    return re.sub('[:\uff1a或、]', '', clause)


def parse_detail_text(text):
    """Parse a whitespace-free announcement body into a dict of fields.

    Keys: ``tenderee``, ``tel``, ``agency``, ``deadline``, ``tender_scale``,
    ``bidder_qualification``, ``leader_qualification``.  A field that cannot
    be located is ``''``.
    """
    return {
        'tenderee': first_match('招标人' + COLON + '(.*?)代理机构', text),
        # Text between the first "电话:" and the next "电话".
        'tel': first_match('电话' + COLON + '(.*?)电话', text),
        'agency': first_match('代理机构' + COLON + '(.*?)地址', text),
        'deadline': extract_deadline(text),
        'tender_scale': extract_tender_scale(text),
        'bidder_qualification': summarize_qualification(
            extract_qualification_text(text)),
        'leader_qualification': extract_leader_qualification(text),
    }


def fetch_document(url):
    """Download *url* and return it as a PyQuery document.

    Raises ``requests.RequestException`` on connection errors, timeouts and
    non-2xx responses.
    """
    response = requests.get(url, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    return PyQuery(response.text)


def iter_tender_records(list_url):
    """Yield one record dict per announcement linked from *list_url*.

    Each record holds ``original_url``, ``tender_name``, ``get_date`` and
    ``register_num`` plus every key of ``parse_detail_text``.  An announcement
    whose detail page cannot be fetched is reported and skipped.  A failure
    to fetch the list page itself propagates as ``requests.RequestException``.
    """
    listing = fetch_document(list_url)
    links = listing(LIST_LINK_SELECTOR).items()
    # NOTE(review): links and dates are paired purely by position; this
    # assumes every <li> carries exactly one of each -- confirm on the site.
    dates = listing(LIST_DATE_SELECTOR).items()
    for link, date_cell in zip(links, dates):
        href = link.attr('href')
        if not href:
            continue
        detail_url = '{}{}'.format(BASE_URL, href)
        try:
            detail = fetch_document(detail_url)
        except requests.RequestException as exc:
            print('skipping {}: {}'.format(detail_url, exc))
            continue
        tender_name = link.attr('title') or ''
        record = {
            'original_url': detail_url,
            'tender_name': tender_name,
            'get_date': date_cell.text(),
            'register_num': parse_register_num(tender_name),
        }
        body = normalize_text(detail(DETAIL_SELECTOR).text())
        record.update(parse_detail_text(body))
        yield record


def main():
    """Crawl every list page and print the extracted fields of each record."""
    for list_url in iter_list_urls():
        try:
            for record in iter_tender_records(list_url):
                print('!!!!!!', record['tender_name'])
                print(record['tenderee'])
                print(record['tel'])
                print(record['agency'])
                print(record['deadline'])
                print(record['tender_scale'])
                print(record['bidder_qualification'])
                print(record['leader_qualification'])
        except requests.RequestException as exc:
            print('skipping list page {}: {}'.format(list_url, exc))


if __name__ == '__main__':
    main()