爬取天眼查的第一步:获取公司 id 的代码思路。

思路就是先抓取网页源码,再用正则表达式提取所需内容。没什么技术含量,爬取的也只是最基础的信息。

# Accumulates the first data-id match extracted from the search page.
result2 = []
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import NoSuchWindowException
from selenium.common.exceptions import TimeoutException
from bs4 import BeautifulSoup
from bs4.element import Tag
import time
from urllib.parse import quote
import csv
import random
import re
# Search keyword sent to the site; empty placeholder, set it before running.
company = ''
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities

# Start from PhantomJS's default capabilities and override the user agent so
# the headless browser identifies itself as desktop Firefox.
dcap = dict(DesiredCapabilities.PHANTOMJS)
dcap["phantomjs.page.settings.userAgent"] = (
    "Mozilla/5.0 (Windows NT 10.0; WIN64; X64; rv:63.0) Gecko/20100101 Firefox/63.0"
)
# NOTE(review): webdriver.PhantomJS is only available in Selenium 3.x (it was
# dropped in Selenium 4) -- confirm the installed version. The executable path
# is machine-specific.
driver = webdriver.PhantomJS(
    executable_path='D:/anaconda3/Scripts/phantomjs.exe',
    desired_capabilities=dcap,
)

# Build the URL that the webdriver will visit.
def full_url(company):
    """Return the Tianyancha company-search URL for *company*.

    The keyword is percent-encoded with ``urllib.parse.quote`` so that
    non-ASCII names and characters such as ``&`` or spaces cannot break the
    query string.

    Args:
        company: Search keyword (company name) as a ``str``.

    Returns:
        The full search URL as a ``str``.
    """
    url_head = 'http://www.tianyancha.com/search?key='
    url_full = url_head + quote(company) + '&searchType=company'
    return url_full
url = full_url(company)

try:
    driver.get(url)
    # Fixed 5-second wait so the dynamic page can finish rendering; tune it to
    # the actual load time of the page.
    time.sleep(5)

    # Grab the rendered page source as UTF-8 encoded bytes.
    content = driver.page_source.encode('utf-8')
finally:
    # quit() ends the whole browser session even when loading fails, so no
    # headless browser process is left behind.
    driver.quit()
print(content)
# Parse the fetched page source held in `content`.
soup = BeautifulSoup(content, 'lxml')

# select() returns a list-like collection of matches, so the container element
# itself is k1[0]. k1 must be assigned before anything below reads it.
k1 = soup.select('#web-content')
print(type(k1))

# HTML text of the container; empty when the page has no #web-content element
# (for example when an error or verification page was returned instead).
k2 = str(k1[0]) if k1 else ''
print(type(k2))

# Individual search-result cards inside the container.
# NOTE(review): the class value ends with a space; verify it still matches the
# live markup, otherwise `ll` stays empty.
ll = k1[0].find_all(name='div', attrs={'class': 'search-result-single '}) if k1 else []
print(ll)

# HTML text of the first result card, or '' when there is none.
k = str(ll[0]) if ll else ''
print(type(k))

# Matches the data-id attribute up to its digits; each match keeps the
# leading `data-id="` text because the pattern has no capture group.
pattern = re.compile(r'data-id="[0-9]*')

# Disabled alternative: re.compile(r'人员规模小于[0-9]*') applied to `k` would
# extract the "staff size below N" text from the first result card.

result1 = pattern.findall(k2)

# Keep only the first match; skip the append when nothing matched instead of
# failing with an IndexError.
if result1:
    result2.append(result1[0])

print(result2)

猜你喜欢

转载自blog.csdn.net/qq_18251703/article/details/85763322