1. 网址
https://news.163.com/
2. 页面解析
上面这部分可直接在源码里找到
下面这部分通过js异步加载
3. 异步加载部分
向下刷新页面,查找发起请求的地址;url变化也比较明显;找到接口后可直接获取数据
4. 移动端接口
打开f12 ,选择移动端,刷新页面,即可跳转到移动端页面,如下
5. 移动端解析
移动端也是通过js异步加载,向下刷新查找数据接口;数据接口比较明显,可直接访问
6. 源码参考
import re
import json
import aiohttp
import asyncio
class Spider(object):
    """Crawl 163.com news listing pages served as ``artiList(...)`` JSONP.

    A page is downloaded with aiohttp, the JSONP wrapper is stripped, and each
    article found is appended to ``output_path`` as one JSON object per line.
    """

    # Class-level list shared by all instances; no method of this class
    # reads or writes it.
    data = []
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'
    }
    # Key under which the JSONP payload stores the article list
    # (presumably the channel id of the feed being crawled).
    channel = 'BBM54PGAwangning'
    # File that parsed articles are appended to (JSON Lines, UTF-8).
    output_path = 'wangyi.json'

    # DOTALL so that a payload spread over several lines is still captured;
    # the greedy group runs up to the last ")" of the response.
    _JSONP_RE = re.compile(r'artiList\((.*)\)', re.S)

    # (output key, payload key) pairs, in the order they are written.
    _FIELDS = (
        ('文档id', 'docid'),          # document id
        ('来源', 'source'),           # source
        ('标题', 'title'),            # title
        ('优先权', 'priority'),       # priority
        ('详情页', 'url'),            # detail page URL
        ('评论数', 'commentCount'),   # comment count
        ('摘要', 'digest'),           # digest / summary
        ('首页图片', 'imgsrc'),       # cover image
        ('发布时间', 'ptime'),        # publish time
    )

    async def fetch(self, session, url):
        """
        Download a page and return its body as text.

        :param session: aiohttp client session used for the request.
        :param url: address of the page to download.
        :return: response body decoded as UTF-8.
        """
        async with session.get(url, headers=self.headers) as response:
            # response.text() accepts an explicit encoding to decode with.
            return await response.text(encoding='utf-8')

    def _extract_articles(self, html):
        """Return the article list inside a JSONP page, or [] if there is none."""
        match = self._JSONP_RE.search(html)
        if match is None:
            print('skipped page: no artiList(...) payload found')
            return []
        try:
            payload = json.loads(match.group(1))
        except ValueError as exc:  # json.JSONDecodeError subclasses ValueError
            print('skipped page: invalid JSON payload ({})'.format(exc))
            return []
        articles = payload.get(self.channel) if isinstance(payload, dict) else None
        if not isinstance(articles, list):
            print('skipped page: no article list under key {!r}'.format(self.channel))
            return []
        return articles

    async def parser(self, html):
        """
        Parse one listing page and append its articles to ``output_path``.

        The title of every article is printed as it is processed. A page
        without a usable payload is skipped with a printed message instead of
        raising, so one bad response does not abort the other downloads.

        :param html: raw JSONP text of a listing page.
        :return: None
        :raises KeyError: if an article lacks one of the expected fields.
        """
        articles = self._extract_articles(html)
        if not articles:
            return
        # Open the output once per page rather than once per article.
        with open(self.output_path, 'a', encoding='utf-8') as f:
            for article in articles:
                item = {label: article[key] for label, key in self._FIELDS}
                print(item['标题'])
                f.write(json.dumps(item, ensure_ascii=False) + '\n')

    async def download(self, url, table):
        """
        Fetch one listing page and pass its body to :meth:`parser`.

        :param url: address of the listing page.
        :param table: unused; kept so existing callers keep working.
        :return: None
        """
        # Cap this connector at 10 simultaneous connections and skip
        # certificate validation (``ssl=False`` replaces the deprecated
        # ``verify_ssl=False``).
        async with aiohttp.TCPConnector(limit=10, ssl=False) as tc:
            # The session sends its requests through the connector above.
            async with aiohttp.ClientSession(connector=tc) as session:
                html = await self.fetch(session, url)
                await self.parser(html)
async def _crawl(urls):
    """Download every listing page in ``urls`` concurrently with one Spider."""
    spider = Spider()
    # gather() is awaited inside the running loop, so no loop lookup is
    # needed to schedule the downloads.
    await asyncio.gather(*(spider.download(url, spider.data) for url in urls))


if __name__ == '__main__':
    import time

    t0 = time.time()
    # Listing pages at offsets 0, 10, ..., 300, ten entries per page.
    urls = ['https://3g.163.com/touch/reconstruct/article/list/BBM54PGAwangning/{}-10.html'.format(i*10) for i in
            range(31)]
    # Create the loop explicitly: asyncio.get_event_loop() without a running
    # loop is deprecated and fails on recent Python versions.
    loop = asyncio.new_event_loop()
    try:
        loop.run_until_complete(_crawl(urls))
    finally:
        loop.close()
    # Elapsed wall-clock seconds for the whole crawl.
    print(time.time()-t0)
    # e.g. 0.8280472755432129 (value recorded in the original source)