import json
import requests
from lxml import etree
class JieMianSpider(object):
    """Spider for a.jiemian.com paginated card listings.

    Fetches each listing page (JSONP-wrapped JSON), extracts the HTML
    fragment under the ``rst`` key, parses every ``div.news-img`` card
    into ``{"url": [...], "img": [...], "title": [...]}`` records, and
    writes one JSON array per page to an output file.
    """

    def __init__(self):
        # Browser-like User-Agent so the server serves the normal page.
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36",
        }
        # BUG FIX: requests keys the proxies mapping by URL scheme
        # ("https"), not "https:" — the original trailing colon meant the
        # proxy entry was silently ignored and every request went direct.
        self.proxies = {
            # "https": "https://113.87.160.73:8181",
            "https": "https://218.72.110.144:18118",
        }

    def get_page(self, url):
        """GET *url* and return the body as decoded text.

        Adds a timeout so a dead proxy cannot hang the crawl, and raises
        ``requests.HTTPError`` on 4xx/5xx instead of handing an error page
        to the JSON parser. ``apparent_encoding`` is used because the site
        may not declare its charset correctly in the headers.
        """
        response = requests.get(url, headers=self.headers,
                                proxies=self.proxies, timeout=10)
        response.raise_for_status()
        response.encoding = response.apparent_encoding
        return response.text

    def run(self, max_pages=20, out_path='jiedian.txt'):
        """Crawl pages 1..*max_pages* and write the parsed cards to *out_path*.

        :param max_pages: number of listing pages to fetch (default 20,
            matching the original hard-coded limit).
        :param out_path: destination file; one JSON array is appended per
            page. Opened with explicit UTF-8 because the JSON is written
            with ``ensure_ascii=False`` (non-ASCII titles).
        """
        # NOTE(review): the original URL contained "¬id=" — almost
        # certainly "&notid=" mangled via the HTML entity "&not" -> "¬".
        # Restored here; confirm against the live API.
        url_tpl = ("https://a.jiemian.com/index.php?m=lists&a=cLists&id=242"
                   "&type=card&notid=2080130,2074075,2070788&page={0}")
        with open(out_path, 'w', encoding='utf-8') as f:
            for page in range(1, max_pages + 1):
                # 1. Fetch the page.
                response = self.get_page(url_tpl.format(page))
                # 2. Body is JSONP-like: JSON wrapped in one pair of
                #    parentheses — strip them before parsing.
                res_dict = json.loads(response[1:-1])
                # 3. 'rst' holds an HTML fragment; parse and extract cards.
                html = etree.HTML(res_dict['rst'])
                result = []
                for el in html.xpath('//div[@class="news-img"]'):
                    # xpath() returns lists; keep that shape so the output
                    # format stays identical to the original.
                    result.append({
                        "url": el.xpath('./a/@href'),
                        "img": el.xpath('./a/img/@src'),
                        "title": el.xpath('./a/@title'),
                    })
                line = json.dumps(result, ensure_ascii=False)
                print(line)
                f.write(line)
if __name__ == '__main__':
    # Script entry point: construct the spider and start the crawl.
    spider = JieMianSpider()
    spider.run()