街拍图片的获取。
在网上搜索到一个获取今日头条街拍美图的项目,于是跟着写下了这些代码。代码的思路很简明,下一步要考虑的是如何将这个项目应用到更多的场景。
全部代码如下,包括数据库MongoDB的操作。
代码块
from urllib.parse import urlencode
import requests
from requests.exceptions import RequestException
import json
from bs4 import BeautifulSoup
import re
import lxml
from config import *
import pymongo
import os
from hashlib import md5
from multiprocessing import Pool
# Module-level MongoDB handles used by save_to_mongo().
# NOTE(review): connect=False presumably defers opening the connection until
# first use, which matters because main() runs in Pool worker processes --
# confirm against the PyMongo documentation.
client = pymongo.MongoClient(MONGO_URL,connect=False)
# Database selected by the MONGO_DB name taken from config.
db = client[MONGO_DB]
def get_page_index(offset, keyword):
    """Fetch one page of search results as raw JSON text.

    Args:
        offset: Paging offset, sent as the ``offset`` query parameter.
        keyword: Search keyword, sent as the ``keyword`` query parameter.

    Returns:
        The response body as text when the server answers with HTTP 200,
        otherwise ``None`` (including when the request itself fails).
    """
    data = {
        'offset': offset,
        'format': 'json',
        'keyword': keyword,
        'autoload': 'true',
        'count': '20',
        'cur_tab': 3,
    }
    # Nothing may follow the '?': a stray space would become part of the
    # first parameter name and corrupt the query string.
    url = 'http://www.toutiao.com/search_content/?' + urlencode(data)
    try:
        # The request has to be issued inside the try block, otherwise
        # connection errors escape the RequestException handler below.
        response = requests.get(url)
    except RequestException:
        print('请求索引页出错')
        return None
    if response.status_code == 200:
        # The body is JSON; the caller picks the keys it needs from it.
        return response.text
    return None
def parse_page_index(html):
    """Yield the article URLs contained in a search-index JSON response.

    Args:
        html: JSON text of the index response. May be ``None`` or empty
            when the index request failed.

    Yields:
        The ``article_url`` value of each entry under the top-level
        ``data`` key. Entries without a usable ``article_url`` are skipped.
    """
    if not html:
        # A failed index request hands us None; there is nothing to parse.
        return
    try:
        payload = json.loads(html)
    except ValueError:
        # json.JSONDecodeError is a ValueError subclass; treat a malformed
        # body the same as an empty result instead of crashing the worker.
        return
    if not isinstance(payload, dict):
        return
    # 'data' may be present but null, so fall back to an empty list.
    for item in payload.get('data') or []:
        if not isinstance(item, dict):
            continue
        article_url = item.get('article_url')
        if article_url:
            yield article_url
def get_page_detail(url):
    """Download an article detail page.

    Args:
        url: Address of the detail page.

    Returns:
        The page text on HTTP 200; ``None`` on any other status code or
        when the request raises ``RequestException``.
    """
    try:
        reply = requests.get(url)
        page = reply.text if reply.status_code == 200 else None
    except RequestException:
        print('请求详情页面出错')
        page = None
    return page
def parse_page_detail(html, url):
    """Extract the title and gallery images from an article detail page.

    Every image URL found is also passed to ``download_image``.

    Args:
        html: HTML text of the detail page.
        url: Address the page was fetched from; stored in the result.

    Returns:
        A dict with the keys ``title``, ``url`` and ``images`` when the page
        embeds a ``var gallery = ...;`` JSON object containing
        ``sub_images``; otherwise ``None``.
    """
    soup = BeautifulSoup(html, "lxml")
    # Pages without a <title> element must not abort the crawl.
    title_tags = soup.select('title')
    title = title_tags[0].get_text() if title_tags else ''
    print(title)

    # The gallery data is embedded in the page as a JavaScript assignment.
    images_pattern = re.compile(r'var gallery = (.*?);', re.S)
    match = images_pattern.search(html)
    if not match:
        return None
    try:
        data = json.loads(match.group(1))
    except ValueError:
        # The non-greedy capture stops at the first ';', so the captured
        # text is not guaranteed to be complete JSON.
        return None
    if not isinstance(data, dict) or not data.get('sub_images'):
        return None

    # Skip entries that carry no URL so download_image never receives None.
    images = [
        item.get('url')
        for item in data['sub_images']
        if isinstance(item, dict) and item.get('url')
    ]
    for image in images:
        download_image(image)
    return {
        'title': title,
        'url': url,
        'images': images,
    }
def save_to_mongo(result):
    """Store one parsed article record in the configured MongoDB collection.

    Args:
        result: Dict describing the article, as produced by
            ``parse_page_detail``.

    Returns:
        ``True`` when the insert reports an inserted id, ``False`` otherwise.
    """
    # Collection.insert() is deprecated in PyMongo 3 and removed in
    # PyMongo 4; insert_one() is the supported single-document call.
    outcome = db[MONGO_TABLE].insert_one(result)
    if outcome.inserted_id:
        print('存储到mongo_db成功', result)
        return True
    return False
def download_image(url):
    """Fetch one image and hand its bytes to ``save_image``.

    Args:
        url: Address of the image to download.

    Returns:
        Always ``None``. Non-200 responses are ignored and request errors
        are only reported on stdout.
    """
    print('正在下载', url)
    try:
        reply = requests.get(url)
        got_image = reply.status_code == 200
        if got_image:
            save_image(reply.content)
    except RequestException:
        print('下载出错', url)
    return None
def save_image(content):
    """Write image bytes to a ``.jpg`` file in the current working directory.

    The file is named after the MD5 digest of its content, so identical
    images map to the same path and are written only once.

    Args:
        content: Raw image bytes.
    """
    # The whole expression must stay on one logical line; breaking right
    # after '=' is a syntax error.
    file_path = '{0}/{1}.{2}'.format(
        os.getcwd(), md5(content).hexdigest(), 'jpg')
    if not os.path.exists(file_path):
        # The with-statement closes the file; no explicit close() is needed.
        with open(file_path, 'wb') as f:
            f.write(content)
def main(offset):
    """Crawl one index page and store every gallery found on it.

    Fetches the index page for ``offset``, then fetches and parses each
    article it lists, and saves every non-empty parse result to MongoDB.

    Args:
        offset: Paging offset of the index page to process.
    """
    index_html = get_page_index(offset, KEYWORD)
    if not index_html:
        # The index request failed or returned a non-200 status; passing
        # None on to the parser would raise inside json.loads.
        return
    for url in parse_page_index(index_html):
        if not url:
            # Index entries without an article URL cannot be fetched.
            continue
        detail_html = get_page_detail(url)
        if not detail_html:
            continue
        result = parse_page_detail(detail_html, url)
        if result:
            save_to_mongo(result)
if __name__ == '__main__':
    # One offset per index page: multiples of 20 from GROUP_START to
    # GROUP_END inclusive.
    groups = [x * 20 for x in range(GROUP_START, GROUP_END + 1)]
    # Run main() for the offsets across worker processes. map() blocks until
    # all work is done, and the context manager then releases the pool
    # instead of leaving it open.
    with Pool() as pool:
        pool.map(main, groups)
另外:配置文件 config.py(包含数据库连接等设置)的内容如下:
# Settings pulled in by the crawler through `from config import *`.
# String values must use plain ASCII quotes; typographic quotes are not
# valid Python string delimiters.
MONGO_URL = 'localhost'  # passed to pymongo.MongoClient
MONGO_DB = 'toutiao'  # database name
MONGO_TABLE = '街拍'  # collection the results are inserted into
GROUP_START = 1  # first index group; the offset is group * 20
GROUP_END = 20  # last index group (inclusive)
KEYWORD = '街拍'  # search keyword sent to the index request
最后欢迎一起交流……