街拍美图获取

街拍图片的获取。

在网上搜索到一个获取今日头条街拍美图的项目,于是跟着写下了这些代码。代码的思路很简明,下一步是考虑如何将这个项目应用到更多的场景。


全部代码如下,包括数据库MongoDB的操作。

代码块

from urllib.parse import urlencode
import requests
from requests.exceptions import RequestException
import json
from bs4 import BeautifulSoup
import re
import lxml
from config import *
import pymongo
import os
from hashlib import md5
from multiprocessing import Pool

# Module-wide MongoDB client, built from the MONGO_URL setting in config.
# connect=False is passed explicitly.
# NOTE(review): presumably so that no connection is opened before the
# multiprocessing Pool starts its worker processes — confirm against the
# PyMongo documentation.
client = pymongo.MongoClient(MONGO_URL,connect=False)
# Database handle selected by the MONGO_DB setting in config.
db = client[MONGO_DB]


def get_page_index(offset, keyword):
    """Fetch one page of search results as raw JSON text.

    Args:
        offset: Value sent as the ``offset`` query parameter; it selects
            which slice of the scrolling result list is returned.
        keyword: Value sent as the ``keyword`` query parameter.

    Returns:
        The response body text when the server answers with HTTP 200,
        otherwise ``None`` (non-200 status or a failed request).
    """
    data = {
        'offset': offset,
        'format': 'json',
        'keyword': keyword,
        'autoload': 'true',
        'count': '20',
        'cur_tab': 3,
    }
    # The query string must follow the '?' directly; a space in between
    # ends up in front of the first parameter name.
    url = 'http://www.toutiao.com/search_content/?' + urlencode(data)
    try:
        # The request has to be issued inside the try block, otherwise the
        # RequestException handler below can never run.
        response = requests.get(url)
    except RequestException:
        print('请求索引页出错')
        return None
    if response.status_code == 200:
        return response.text
    return None

def parse_page_index(html):
    """Yield the article URLs listed in a search-result JSON document.

    Args:
        html: JSON text of a search-result page, or ``None`` when no page
            could be fetched.

    Yields:
        The ``article_url`` value of every entry in the top-level ``data``
        list that has a non-empty one. Nothing is yielded when the input
        is missing, is not valid JSON, or carries no ``data`` list.
    """
    try:
        data = json.loads(html)
    except (TypeError, ValueError):
        # TypeError: html is None; ValueError: the text is not valid JSON.
        return
    if not isinstance(data, dict):
        return
    entries = data.get('data')
    if not isinstance(entries, list):
        # Covers a missing key as well as "data": null.
        return
    for item in entries:
        if not isinstance(item, dict):
            continue
        article_url = item.get('article_url')
        # Entries without an article URL are useless to the caller.
        if article_url:
            yield article_url

def get_page_detail(url):
    """Download the article page at *url*.

    Returns the page text when the server answers with HTTP 200, and
    ``None`` for any other status or when the request raises
    ``RequestException``.
    """
    try:
        page = requests.get(url)
        return page.text if page.status_code == 200 else None
    except RequestException:
        print('请求详情页面出错')
        return None

def parse_page_detail(html, url):
    """Extract the title and gallery image URLs from an article page.

    Every image URL found is also passed to ``download_image``.

    Args:
        html: HTML text of the article page.
        url: Address of the article page; copied into the result.

    Returns:
        A dict with keys ``title``, ``url`` and ``images`` (list of image
        URLs) when the page embeds a ``var gallery = ...;`` JSON object
        containing ``sub_images``; otherwise ``None``.
    """
    soup = BeautifulSoup(html, "lxml")
    title_tags = soup.select('title')
    # A page without a <title> element must not abort the crawl.
    title = title_tags[0].get_text() if title_tags else ''
    print(title)

    images_pattern = re.compile('var gallery = (.*?);', re.S)
    result = re.search(images_pattern, html)
    if not result:
        return None
    try:
        # The non-greedy capture stops at the first ';', so a ';' inside
        # the JSON truncates it; treat undecodable text as "no gallery".
        data = json.loads(result.group(1))
    except ValueError:
        return None
    if not isinstance(data, dict) or 'sub_images' not in data:
        return None

    sub_images = data.get('sub_images') or []
    # Skip entries that carry no URL instead of handing None downstream.
    images = [
        item.get('url')
        for item in sub_images
        if isinstance(item, dict) and item.get('url')
    ]
    for image in images:
        download_image(image)
    return {
        'title': title,
        'url': url,
        'images': images,
    }

def save_to_mongo(result):
    """Insert one parsed article record into the MONGO_TABLE collection.

    Args:
        result: Dict describing one article.

    Returns:
        ``True`` when the insert yields a document id, ``False`` otherwise.
    """
    # Collection.insert() is deprecated in PyMongo 3 and removed in
    # PyMongo 4; insert_one() is its replacement for a single document.
    if db[MONGO_TABLE].insert_one(result).inserted_id:
        print('存储到mongo_db成功', result)
        return True
    return False

def download_image(url):
    """Fetch the image at *url* and pass its bytes to ``save_image``.

    Nothing is saved unless the server answers with HTTP 200. A
    ``RequestException`` is reported on stdout. Always returns ``None``.
    """
    print('正在下载', url)
    try:
        reply = requests.get(url)
        if reply.status_code != 200:
            return None
        save_image(reply.content)
    except RequestException:
        print('下载出错', url)
    return None

def save_image(content):
    """Write image bytes into the current working directory.

    The file is named ``<md5 hex digest of content>.jpg``, so identical
    content always maps to the same path and is written only once.

    Args:
        content: Raw image bytes.
    """
    file_path = '{0}/{1}.{2}'.format(
        os.getcwd(), md5(content).hexdigest(), 'jpg')
    # An existing file with this digest already holds the same bytes.
    if not os.path.exists(file_path):
        # The with-statement closes the file; no explicit close() needed.
        with open(file_path, 'wb') as f:
            f.write(content)


def main(offset):
    """Crawl one search-result page and store every gallery found on it.

    Fetches the index page for *offset* and KEYWORD, then fetches, parses
    and stores each article it lists.

    Args:
        offset: Paging offset passed to ``get_page_index``.
    """
    index_html = get_page_index(offset, KEYWORD)
    if not index_html:
        # The index request failed or returned nothing; nothing to parse.
        return
    for url in parse_page_index(index_html):
        if not url:
            # Index entries without an article URL cannot be fetched.
            continue
        detail_html = get_page_detail(url)
        if not detail_html:
            continue
        result = parse_page_detail(detail_html, url)
        if result:
            save_to_mongo(result)


if __name__ == '__main__':
    # One task per index page: offsets advance in steps of 20.
    groups = [x * 20 for x in range(GROUP_START, GROUP_END + 1)]
    # Run the pages in parallel worker processes. The context manager
    # releases the pool once map() has returned all results.
    with Pool() as pool:
        pool.map(main, groups)

另外,config.py 中的配置(数据库连接等)如下:
# Settings pulled into the crawler with `from config import *`.
# String values need plain ASCII quotes; typographic quotes are a
# syntax error in Python.
MONGO_URL = 'localhost'   # host handed to pymongo.MongoClient
MONGO_DB = 'toutiao'      # database name
MONGO_TABLE = '街拍'       # collection the records are inserted into
GROUP_START = 1           # first page group; offset = group * 20
GROUP_END = 20            # last page group (inclusive)
KEYWORD = '街拍'           # search keyword sent to the index request

最后欢迎一起交流……

猜你喜欢

转载自blog.csdn.net/qq_37884273/article/details/74625815