# -*- coding: utf-8 -*-

# Date: 2020-09-20 16:52

# USER: gordon_lu

# Use regular expressions to filter out the specified URL links.

r"""
oo0oo
o8888888o
88" . “88
(| -- |)
0\ = /0
/ ‘—’ _
.’ | |/ ‘.
/ \||| : |||//
/ ||||| -卍- |||||
| | \\ - /// | |
| _| ‘’\ — /’’ | |
\ .-__ ‘-’ /-. /
， . ’ /–.--\ ’ ’
. ""￣￣ <’ '. _<|>/.’ '>￣￣ “” .
| | : ‘- \ .; ’ \ _ /’ ;, / - ’ : | |
\ \ '. _ __ \ / / . _.’ / /
===== '-.___ ‘.___ ___/.-’ _____.-’ =====
‘=—=’
"""
import os
import random
import re
from urllib.parse import quote

import requests

# 1. Get the URL

# Ask for the search term, then build the Baidu image-search URL for it.
word = input("请输入你要爬虫的内容【暂时仅支持英文和字母】：")

# Percent-encode the term so that spaces, "&", "#" and similar characters
# cannot break or truncate the query string.
url = (
    "https://image.baidu.com/search/index?tn=baiduimage&ipn=r&ct=201326592&cl=2"
    "&lm=&st=-1&fm=result&fr=&sf=1&fmq=1600592048477_R&pv=&ic=&nc=1&z=&hd="
    "&latest=&copyright=&se=1&showtab=0&fb=0&width=&height=&face=0&istype=2"
    f"&ie=utf-8&sid=&word={quote(word)}"
)

def random_user_agent():
    """Return one User-Agent string picked at random from a built-in pool.

    Returns:
        str: A single entry of the pool, suitable as a ``user-agent`` header.
    """
    # Every entry needs its own trailing comma: without it, adjacent string
    # literals are silently concatenated into one bogus header value.
    # NOTE(review): the first entry has unusual spacing ("AppleWebKit / 537.36",
    # "likeGecko") and the last two entries are identical, so that value is
    # chosen twice as often -- confirm whether either is intentional.
    ulist = [
        "Mozilla/5.0 (Windows NT 6.1;Win64;x86) AppleWebKit / 537.36 (KHTML, likeGecko) Chrome / 88.0.4183.102Safari / 537.36",
        "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.102 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.102 Safari/537.36",
    ]
    return random.choice(ulist)

def _image_file_name(image_url, index):
    """Derive a local file name for ``image_url``; ``index`` names URLs whose last path segment is empty."""
    # Drop any query string or fragment so characters such as "?" never end
    # up in the file name.
    path = image_url.split("?", 1)[0].split("#", 1)[0]
    name = path.rsplit("/", 1)[-1] or f"image_{index}"
    # Append ".jpg" only when the name lacks a known image extension.
    if not name.lower().endswith((".jpg", ".png", ".jpeg", ".gif")):
        name += ".jpg"
    return name


def get_image(url):
    """Download every image linked from an image-search results page.

    The page at ``url`` is fetched, each ``"objURL"`` value is pulled out of
    the response text with a regular expression, and every image is written
    to the ``./baidu_pic`` directory (created when missing). Images that
    cannot be downloaded are reported and skipped, so one bad link does not
    abort the whole run.

    Args:
        url: Address of the search results page; it is also sent as the
            ``referer`` header of the page request.

    Raises:
        requests.RequestException: If the results page itself cannot be
            fetched (connection failure or timeout).
    """
    output_dir = "./baidu_pic"
    headers = {
        "user-agent": random_user_agent(),
        "referer": url,
    }

    # A timeout keeps a stalled connection from hanging the script forever.
    result = requests.get(url, headers=headers, timeout=10).text

    # The full-size image addresses are hard to spot in the page; they are
    # embedded as "objURL" values, which this pattern extracts.
    image_urls = re.findall(r'"objURL":"(.*?)"', result)

    # open() does not create missing directories, so make sure the target
    # directory exists before writing the first file.
    os.makedirs(output_dir, exist_ok=True)

    for index, image_url in enumerate(image_urls):
        image_name = _image_file_name(image_url, index)
        print(image_name)

        try:
            response = requests.get(image_url, timeout=10)
            # Do not save an HTTP error page as if it were an image.
            response.raise_for_status()
        except requests.RequestException as error:
            print(f"skipped {image_url}: {error}")
            continue

        with open(os.path.join(output_dir, image_name), "wb") as f:
            f.write(response.content)

# Run the download for the search URL built above.
get_image(url)

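# Source code: crawling high-definition images from Baidu image search

# Date: 2020-09-20 16:52

# USER: gordon_lu

# Use regular expressions to filter out the specified URL links.

# 1. Get the URL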

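# You may also like

# Source code: crawling high-definition images from Baidu image search

# Date: 2020-09-20 16:52

# USER: gordon_lu

# Use regular expressions to filter out the specified URL links.

# 1. Get the URL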

# You may also like

# Use regular expressions to filter out the specified URL links.