版权声明:自学笔记,如有引用请标明博客,感谢 https://blog.csdn.net/feng_jlin/article/details/82218819
-
#1.设定程序休止时间
-
import time
-
time.sleep(5) #sleep for 5 seconds
-
#2.设定代理,比如fb和微博等一些成熟的网站检测严格,会封锁IP
-
#使用urllib.request的两个方法进行代理的设置
-
proxy = urlrequest.ProxyHandler({'https': '47.91.78.201:3128'})#服务器地址
-
opener = urlrequest.build_opener(proxy)
360浏览器
User-Agent: Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)
![二维码](/qrcode.jpg)
opener.addheaders = [('User-Agent','...')]
设定 User-Agent 是为了告诉目标网站(服务器)我用的是浏览器,而不是 Python,从而模拟浏览器访问;如果不设定,网站就会知道你是 Python 爬虫,可能会限制访问速率等等
-
import urllib.request as urlrequest
-
import time #休息时间
-
import random #为了时间随机
-
# --- Crawler configuration -------------------------------------------------
# Destination path for downloaded images; {} is filled with the image id.
IMG_PATH = 'C:/Users/feng_jlin/Desktop/imgs/{}.jpg'

# Input CSV of voting data (image pairs with coordinates); downloaded from
# http://pulse.media.mit.edu/data/ per the author's note at the end of the file.
DATA_FILE = 'C:/Users/feng_jlin/Desktop/data/votes.csv'

# On-disk record of image ids already downloaded, so an interrupted crawl can
# resume without re-fetching images it already has.
STORED_IMG_ID_FILE = 'C:/Users/feng_jlin/Desktop/data/cached_img.txt'

# In-memory mirror of STORED_IMG_ID_FILE, used for fast membership checks.
STORED_IMG_IDS = set()

# Google Maps Street View image API; the two {} are latitude and longitude.
# NOTE(review): the modern Street View Static API requires an API `key`
# parameter — this keyless URL likely no longer works; confirm before use.
IMG_URL = 'https://maps.googleapis.com/maps/api/streetview?size=400x300&location={},{}'
-
# --- Network setup ---------------------------------------------------------
# Route HTTPS requests through a proxy server so the crawler's own IP is not
# the one the target site sees (sites like Google may block heavy crawlers).
_proxy_settings = {'https': '47.91.78.201:3128'}

# Present a browser-like User-Agent (360 Secure Browser) instead of the
# default Python one, so the site treats us like an ordinary browser and is
# less likely to rate-limit or block the requests.
_browser_ua = ('User-Agent', 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)')

proxy = urlrequest.ProxyHandler(_proxy_settings)
opener = urlrequest.build_opener(proxy)
opener.addheaders = [_browser_ua]

# Install the opener module-wide: every urlrequest call below (urlretrieve
# included) now goes through the proxy with the header above.
urlrequest.install_opener(opener)
-
# Load the ids of images downloaded in previous runs into STORED_IMG_IDS so
# the crawl can resume where it left off.  A missing cache file simply means
# nothing has been downloaded yet (first run) — it is not an error, so we no
# longer crash with FileNotFoundError when the file hasn't been created.
try:
    with open(STORED_IMG_ID_FILE, encoding='utf-8') as input_file:
        for line in input_file:
            img_id = line.strip()
            if img_id:  # guard: skip blank lines so '' never enters the set
                STORED_IMG_IDS.add(img_id)
except FileNotFoundError:
    pass  # no cache yet — STORED_IMG_IDS stays empty and everything downloads
-
def _save_streetview_img(img_id, lat, lng):
    """Download one street-view image and record it in the cache.

    Skips ids already in STORED_IMG_IDS.  Otherwise: fetches the image for
    (lat, lng) into IMG_PATH, adds the id to the in-memory set, appends it to
    STORED_IMG_ID_FILE (append mode, so earlier entries survive), then sleeps
    a random 1-5 s so the request pattern looks less like a bot and Google is
    less likely to block the crawler.
    """
    if img_id in STORED_IMG_IDS:
        return
    print('saving img {}...'.format(img_id))  # progress indicator
    # urlretrieve opens the URL and streams it straight into the local path,
    # like an open()+write() pair in one call.
    urlrequest.urlretrieve(IMG_URL.format(lat, lng), IMG_PATH.format(img_id))
    STORED_IMG_IDS.add(img_id)
    with open(STORED_IMG_ID_FILE, 'a') as output_file:
        output_file.write('{}\n'.format(img_id))
    # wait some time, trying to avoid google forbidden (of crawler)
    time.sleep(random.uniform(1, 5))


with open(DATA_FILE) as input_file:
    skip_first_line = True  # the first CSV row is the header, not data
    for line in input_file:
        if skip_first_line:
            skip_first_line = False
            continue  # header consumed; real rows start next iteration
        # Split the CSV row on commas into its eight columns.  (category keeps
        # its trailing newline, as in the original code; it is unused here.)
        left_id, right_id, winner, left_lat, left_long, right_lat, right_long, category = line.split(',')
        # The left/right handling was duplicated verbatim; one helper now
        # covers both images of the pair.
        _save_streetview_img(left_id, left_lat, left_long)
        _save_streetview_img(right_id, right_lat, right_long)
需要手动建文件夹
txt为手动建,csv为 http://pulse.media.mit.edu/data/ 下载的资料