1.有道翻译
元素如下
代码:
import urllib.request
import urllib.parse

# NOTE: the '_o' endpoint ('translate_o') triggers Youdao's anti-scraping
# check and answers {"errorCode":50}; the plain 'translate' endpoint works.
url = 'http://fanyi.youdao.com/translate?smartresult=dict&smartresult=rule'

# Form fields copied from the browser's POST. Per the notes below, only
# 'doctype' and 'i' are strictly required; the rest are kept to mirror
# the real request.
data = {}
data['action'] = 'FY_BY_CLICKBUTTION'
data['client'] = 'fanyideskweb'
data['doctype'] = 'json'
data['from'] = 'AUTO'
# Bug fix: use plain text here. urlencode() performs the escaping itself,
# so a pre-encoded 'I+love+fish' would be double-encoded ('+' -> '%2B')
# and the server would see the literal string "I+love+fish".
data['i'] = 'I love fish'
data['keyfrom'] = 'fanyi.web'
data['salt'] = '1538035011463'
data['sign'] = 'ad6798a0ad1cb20ca5426bfe6d21aace'
data['smartresult'] = 'dict'
data['to'] = 'AUTO'
data['typoResult'] = 'false'
data['version'] = '2.1'

# Encode the form to bytes: urlopen() sends a POST when `data` is given.
data = urllib.parse.urlencode(data).encode('utf-8')
response = urllib.request.urlopen(url, data)
html = response.read().decode('utf-8')
print(html)
运行会报错:{"errorCode":50}
将url中的 '_o'删除后运行结果正常,即
url = 'http://fanyi.youdao.com/translate?smartresult=dict&smartresult=rule'
尝试将data字典中的一些键值对删去,doctype和i不可以删,其他的删了也能正常运行
2. 爬取http://www.51yuansu.com/
代码如下:
'''
Python爬虫练习 --爬取图片
'''
import urllib.request
import os
# Module-level accumulators shared by the crawl functions below.
img_addrs = []   # image URLs collected by find_imgs(); consumed by save_imgs()
html_addrs = []  # page URLs collected by get_page(); iterated by down_pic()
def url_open(url):
    """Fetch *url* and return the raw response body as bytes.

    Returns bytes rather than str on purpose: save_imgs() writes the
    result straight to a binary file; text callers decode it themselves.
    """
    req = urllib.request.Request(url)
    req.add_header('User-Agent',
                   'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:62.0) '
                   'Gecko/20100101 Firefox/62.0')
    # Bug fix: the original called urlopen(url), discarding `req`, so the
    # User-Agent header above was never actually sent.
    response = urllib.request.urlopen(req)
    return response.read()
def get_page(url):
    """Scan the page at *url* and append every on-site '.html' link found
    to the module-level list `html_addrs`.

    Links are located textually: each occurrence of the site prefix is
    matched with a '.html' suffix within the next 255 characters.
    """
    text = url_open(url).decode('utf-8')
    start = text.find('http://www.51yuansu.com')
    print("get_page...")
    while start != -1:
        end = text.find('.html', start, start + 255)
        if end == -1:
            # No '.html' within range: skip this candidate and keep scanning.
            start = text.find('http://www.51yuansu.com', start + 255)
        else:
            html_addrs.append(text[start:end + 5])
            start = text.find('http://www.51yuansu.com', end + 5)
def find_imgs(url):
    """Collect 'http://pic.' … '.jpg' image URLs from the page at *url*
    into the module-level list `img_addrs`.

    Same textual-search scheme as get_page(): a prefix match is paired
    with a '.jpg' suffix within the next 255 characters.
    """
    print("find_image...")
    text = url_open(url).decode('utf-8')
    pos = text.find('http://pic.')
    while pos != -1:
        end = text.find('.jpg', pos, pos + 255)
        print(end)
        if end == -1:
            # No '.jpg' within range: drop this candidate, keep scanning.
            pos = text.find('http://pic.', pos + 255)
        else:
            img_addrs.append(text[pos:end + 4])
            print(img_addrs)
            pos = text.find('http://pic.', end + 4)
def save_imgs(folder, img_addrs):
    """Download every URL in *img_addrs* and save each under its basename.

    NOTE(review): `folder` is never used here — files land in the current
    working directory, which down_pic() has already chdir'd into; confirm
    that is the intent.
    """
    for addr in img_addrs:
        name = addr.split('/')[-1]
        with open(name, 'wb') as out:
            out.write(url_open(addr))
def down_pic(folder='Picture-ALL', pages=1):
    """Crawl the 51yuansu listing page and download every image found.

    folder: directory the downloads are written into (the process chdirs
            into it, so subsequent relative paths land there).
    pages:  currently unused — kept for backward compatibility with
            existing callers.
    """
    # Robustness fix: os.mkdir raised FileExistsError on every re-run.
    os.makedirs(folder, exist_ok=True)
    os.chdir(folder)
    url_init = 'http://www.51yuansu.com/all/'
    html_addrs.append(url_init)
    print('first:', html_addrs)
    get_page(url_init)  # populates html_addrs; returns None (old page_num was never used)
    for each in html_addrs:
        find_imgs(each)  # accumulates into the global img_addrs
        print(img_addrs)
    # Save once, after all pages are scanned: img_addrs is cumulative, so
    # saving inside the loop re-downloaded every earlier image each pass.
    save_imgs(folder, img_addrs)
# Script entry point: run the full crawl only when executed directly,
# not when this module is imported.
if __name__ == '__main__':
    down_pic()