初识网络爬虫

（此处原有配图，转载时图片未能保留）

HTTP请求

urllib模块

GET请求

# GET request with urllib: fetch a page and read the body.
# Bug fix: in Python 3, `import urllib` alone does NOT make the
# `urllib.request` submodule available — it must be imported explicitly.
import urllib.request

response = urllib.request.urlopen('http://www.zhihu.com')
html = response.read()  # raw response body (bytes)
print(html)

# Same GET, but building an explicit Request object first — useful
# when headers or data will be attached later.
request = urllib.request.Request('https://weibo.com')
response = urllib.request.urlopen(request)
html = response.read()
print(html)

POST请求

# POST request with urllib: passing `data` to Request switches the
# method to POST. Bug fix: explicitly import the `urllib.request` and
# `urllib.parse` submodules — `import urllib` alone exposes neither.
import urllib.parse
import urllib.request

url = 'https://weibo.com/login'
postdata = {
    'uname': ' [email protected]',
    'password': 'suhangshispz '}
# Form data must be URL-encoded, then byte-encoded, before sending.
data = urllib.parse.urlencode(postdata).encode('utf-8')
req = urllib.request.Request(url, data)
response = urllib.request.urlopen(req)
html = response.read()
print(html)

Headers处理

# POST with custom request headers.
# Bug fixes: (1) import the needed urllib submodules explicitly;
# (2) the header name is 'User-Agent' with a hyphen — the original
# 'User_Agent' key meant the UA string was never actually sent.
import urllib.parse
import urllib.request

url = 'http://www.weibo.com/login'
user_agent = 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:55.0) Gecko/20100101 Firefox/55.0'
referer = 'http://s.weibo.com/'
postdata = {
    'uname': ' [email protected]',
    'password': 'suhangshispz '}
# Spoof a browser UA and a plausible Referer to avoid naive bot filters.
headers = {'User-Agent': user_agent, 'Referer': referer}
data = urllib.parse.urlencode(postdata).encode('utf-8')
req = urllib.request.Request(url, data, headers)
response = urllib.request.urlopen(req)
html = response.read()
print(html)

Cookie处理

# Cookie handling with urllib.
# Bug fix: import `urllib.request` explicitly (Python 3).
import urllib.request
from http import cookiejar

# Let a CookieJar capture every cookie the server sets.
cookie = cookiejar.CookieJar()
opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cookie))
response = opener.open('http://www.zhihu.com')
for item in cookie:
    print(item.name + ':' + item.value)

# Alternatively, attach a hand-written Cookie header to every request
# this opener makes.
opener = urllib.request.build_opener()
opener.addheaders.append(('Cookie', 'email=' + '[email protected]'))
req = urllib.request.Request('http://www.zhihu.com/')
response = opener.open(req)
print(response.headers)
retdata = response.read()

Timeout设置

# Timeout control: either set a process-wide default on the socket
# module, or pass `timeout=` per call.
# Bug fixes: import `urllib.request` explicitly; the original also
# called `urllib.request.socket.setdefaulttimeout(10)`, which reaches
# the same socket module through an implementation detail — dropped.
import socket
import urllib.request

socket.setdefaulttimeout(10)  # default for all newly created sockets

request = urllib.request.Request('http://www.zhihu.com')
# Per-request timeout (seconds) overrides the global default.
response = urllib.request.urlopen(request, timeout=2)
html = response.read()
print(html)

HTTP响应码

# Handling HTTP error responses.
# Bug fix: HTTPError and URLError are defined in `urllib.error`, not
# `urllib.request` (the latter only re-exports them as a side effect);
# also import the submodules explicitly for Python 3.
import urllib.error
import urllib.request

try:
    response = urllib.request.urlopen('http://www.python.org/fish.html')
    print(response)
except (urllib.error.HTTPError, urllib.error.URLError) as e:
    # URLError has no `code` attribute (DNS/connection failures),
    # so guard before reading it.
    if hasattr(e, 'code'):
        print('Error code:', e.code)

重定向

# Redirect detection and a custom redirect handler.
# Bug fix: import `urllib.request` explicitly (Python 3).
import urllib.request

response = urllib.request.urlopen('http://www.zhihu.com')
# We were redirected if the final URL differs from the one requested
# (the original compared against an unrelated URL).
isRedirected = response.geturl() != 'http://www.zhihu.com'


class RedirectHandler(urllib.request.HTTPRedirectHandler):
    """Refuse permanent (301) redirects; follow 302s while recording
    the status code and final URL on the returned response."""

    def http_error_301(self, req, fp, code, msg, headers):
        # Returning None tells urllib not to follow the redirect.
        pass

    def http_error_302(self, req, fp, code, msg, headers):
        # Bug fix: the original signature misnamed `code` as `close`
        # and `headers` as `header`, and used an undefined `result`.
        # Delegate to the base class to actually follow the redirect,
        # then annotate the response it returns.
        result = urllib.request.HTTPRedirectHandler.http_error_302(
            self, req, fp, code, msg, headers)
        result.status = code
        result.newurl = result.geturl()
        return result


opener = urllib.request.build_opener(RedirectHandler)
opener.open('http://www.zhihu.cn')

Proxy设置

# Routing requests through an HTTP proxy.
# Bug fix: import `urllib.request` explicitly (Python 3).
import urllib.request

proxy = urllib.request.ProxyHandler({'http': '121.42.167.160'})
opener = urllib.request.build_opener(proxy)
response = opener.open('http://www.zhihu.com/')
print(response.read())

Requests模块

GET请求

# Basic GET with requests; `.content` is the raw body as bytes.
import requests

resp = requests.get('http://www.baidu.com')
print(resp.content)

POST请求

# POST a form with requests — the dict is form-encoded automatically.
import requests

form = {
    'uname': ' [email protected]',
    'password': 'suhangshispz '}
resp = requests.post('http://weibo.com/login', data=form)
print(resp.content)

复杂请求

# Query-string parameters go in `params`; requests encodes them
# into the final URL.
import requests

query = {'Keywords': 'blog:qiyeboy', 'pageindex': 1}
resp = requests.get('http://zzk.cnblogs.com/s/blogpost', params=query)
print(resp.url)

响应

# Working with response text and encodings.
import requests
import chardet

# requests guesses the encoding from the headers; it can be overridden.
resp = requests.get('http://www.baidu.com')
print('text-->' + resp.text)
print('encoding-->' + resp.encoding)
resp.encoding = 'utf-8'
print('new text-->' + resp.text)

# Detect the real charset from the raw bytes, then re-decode.
resp = requests.get('http://www.baidu.com')
print(chardet.detect(resp.content))
resp.encoding = chardet.detect(resp.content)['encoding']
print(resp.text)

# Streamed download: read the low-level stream directly.
resp = requests.get('http://www.baidu.com', stream=True)
print(resp.raw.read(10))

Headers处理

# Send a browser-like User-Agent header with the request.
import requests

ua = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.90 Safari/537.36 2345Explorer/9.3.2.17331'
resp = requests.get('http://www.baidu.com', headers={'User-Agent': ua})
print(resp.content)

响应码&响应头

# Inspect the status code and headers; raise on an HTTP error status.
import requests

resp = requests.get('http://www.baidu.com')
if resp.status_code != requests.codes.ok:
    resp.raise_for_status()
else:
    # Headers behave like a case-insensitive dict: both .get() and
    # item access work.
    print(resp.status_code, '\n', resp.headers, '\n',
          resp.headers.get('content-type'), '\n', resp.headers['content-type'])

Cookie处理

# --- Read cookies that the server set on the response ---
import requests

user_agent = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.90 Safari/537.36 2345Explorer/9.3.2.17331'
headers = {'User-Agent': user_agent}
r = requests.get('http://www.baidu.com', headers=headers)
# Idiom fix: iterate name/value pairs directly instead of the
# `.keys()` + `.get()` double-lookup anti-pattern.
for name, value in r.cookies.items():
    print(name + ':' + value)

# --- Send custom cookies with the request ---
cookies = dict(name='qiye', age='10')
r = requests.get('http://www.baidu.com', headers=headers, cookies=cookies)
print(r.text)

# --- Let a Session persist cookies across requests automatically ---
loginUrl = 'http://www.weibo.com/login'
s = requests.Session()
# First GET collects the session cookies the login POST will need.
r = s.get(loginUrl, allow_redirects=True)
datas = {
    'uname': ' [email protected]',
    'password': 'suhangshispz '}
r = s.post(loginUrl, data=datas, allow_redirects=True)
print(r.text)

重定向&历史信息

# Redirect history: requests follows redirects by default and keeps
# the intermediate responses in `.history`.
import requests

resp = requests.get('http://github.com')
print(resp.url)
print(resp.status_code)
print(resp.history)

Timeout设置

# Give up if no response arrives within 2 seconds.
import requests

resp = requests.get('http://github.com', timeout=2)
print(resp.content)

Proxy设置

# Route traffic through explicit HTTP/HTTPS proxies.
import requests

proxy_map = {
    'http': 'http://111.121.193.214',
    'https': 'http://121.201.33.100',
}
requests.get('http://example.org', proxies=proxy_map)

猜你喜欢

转载自blog.csdn.net/weixin_39777626/article/details/81563685