python requests模块 模拟请求的响应内容乱码问题(源码分析)

def request(url, data=None, get_or_post=None):
    """Send an HTTP request and return the body as correctly decoded text.

    Why the body is decoded by hand instead of returning ``response.text``:
    requests derives ``response.encoding`` from the ``Content-Type`` response
    header only (``HTTPAdapter.build_response`` calls
    ``get_encoding_from_headers``).  If the header carries a ``charset``
    parameter, that value is used; if it does not but the media type contains
    ``text``, requests falls back to ``ISO-8859-1``.  ``response.text`` then
    decodes ``response.content`` with that encoding and ``errors='replace'``.

    The JD search page (observed 2019-03-06) answers with a bare
    ``Content-Type: text/html`` while its body is UTF-8: ``response.encoding``
    is ``ISO-8859-1``, ``response.apparent_encoding`` is ``utf-8`` and the
    ``<meta>`` charset in the HTML is ``utf-8``.  Decoding UTF-8 bytes as
    ISO-8859-1 produces garbled text, so the raw bytes are decoded here with
    the encoding detected from the body itself.

    Args:
        url: Target URL.
        data: Optional request parameters (e.g. a dict).  Sent as the form
            body for POST, or URL-encoded into the query string for GET.
        get_or_post: Truthy to send a POST; falsy (the default) to send a GET.

    Returns:
        The decoded response body when the status code is 200, otherwise
        ``None``.  ``None`` is also returned, after printing a message, when
        requests raises ``RequestException``.
    """
    try:
        if get_or_post:
            response = requests.post(url=url, data=data, headers=headers)
        else:
            if data:
                # Choose the joiner from the URL itself so callers may pass a
                # URL with or without a trailing '?' (or an existing query).
                if url.endswith(('?', '&')):
                    joiner = ''
                elif '?' in url:
                    joiner = '&'
                else:
                    joiner = '?'
                url = url + joiner + urlencode(data)
            response = requests.get(url=url, headers=headers)

        if response.status_code != 200:
            return None

        # Decode the raw bytes directly rather than round-tripping
        # response.text through encode()/decode(): the round trip fails with
        # TypeError when response.encoding is None (no Content-Type header)
        # and can raise Unicode errors when the declared and detected codecs
        # disagree.
        # NOTE(review): apparent_encoding can be None when detection yields
        # nothing (e.g. an empty body), hence the fallbacks -- confirm against
        # the installed requests/chardet version.
        encoding = response.apparent_encoding or response.encoding or 'utf-8'
        try:
            return response.content.decode(encoding, errors='replace')
        except LookupError:
            # The detected name is not a codec Python knows about.
            return response.content.decode('utf-8', errors='replace')
    except RequestException:
        print('请求' + url + '出错')
        return None
def search(keyword, page):
    """Fetch one page of JD search results for a keyword.

    Args:
        keyword: Search term sent as the ``keyword`` query parameter.
        page: Page number sent as the ``page`` query parameter.

    Returns:
        Whatever ``request`` returns for the search URL: the decoded HTML on
        success, or ``None`` on failure.
    """
    # The trailing '?' lets request() append the URL-encoded query directly.
    url = "https://search.jd.com/Search?"
    data = {
        "keyword": keyword,
        "enc": "utf-8",
        "page": page,
    }
    return request(url, data)
# Example run: fetch page 2 of the search results for the keyword '显卡'
# (graphics card). This performs a network request when the module is executed.
html = search('显卡', 2)

猜你喜欢

转载自www.cnblogs.com/xyz2b/p/10512887.html