一、socket编程与mvc结构

一、socket的http套路

　　web应用本质上是一个socket服务端，用户的浏览器是一个socket客户端。socket处在应用层与传输层之间，是操作系统中I/O系统的延伸部分（接口），负责系统进程和应用之间的通信。

　　HTTP协议又称超文本传输协议。

1 //浏览器发送一个HTTP请求；
2 //服务器收到请求，根据请求信息，进行函数处理，生成一个HTML文档；
3 //服务器把HTML文档作为HTTP响应的Body发送给浏览器；
4 //浏览器收到HTTP响应，从HTTP Body取出HTML文档并显示；

View Code

　　1、客户端套路解析

 1 import socket, ssl
 2 # socket 是操作系统用来进行网络通信的底层方案，用来发送/接收数据
 3 
 4 s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
 5 # socket.AF_INET表示是ipv4协议，socket.SOCK_STREAM表示是tcp协议,这两个值是默认的。
 6 # s = socket.socket()
 7 # 上面只能连接http，如果连接https，需要用s = ssl.wrap_socket(socket.socket())
 8 
 9 host, port  = ("g.cn", 80)
10 s.connect((host, port))  # 连接主机，参数是主机的和端口
11 
12 ip, port = s.getsockname()
13 print('本机 ip 和 port {} {}'.format(ip, port))  # 查看本机的ip和端口
14 
15 # 构造一个http请求
16 http_request = 'GET / HTTP/1.1\r\nhost:{}\r\n\r\n'.format(host)
17 
18 # 发送HTTP请求给服务器
19 # send 函数只接收bytes作为参数，所以要重编码为utf-8。实际上,web数据传送都是utf-8编码的字节流数据。
20 
21 request = http_request.encode(encoding='utf_8', errors='strict')
22 print("发送请求", request)
23 s.send(request)
24 
25 
26 # 接收服务器的相应数据
27 response = s.recv(1024)   # buffer_size=1024，只接收1024个字节，多余的数据就不接收了。粘包。
28 
29 # 输出响应的数据，bytes类型
30 # print("响应", response)
31 # 再讲response的utf-8编码的字节流数据进行解码(实际上是转换成unicode编码的字符串，因为读进了内存)。
32 print("响应的 str 格式: ", end="\r\n")
33 print(response.decode(encoding='utf-8'))

View Code

　　2、服务器套路解析

 1 import socket
 2 
 3 host, port = '', 2000
 4 # 服务器的host为空字符串，表示接受任意ip地址的连接
 5 
 6 s  = socket.socket()
 7 s.bind((host, port))  # 监听
 8 
 9 # 用一个无线循环来接受数据
10 while True:
11     print("before listen")
12     s.listen(5)
13     
14     # 接收请求
15     connection, address = s.accept()
16     # connection是一个socket.socket对象
17     print("after listen")
18     
19     # 接收请求
20     request = connection.recv(1024)  # 只接收1024个字节
21     print("ip and request, {}\n{}".format(address, request.decode("utf-8")))
22     
23     # 构造响应
24     response = b"HTTP/1.1 200 OK\r\n\r\n<h1>Hello World!</h1>"
25     
26     # 用sendall发送响应数据
27     connection.sendall(response)
28     
29     # 关闭连接
30     connection.close()
31

View Code

　　在浏览器中输入localhost:2000，看到Hello World。后台打印的结果如下:

 1 before listen
 2 after listen
 3 ip and request, ('127.0.0.1', 54275)
 4 GET / HTTP/1.1
 5 Host: localhost:2000
 6 User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10.12; rv:59.0) Gecko/20100101 Firefox/59.0
 7 Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8
 8 Accept-Language: zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2
 9 Accept-Encoding: gzip, deflate
10 Cookie: username-localhost-8888="2|1:0|10:1524886551|23:username-localhost-8888|44:ZTc2ZjE3MjIxMTMwNDIxYzg3OTZmMDlkMDdhNzhjMjI=|b432650e1e450be30083dd567068aebd47dc8d5b7167b068610af60db4c5a35d"; _xsrf=2|fc2cd1bd|88c10f771944fea069e65d5e767b6621|1524882104
11 Connection: keep-alive
12 Upgrade-Insecure-Requests: 1

View Code

　　粘包处理

# Accumulate a request that may be larger than one recv() buffer.
# `connection` is the connected socket returned by accept() in the server code.
buffer_size = 1023
r = b''
while True:
    request = connection.recv(buffer_size)
    r += request
    # A read shorter than the buffer (including an empty read) is treated as
    # the end of the message.
    # NOTE(review): this heuristic blocks when the message length is an exact
    # multiple of buffer_size, and TCP may also deliver a short read in the
    # middle of a message - confirm it is acceptable for the intended use.
    if len(request) < buffer_size:
        break

View Code

　　3、HTTP请求内容解析

 1 """
 2 HTTP头
 3 http://localhost:2000/，浏览器默认会隐藏http://和末尾的/
 4 
 5 GET / HTTP/1.1
 6 # GET表示请求方式， / 表示请求的资源路径， HTTP/1.1 协议版本
 7 
 8 Host: localhost:2000
 9 # 主机地址和端口
10 
11 Connection: keep-alive
12 # keep-alive表示保持连接：同一个TCP连接可以被后续的HTTP请求复用，不必每次请求都重新建立TCP连接；close表示每次响应结束后就关闭连接
13 
14 User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10.12; rv:59.0) Gecko/20100101 Firefox/59.0
15 # 用户代理，浏览器标识。可以伪造浏览器。
16 
17 Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8
18 # 浏览器接收的数据类型。左边是数据类型，右边是解析时的权重(优先级)。
19 
20 Accept-Encoding: gzip, deflate
21 # 可解压的压缩文件类型；只表示能解压该类型的压缩文件，不代表拒绝接收其它类型的压缩文件。
22 
23 Accept-Language: zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2
24 # 支持的语言类型及其解析权重。
25 
26 Cookie: username-localhost-8888="2|1:0|10:1524886551|23:username-localhost-8888|44:ZTc2ZjE3MjIxMTMwNDIxYzg3OTZmMDlkMDdhNzhjMjI=|b432650e1e450be30083dd567068aebd47dc8d5b7167b068610af60db4c5a35d"; _xsrf=2|fc2cd1bd|88c10f771944fea069e65d5e767b6621|1524882104
27 # Cookie：浏览器保存、并在每次请求时带给服务器的键值对数据(常用于保存登录状态)，它不是浏览器缓存。
28 
29 /favicon.ico
30 # 这不是请求头：浏览器会另外发起一次对 /favicon.ico 的请求，用来获取网站图标(显示在标签页上)，非必须。
31 
32 # Header里可以添加任意的内容。
33 """

View Code

二、socket的udp和tcp套路

　　1、udp的客户端和服务端写法

　　客户端

 1 import socket
 2 # upd链接
 3 # SOCK_DGRAM:数据报套接字，主要用于UDP协议
 4 udpSocket = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
 5 
 6 # 关闭防火墙
 7 # 同一网段（局域网）下，主机的ip地址和端口号.
 8 sendAddr = ('192.168.10.247', 8080)
 9 
10 # 绑定端口:写的是自己的ip和固定的端口，一般是写在sever端.
11 udpSocket.bind(('', 9900))
12 
13 # sendData = bytes(input('请输入要发送的数据：'), 'gbk')
14 # gbk, utf8, str
15 sendData = input('请输入要发送的数据：').encode('gbk')
16 
17 # 使用udp发送数据，每一次发送都需要写上接收方的ip地址和端口号
18 udpSocket.sendto(sendData, sendAddr)
19 # udpSocket.sendto(b'hahahaha', ('192.168.10.247', 8080))
20 
21 udpSocket.close()

View Code

　　服务端

 1 import socket
 2 
 3 udpSocket = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
 4 
 5 # 接收方一般需要绑定端口
 6 # ''表示自己电脑的任何一个ip，即无线和有限同时连接或者电脑有不同的网卡（桥接），会有多个ip.
 7 # 绑定自己的端口
 8 bindAddr = ('', 7788)
 9 udpSocket.bind(bindAddr)
10 
11 recvData = udpSocket.recvfrom(1024)
12 # print(recvData)
13 print(recvData[0].decode('gbk'))
14 
15 udpSocket.close()
16 # recvData的格式：（data, ('ip', 端口)).它是一个元组，前面是数据，后面是一个包含ip和端口的元组.

View Code

　　2、tcp的客户端和服务端写法

　　客户端

 1 import socket
 2 
 3 tcpClient = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
 4 
 5 serverAddr = ('192.168.10.247', 8899)
 6 
 7 # tcp的三次握手，写进了这一句话
 8 tcpClient.connect(serverAddr)
 9 
10 sendData = input('')
11 
12 # 直接用send就行了，udp是用sendto
13 tcpClient.send(sendData.encode('gbk'))
14 
15 recvData = tcpClient.recv(1024)
16 
17 print('接收到的数据为：%s' % recvData.decode('gbk'))
18 
19 tcpClient.close()
20 
21 # 为什么用send而不是sendto？因为tcp连接是事先链接好了，后面就直接发就行了。前面的connect已经连接好了，后面直接用send发送即可。
22 # 而udp必须用sendto，是发一次数据，连接一次。必须要指定对方的ip和port。
23 # 相同的道理，在tcpServer端，要写recv，而不是recvfrom来接收数据

View Code

　　服务端

 1 import socket
 2 tcpServer = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
 3 tcpServer.bind(('', 8899))
 4 tcpServer.listen(5)
 5 
 6 # tcp的三次握手，写进了这一句话当中
 7 tcpClient, tcpClientInfo = tcpServer.accept()
 8 # tcpServer.accept()，不需要写ip,可以接收多个客户端的。但事先要绑定端口和接入的客户端的数量
 9 # client 表示接入的新的客户端
10 # clientInfo 表示接入的新的客户端的ip和端口port
11 
12 recvData = tcpClient.recv(1024)
13 print('%s: %s' % (str(tcpClientInfo), recvData.decode('gbk')))
14 
15 # tcp的四次握手，写进了这一句话
16 tcpClient.close()
17 tcpServer.close()
18 
19 # tcpServer.accept()：等待客户端的接入，自带堵塞功能：即必须接入客户端，然后往下执行
20 # tcpClient.recv(1024): 也是堵塞，不输入数据就一直等待，不往下执行.
21 # tcpServer创建了两个套接字，一个是Server，另一个是tcpClient.Server负责监听接入的Client,再为其创建专门的tcpClient进行通信.

View Code

　　3、服务端开启循环和多线程模式

　　开启循环

 1 import socket
 2 
 3 Server = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
 4 
 5 Server.bind(('', 9000))
 6 Server.listen(10)
 7 
 8 while True:
 9     # 如果有新的客户端来链接服务器，那么就产生一个新的套接字专门为这个客户端服务
10     serverThisClient, ClientInfo = Server.accept()
11     print('Waiting connect......')
12 
13     # 如果客户发送的数据是空的，那么断开连接
14     while True:
15         recvData = serverThisClient.recv(1024)
16         if len(recvData) > 1:
17 
18             print('recv:　%s' % recvData.decode('gbk'))
19 
20             sendData = input('send: ')
21             serverThisClient.send(sendData.encode('gbk'))
22         else:
23             print('再见！')
24             break
25     serverThisClient.close()

View Code

　　多线程写法

 1 from threading import Thread
 2 import socket
 3 # 收数据，然后打印
 4 def recvData():
 5     while True:
 6         recvInfo = udpSocket.recvfrom(1024)
 7         print('%s:%s' % (str(recvInfo[1]), recvInfo[0].decode('gbk')))
 8 
 9 # 检测键盘，发数据
10 def sendData():
11     while True:
12         sendInfo = input('')
13         udpSocket.sendto(sendInfo.encode('gbk'), (destIp, destPort))
14 
15 udpSocket = None
16 destIp = ''
17 destPort = 0
18 # 多线程
19 def main():
20 
21     global udpSocket
22     global destIp
23     global destPort
24 
25     destIp = input('对方的ip: ')
26     destPort = int(input('对方的端口：'))
27 
28     udpSocket = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
29     udpSocket.bind(('', 45678))
30 
31     tr = Thread(target=recvData)
32     ts = Thread(target=sendData)
33 
34     tr.start()
35     ts.start()
36 
37     tr.join()
38     ts.join()
39 if __name__ == '__main__':
40     main()

View Code

三、socket 获取 html

　　目标：获取https://movie.douban.com/top250整个网页。

　　过程：

　　　　1.构造GET请求，包括: https协议处理(需要ssl)，host为movie.douban.com，port默认为443，请求的路径(path)为/top250。

　　　　2.粘包接收字节流并进行解析，返回状态码、响应头、响应体。

　　实例代码：

  1 import socket, ssl
  2 
  3 
  4 """
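  5 https 请求的默认端口是 443，https 的 socket 连接需要 import ssl，并且用 TLS 包装 socket 来初始化：旧写法是 s = ssl.wrap_socket(socket.socket())（该函数已在 Python 3.12 中移除），新写法是 s = ssl.create_default_context().wrap_socket(socket.socket(), server_hostname=host)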
  6 
  7 HTTP 协议的 301 状态会在 HTTP 头的 Location 部分告诉你应该转向的 URL
  8 如果遇到 301, 就请求新地址并且返回
  9         HTTP/1.1 301 Moved Permanently
 10         ...
 11         Location: https://movie.douban.com/top250
 12 """
 13 
 14 
 15 def parse_url(url):
 16     protocol = 'http'
 17     
 18     if url[:7] == 'http://':
 19         u = url.split("://")[1]
 20     elif url[:8] == "https://":
 21         protocol = "https"
 22         u = url.split("://")[1]
 23     else:
 24         u = url
 25     
 26     # https://g.cn:1234/hello/world
 27     # 这里host就是g.cn:1234
 28     # 这里path就是/hello/world
 29     
 30     # 检查host和path
 31     i = u.find('/')
 32     if i == -1:
 33         host = u
 34         path = '/'
 35     else:
 36         host = u[:i]
 37         path = u[i:]
 38     
 39     # 检查端口
 40     port_dict = dict(
 41         http=80,
 42         https=443,
 43     )
 44     # 默认端口
 45     port = port_dict[protocol]
 46     if ":" in host:
 47         h = host.split(":")
 48         host = h[0]             # 到这里获取ip
 49         port = int(h[1])        # 获取端口
 50     return protocol, host, port, path
 51 
 52 
 53 def socket_by_protocol(protocol):
 54     """根据协议返回一个socket实例"""
 55     if protocol == 'http':
 56         s = socket.socket()
 57     else:
 58         # https协议要用到ssl
 59         s = ssl.wrap_socket(socket.socket())
 60     return s
 61         
 62 def response_by_socket(s): 
 63     """s是一个socket实例，返回这个socket读取的所有数据"""
 64     response = b''
 65     buffer_size = 1024
 66     while True:
 67         r = s.recv(buffer_size)
 68         if len(r) == 0:
 69             break
 70         response += r
 71     return response
 72 
 73 def parsed_response(r):
 74     """
 75     把response解析出 状态码 headers body 返回
 76     状态码是 int
 77     headers是 dict
 78     body是 str
 79     """
 80     header, body = r.split('\r\n\r\n', 1)     # split 1 只分割1次
 81     h = header.split('\r\n')
 82  
 83     # HTTP/1.1 200 OK
 84     status_code = h[0].split()[1]   # 空格切分，取中间的状态码
 85     status_code= int(status_code)
 86     
 87     headers = {}
 88     for line in h[1: ]:
 89         k, v = line.split(": ")
 90         headers[k] = v
 91     return status_code, headers, body
 92 
 93 def get(url):
 94     """用get请求url并返回响应"""
 95     protocol, host, port, path = parse_url(url)
 96     print(protocol, host, port, path)
 97     # 根据protocol确定socket方式
 98     s = socket_by_protocol(protocol)
 99     s.connect((host, port))
100     
101     # 构造请求，注意不要用keep-alive,因为服务器不会主动关闭连接
102     request = 'GET {} HTTP/1.1\r\nhost: {}\r\nconnection: close\r\n\r\n'.format(path, host)
103     
104     # 发送请求
105     s.send(request.encode(encoding='utf_8', errors='strict'))
106     
107     # 获取响应
108     response = response_by_socket(s)   # 接收所有的数据
109     r = response.decode('utf-8')
110     
111     status_code, headers, body = parsed_response(r)  # 解析状态码,请求头和请求体
112     
113     if status_code in [301, 302]:    # 301和302是重定向，要递归进行寻址
114         url = headers["Location"]
115         return get(url)
116     return status_code, headers, body    
117 
def main():
    """Fetch the Douban Top 250 page and print the status, headers and body."""
    url = 'https://movie.douban.com/top250'
    status_code, headers, body = get(url)
    print(status_code, headers, body)


if __name__ == '__main__':
    main()

View Code

　　缺陷：这里没有对html文档进行处理，包括html文档中所需内容(node节点)的解析和保存。可以使用bs4进一步处理。

　注意：request请求报文的格式必须严格正确（每一行以\r\n结尾，请求头结束后要再跟一个空行\r\n），否则服务器可能返回400错误，或者一直等待而不返回响应。

四、socket写web服务

　　这里的内容极其重要。一切web框架的本质是socket。Django和Flask的框架都是在下面最简功能上进行拆分解耦建立起来的。

 1 import socket, functools
 2 from docutils.parsers.rst.directives import encoding
 3 
 4 #  包裹print，禁止它的一些额外的功能
 5 def log(*args, **kwargs):
 6     print("LOG: ", *args, **kwargs)
 7 # 禁止函数的默认返回, func = lambda x: x, print = func(print)。这个在flask的werkzeug.local中用了几十次。
 8 
 9 
def route_index():
    """Handle the home page and return the complete HTTP response as bytes."""
    # The status line must carry a real protocol version; "HTTP/1.x" is not
    # valid, and the trailing space after the reason phrase is dropped too.
    header = 'HTTP/1.1 200 OK\r\nContent-Type: text/html\r\n'
    body = '<h1>Hello World.</h1><img src="dog.gif"/>'
    # A blank line separates the headers from the body.
    r = header + '\r\n' + body
    return r.encode(encoding='utf-8')
16 
def route_image():
    """Return an HTTP response whose body is ``dog.gif`` from the working directory.

    Raises ``FileNotFoundError`` if the file does not exist.
    """
    # "HTTP/1.x" is not a valid protocol version; use HTTP/1.1.
    header = b'HTTP/1.1 200 OK\r\nContent-Type: image/gif\r\n\r\n'
    # Read in binary mode: the image bytes go into the response unchanged.
    with open('dog.gif', mode='rb') as f:
        return header + f.read()
23 
def page(html):
    """Return the text of the file at path *html*, decoded as UTF-8."""
    with open(html, encoding="utf-8") as f:
        return f.read()
27     
28     
def route_msg():
    """Return an HTTP response whose body is the file ``html_basic.html``."""
    # "HTTP/1.x" is not a valid protocol version; use HTTP/1.1.
    header = 'HTTP/1.1 200 OK\r\nContent-Type: text/html\r\n'
    body = page("html_basic.html")
    # A blank line separates the headers from the body.
    r = header + '\r\n' + body
    return r.encode(encoding='utf-8')
35 
36 
def error(code=404):
    """Return the canned HTTP error response for *code* as bytes.

    Only 404 is defined; any other code yields ``b''``.
    """
    responses = {
        # "HTTP/1.x" is not a valid protocol version; use HTTP/1.1.
        404: b'HTTP/1.1 404 NOT FOUND\r\n\r\n<h1>Page Not Found</h1>',
    }
    return responses.get(code, b'')
42 
43 
def response_for_path(path):
    """Call the handler registered for *path* and return its response bytes.

    Any query string is ignored when choosing the handler, so ``/?message=hi``
    is served by the ``/`` handler. Paths without a handler get the 404
    response.
    """
    routes = {
        '/': route_index,
        '/dog.gif': route_image,
        '/msg': route_msg,
    }
    # Look up the handler by the part of the path before "?".
    route = path.split('?', 1)[0]
    # dict.get supplies the 404 handler when the route is unknown.
    handler = routes.get(route, error)
    return handler()
53     
def run(host="", port=3000):
    """Start the server on ``(host, port)`` and serve requests forever.

    Each connection is handled in turn: read up to 1024 bytes of the request,
    route on the request path, send the response and close the connection.
    Errors raised while handling one request are logged and do not stop the
    server.
    """
    # The with-block closes the listening socket, releasing the port, when
    # the program is interrupted.
    with socket.socket() as s:
        s.bind((host, port))
        # listen() is needed only once, before the accept loop.
        s.listen(5)

        while True:
            connection, address = s.accept()
            # Close the client connection on every path.
            with connection:
                try:
                    # Decoding happens inside the try so a request with
                    # invalid UTF-8 cannot take the server down.
                    request = connection.recv(1024).decode('utf-8', errors='replace')
                    log('ip and request, {}\n{}'.format(address, request))

                    # The request line is "METHOD PATH VERSION"; the path is
                    # the second field. An empty request raises IndexError
                    # here and is logged below.
                    path = request.split()[1]
                    # Build the response for this path and send all of it.
                    response = response_for_path(path)
                    connection.sendall(response)
                except Exception as e:
                    # Top-level boundary: log the error and keep serving.
                    log("error ", e)
77     
78 
def main():
    """Run the server with its configuration: all interfaces, port 4000."""
    config = dict(
        host='',
        port=4000,
    )
    run(**config)


if __name__ == '__main__':
    main()

View Code

　　用到的图片直接放在当前.py同一目录下即可。html也是同级目录，内容如下:

 1 <!DOCTYPE html>
 2 <!-- 注释是这样的, 不会被显示出来 -->
 3 <!--
 4     html 格式是浏览器使用的标准网页格式
 5     简而言之就是 标签套标签
 6 -->
 7 <!-- html 中是所有的内容 -->
 8 <html>
 9     <!-- head 中是放一些控制信息, 不会被显示 -->
10     <head>
11         <!-- meta charset 指定了页面编码, 否则中文会乱码 -->
12         <meta charset="utf-8">
13         <!-- title 是浏览器显示的页面标题 -->
14         <title>例子 1</title>
15     </head>
16     <!-- body 中是浏览器要显示的内容 -->
17     <body>
18         <!-- html 中的空格是会被转义的, 所以显示的和写的是不一样的 -->
19         <!-- 代码写了很多空格, 显示的时候就只有一个 -->
20         很         好普通版
21         <h1>很好 h1 版</h1>
22         <h2>很好 h2 版</h2>
23         <h3>很好 h3 版</h3>
24         <!-- form 是用来给服务器传递数据的 tag -->
25         <!-- action 属性是 path -->
26         <!-- method 属性是 HTTP方法 一般是 get 或者 post -->
27         <!-- get post 的区别上课会讲 -->
28         <form action="/" method="get">
29             <!-- textarea 是一个文本域 -->
30             <!-- name rows cols 都是属性 -->
31             <textarea name="message" rows="8" cols="40"></textarea>
32             <!-- button type=submit 才可以提交表单 -->
33             <button type="submit">GET 提交</button>
34         </form>
35         <form action="/" method="post">
36             <textarea name="message" rows="8" cols="40"></textarea>
37             <button type="submit">POST 提交</button>
38         </form>
39     </body>
40 </html>

View Code

　运行上述py程序，访问localhost:4000/msg，在get和post输入框里分别输入内容并点击提交。可以看到，get请求的参数包含在请求行的URL查询字符串里(例如 GET /?message=hello HTTP/1.1)，post请求的参数包含在请求体里。这需要分别处理并获取参数。

五、从socket到web框架

　　有了上述内容，整个url请求的处理流程如下图所示。