2.6.32 socket创建流程如下

先看总的函数调用流程
应用层 | socket层 | inet层
socket------->sys_socket---->sock_create->__sock_create--->inet_create

int socket(int domain, int type, int protocol)
domain:协议族 AF_INET,AF_UNIX等
type:指定socket类型,常用的socket类型有，SOCK_STREAM(字节流)、SOCK_DGRAM(数据报)、SOCK_RAW(原始套接口)等,
自Linux 2.6.27起,可以在type中通过按位或(|)附加SOCK_NONBLOCK、SOCK_CLOEXEC标志,来修改socket的行为
protocol:指定协议,如IPPROTO_TCP、IPPROTO_UDP、IPPROTO_IP(值为0,表示由内核根据type选择默认协议)
返回值: 成功返回一个非负(>=0)的文件描述符,失败返回-1并设置errno

/*
 * socket(2) entry point.
 *
 * @family:   protocol family, passed unchanged to sock_create()
 * @type:     socket type in the bits covered by SOCK_TYPE_MASK, optionally
 *            OR-ed with SOCK_CLOEXEC and/or SOCK_NONBLOCK
 * @protocol: protocol number, passed unchanged to sock_create()
 *
 * Returns the value produced by sock_map_fd() on success, or a negative
 * errno: -EINVAL when @type carries a flag bit other than SOCK_CLOEXEC /
 * SOCK_NONBLOCK, otherwise whatever sock_create() or sock_map_fd() reported.
 */
SYSCALL_DEFINE3(socket, int, family, int, type, int, protocol)
{
	struct socket *sock;
	int fd_flags;
	int sock_type;
	int ret;

	/*
	 * Compile-time sanity checks (BUILD_BUG_ON breaks the build when its
	 * condition is true): the SOCK_* flag values must line up with the
	 * O_* ones and must not overlap the bits reserved for the socket type.
	 */
	BUILD_BUG_ON(SOCK_CLOEXEC != O_CLOEXEC);
	BUILD_BUG_ON((SOCK_MAX | SOCK_TYPE_MASK) != SOCK_TYPE_MASK);
	BUILD_BUG_ON(SOCK_CLOEXEC & SOCK_TYPE_MASK);
	BUILD_BUG_ON(SOCK_NONBLOCK & SOCK_TYPE_MASK);

	/*
	 * @type is "socket type | optional flags".  Everything outside
	 * SOCK_TYPE_MASK is a flag; only SOCK_CLOEXEC and SOCK_NONBLOCK are
	 * accepted.
	 */
	fd_flags = type & ~SOCK_TYPE_MASK;
	if (fd_flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))
		return -EINVAL;

	/* The remaining bits are the plain socket type. */
	sock_type = type & SOCK_TYPE_MASK;

	/*
	 * Where SOCK_NONBLOCK and O_NONBLOCK have different values, translate
	 * the former into the latter before the flags reach sock_map_fd().
	 */
	if (SOCK_NONBLOCK != O_NONBLOCK && (fd_flags & SOCK_NONBLOCK)) {
		fd_flags &= ~SOCK_NONBLOCK;
		fd_flags |= O_NONBLOCK;
	}

	/* Build the struct socket. */
	ret = sock_create(family, sock_type, protocol, &sock);
	if (ret < 0)
		return ret;

	/*
	 * Attach the socket to a file descriptor so that it can later be
	 * looked up by fd.
	 */
	ret = sock_map_fd(sock, fd_flags & (O_CLOEXEC | O_NONBLOCK));
	if (ret < 0)
		sock_release(sock);	/* no descriptor: give the socket back */

	/* It may be already another descriptor 8) Not kernel problem. */
	return ret;
}

/*
 * sock_create - create a socket in the calling task's network namespace.
 *
 * Forwards to __sock_create() with the namespace taken from
 * current->nsproxy->net_ns and with the trailing "kern" argument set to 0.
 * Returns whatever __sock_create() returns; on success *res holds the new
 * socket.
 */
int sock_create(int family, int type, int protocol, struct socket **res)
{
	struct net *net = current->nsproxy->net_ns;

	return __sock_create(net, family, type, protocol, res, 0);
}

/*
 * __sock_create - allocate a struct socket and let the protocol family
 * initialise it.
 *
 * @net:      network namespace handed to the family's ->create hook
 * @family:   protocol family; used as the index into net_families[]
 * @type:     socket type; stored in sock->type
 * @protocol: protocol number; passed through to ->create
 * @res:      receives the new socket on success
 * @kern:     forwarded to the security hooks (sock_create() passes 0)
 *
 * Returns 0 on success or a negative errno: -EAFNOSUPPORT for an
 * out-of-range or unregistered family or when a module reference cannot be
 * taken, -EINVAL for an out-of-range type, -ENFILE when sock_alloc() fails,
 * or the error reported by the security hooks or by ->create.
 */
static int __sock_create(struct net *net, int family, int type, int protocol,
			                     struct socket **res, int kern)
{
	int err;
	struct socket *sock;
	const struct net_proto_family *pf;

	/* Reject protocol families outside the net_families[] index range. */
	if (family < 0 || family >= NPROTO)
		return -EAFNOSUPPORT;

	/* Reject socket types outside [0, SOCK_MAX). */
	if (type < 0 || type >= SOCK_MAX)
		return -EINVAL;

	/*
	 * Compatibility path for the obsolete socket(PF_INET, SOCK_PACKET, ...)
	 * call: the request is rewritten to the PF_PACKET family.
	 *
	 * Background from the original notes (not verifiable from this file):
	 * before 2.0 this form was used for link-layer access; the replacement
	 * is socket(PF_PACKET, type, ...) with type SOCK_DGRAM, SOCK_RAW or
	 * SOCK_PACKET, where SOCK_DGRAM strips the link-layer header and
	 * SOCK_RAW hands the application the complete frame.
	 */
	if (family == PF_INET && type == SOCK_PACKET) 
	{
		static int warned;
		if (!warned) 
		{
			warned = 1;
			/* Deprecation notice, guarded by the static flag above. */
			printk(KERN_INFO "%s uses obsolete (PF_INET,SOCK_PACKET)\n",current->comm);
		}
		family = PF_PACKET;	/* force the family to PF_PACKET */
	}

	/* Security-module (LSM) hook: may veto the creation. */
	err = security_socket_create(family, type, protocol, kern);
	if (err)
		return err;

	/*
	 * Obtain a new struct socket (per the original notes sock_alloc() also
	 * allocates the backing inode — confirm against sock_alloc()).
	 */
	sock = sock_alloc();
	if (!sock) {
		/*
		 * net_ratelimit() gates the message: print only when it
		 * returns non-zero.  Per the original notes (defined outside
		 * this file — confirm) it lets a short burst of messages
		 * through and then throttles to one every few seconds, so
		 * that an attacker cannot overload the system by provoking
		 * log output.
		 */
		if (net_ratelimit())
			printk(KERN_WARNING "socket: no more sockets\n");
		return -ENFILE;	/* Not exactly a match, but its the
				   closest posix thing */
	}
	sock->type = type;	/* remember the socket type */

#ifdef CONFIG_MODULES
	/* Attempt to load a protocol module if the find failed.
	 *
	 * 12/09/1996 Marcin: But! this makes REALLY only sense, if the user
	 * requested real, full-featured networking support upon configuration.
	 * Otherwise module support will break!
	 */
	if (net_families[family] == NULL)
		request_module("net-pf-%d", family);	/* ask for the family's module */
#endif

	rcu_read_lock();

	/*
	 * Read the family's registration under RCU.  Per the original notes,
	 * entries are installed by sock_register(); for AF_INET that is
	 * inet_init() -> sock_register(&inet_family_ops).
	 */
	pf = rcu_dereference(net_families[family]);
	err = -EAFNOSUPPORT;
	if (!pf)
		goto out_release;

	/*
	 * We will call the ->create function, that possibly is in a loadable
	 * module, so we have to bump that loadable module refcnt first.
	 */
	/* Failing to take the reference is reported as -EAFNOSUPPORT. */
	if (!try_module_get(pf->owner))
		goto out_release;

	/* Now protected by module ref count */
	rcu_read_unlock();

	/* Family-specific setup; for PF_INET the ->create hook is inet_create(). */
	err = pf->create(net, sock, protocol);
	if (err < 0)
		goto out_module_put;

	/*
	 * Now to bump the refcnt of the [loadable] module that owns this
	 * socket at sock_release time we decrement its refcnt.
	 */
	if (!try_module_get(sock->ops->owner))
		goto out_module_busy;

	/*
	 * Now that we're done with the ->create function, the [loadable]
	 * module can have its refcnt decremented
	 */
	/* Drops the reference taken on pf->owner before ->create. */
	module_put(pf->owner);

	/* Second security-module (LSM) hook, run on the initialised socket. */
	err = security_socket_post_create(sock, family, type, protocol, kern);
	if (err)
		goto out_sock_release;
	
	*res = sock;
	return 0;

	/*
	 * Error unwinding.  The labels fall through into one another:
	 * out_module_busy sets the error code, out_module_put clears
	 * sock->ops and drops the family module reference, out_sock_release
	 * releases the socket.
	 * NOTE(review): sock->ops is presumably cleared so that
	 * sock_release() does not go through the protocol's ops — confirm
	 * against sock_release().
	 */
out_module_busy:
	err = -EAFNOSUPPORT;
out_module_put:
	sock->ops = NULL;
	module_put(pf->owner);
out_sock_release:
	sock_release(sock);
	return err;

	/* Failure while still inside the RCU read-side section. */
out_release:
	rcu_read_unlock();
	goto out_sock_release;
}

static int inet_create(struct net *net, struct socket *sock, int protocol)
{
	struct sock *sk;
	struct inet_protosw *answer;
	struct inet_sock *inet;
	struct proto *answer_prot;
	unsigned char answer_flags;
	char answer_no_check;
	int try_loading_module = 0;
	int err;

    //inet_ehash_secret用于计算hash值将建立连接的sock加入到hash表tcp_hashinfo中
    //inet_lookup_established()->__inet_lookup_established()->inet_ehashfn()
	if (unlikely(!inet_ehash_secret))
		if (sock->type != SOCK_RAW && sock->type != SOCK_DGRAM)
			build_ehash_secret();//函数创建加密数据赋值给inet_ehash_secret

	//设置socket起始的状态为SS_UNCONNECTED
	sock->state = SS_UNCONNECTED;

	/* Look for the requested type/protocol pair. */
lookup_protocol:
	err = -ESOCKTNOSUPPORT;
	rcu_read_lock();
	
	 //inetsw的初始化是在inet_init 中将inetsw_array中的数据拷贝到inetsw中
	 //根据套接字类型找到对应的inet_protosw
	list_for_each_entry_rcu(answer, &inetsw[sock->type], list) {

        /*
        (1)判断传入的协议号,与socket类型需要的协议号是否一致,一致则判断传入的协议号是否为0,若不为0则找到inet_protosw结构
           若为0 则说明protocol和answer->protocol都为0,可以猜测上层使用了socket(AF_INET,SOCK_RAW,0)这样方式创建socket,这是错误的
        (2)若传入的协议号与socket类型需要的协议号不一致,若传入的协议号为0，则根据socket类型的需要来自己设置协议类型如
           socket(AF_INET,SOCK_STREAM,0),传入的协议号不为0而socket类型配对的answer->protocol为0,那么此socket类型为SOCK_RAW 
		*/ 	  
		err = 0;
		/* Check the non-wild match. */
		if (protocol == answer->protocol) {
			if (protocol != IPPROTO_IP)
				break; //找到了适配的inetsw[]元素 
		} 
		else 
		{   
		    /* Check for the two wild cases. */
			if (IPPROTO_IP == protocol) 
			{
				protocol = answer->protocol;
				break;
			}

			
			if (IPPROTO_IP == answer->protocol)
				break; //到此处说明是个SOCK_RAW类型
		}
		err = -EPROTONOSUPPORT;
	}
   //到这里answer指向了合适的inetsw结构，若是TCP协议，answer指向内容如下  
    /* 
    *   .type =       SOCK_STREAM, 
    *   .protocol =   IPPROTO_TCP, 
    *   .prot =       &tcp_prot, 
    *   .ops =        &inet_stream_ops, 
    *   .no_check =   0, 
    *   .flags =      INET_PROTOSW_PERMANENT | 
    *                 INET_PROTOSW_ICSK, 
    */  
	if (unlikely(err)) {//若找inet_protosw失败,则从外部进行加载对应模块,尝试加载两次，并且每次都重新查找
	                    //若还失败,则报错
		if (try_loading_module < 2) {
			rcu_read_unlock();
			/*
			 * Be more specific, e.g. net-pf-2-proto-132-type-1
			 * (net-pf-PF_INET-proto-IPPROTO_SCTP-type-SOCK_STREAM)
			 */
			if (++try_loading_module == 1)
				request_module("net-pf-%d-proto-%d-type-%d",
					       PF_INET, protocol, sock->type);
			/*
			 * Fall back to generic, e.g. net-pf-2-proto-132
			 * (net-pf-PF_INET-proto-IPPROTO_SCTP)
			 */
			else
				request_module("net-pf-%d-proto-%d",
					       PF_INET, protocol);
			goto lookup_protocol;
		} else
			goto out_rcu_unlock;
	}

	err = -EPERM;
	
	//检测是否有权能创建sock
	if (answer->capability > 0 && !capable(answer->capability))
		goto out_rcu_unlock;

	err = -EAFNOSUPPORT;
	//判断协议protocol是否支持网络命名空间
	if (!inet_netns_ok(net, protocol))
		goto out_rcu_unlock;

	sock->ops = answer->ops;//inet层操作集
	answer_prot = answer->prot;//传输层操作集
	answer_no_check = answer->no_check;//发送或接收报文时候是否需要校验和
	answer_flags = answer->flags;//标志如下
	/***********************************************
    #define INET_PROTOSW_REUSE     0x01  标识端口是否能被重用 
    #define INET_PROTOSW_PERMANENT 0x02  标识此协议不能被替换或卸载
    #define INET_PROTOSW_ICSK      0x04  标识是不是连接的套接口
	****************************************/
	
	//释放读锁
	rcu_read_unlock();

	WARN_ON(answer_prot->slab == NULL);

	err = -ENOBUFS;
	//分配sock结构体内存，这里在inet_init函数初始化好的高速缓冲区中分配内存，然后做一些初始化工作
	sk = sk_alloc(net, PF_INET, GFP_KERNEL, answer_prot);
	if (sk == NULL)
		goto out;

	err = 0;
	sk->sk_no_check = answer_no_check;//发送或接收报文时候是否需要校验和
	if (INET_PROTOSW_REUSE & answer_flags)
		sk->sk_reuse = 1; //端口可以重用

	//根据sk变量得到inet_sock变量的地址
	inet = inet_sk(sk);

	//sock扩展结构体是否含有inet_connection_sock结构体
	//如tcp的sock扩展结构:tcp_sock{inet_connection_sock{inet_sock{sock}}}
	//  udp的sock扩展结构:udp_sock{{inet_sock{sock}}},此sock扩展就不包含inet_connection_sock
	inet->is_icsk = (INET_PROTOSW_ICSK & answer_flags) != 0;
	
	if (SOCK_RAW == sock->type) 
	{
		inet->num = protocol;
		if (IPPROTO_RAW == protocol)//对于RAW的IPPROTO_ICMP类型protocol为1 所以inet->hdrincl=0
			inet->hdrincl = 1;
	}

	//路径MTU发现是用来确定到达目的地的路径中最大传输单元(MTU)的大小
	if (ipv4_config.no_pmtu_disc)//不进行pmtu发现
		inet->pmtudisc = IP_PMTUDISC_DONT;//系统不发送IP头带有DF标志的报文
	else
		inet->pmtudisc = IP_PMTUDISC_WANT;//策略根据路由中表项是否锁定了MTU，来决定是否设置DF位，如锁定，不设置DF位

	inet->id = 0;

    //对sk进一步初始化  并将sock和sk进行绑定
	sock_init_data(sock, sk);

	//进一步设置sk的其他属性信息
	sk->sk_destruct	   = inet_sock_destruct;
	sk->sk_protocol	   = protocol;
	sk->sk_backlog_rcv = sk->sk_prot->backlog_rcv;

	inet->uc_ttl	= -1;
	inet->mc_loop	= 1;
	inet->mc_ttl	= 1;
	inet->mc_all	= 1;
	inet->mc_index	= 0;
	inet->mc_list	= NULL;

    //增加对协议集合的引用计数
	sk_refcnt_debug_inc(sk);

	if (inet->num) {
		/* It assumes that any protocol which allows
		 * the user to assign a number at socket
		 * creation time automatically
		 * shares.
		 */
		inet->sport = htons(inet->num);
		/* Add to protocol hash chains. */
		sk->sk_prot->hash(sk);
	}

	//若设置过传输层初始化结构
	if (sk->sk_prot->init) 
	{
	    //对于tcpv4调用tcp_v4_init_sock函数进行进一步的初始化，由于在函数sk_alloc中一些属性被设置成0了，所以在此调用进行初始化
		err = sk->sk_prot->init(sk);
		if (err)//初始化失败的话释放分配的sock
			sk_common_release(sk);
	}
out:
	return err;
out_rcu_unlock:
	rcu_read_unlock();
	goto out;
}

2.6.32 socket创建流程如下

猜你喜欢