Linux内核Socket实现之------Socket创建(2) 文件描述符

转载请注明:http://blog.chinaunix.net/uid-20788636-id-4408276.html

1.2 sock_map_fd函数

在用户空间创建了一个socket后,返回值是一个文件描述符,下面分析一下创建socket时怎么和文件描述符联系的。在SYSCALL_DEFINE3(socket, int, family, int, type, int, protocol)最后调用sock_map_fd进行关联,其中返回的retval就是用户空间获取的文件描述符fd,sock就是调用sock_create创建成功的socket.

sock_map_fd()主要用于对socket的*file指针初始化,经过sock_map_fd()操作后,socket就通过其*file指针与VFS管理的文件进行了关联,便可以进行文件的各种操作,如read、write、lseek、ioctl等.

retval = sock_map_fd(sock, flags & (O_CLOEXEC | O_NONBLOCK));

/*
 * Bind a freshly created socket to a new file descriptor.
 *
 * Reserves an unused descriptor according to @flags, wraps @sock in a
 * struct file via sock_alloc_file(), and installs that file in the
 * reserved slot.
 *
 * Returns the descriptor on success. On failure returns either the
 * negative value from get_unused_fd_flags(), or PTR_ERR() of the failed
 * file allocation after the reserved descriptor has been given back.
 */
static int sock_map_fd(struct socket *sock, int flags)
{
	struct file *sock_file;
	int fd;

	/* Reserve a free descriptor according to flags (see section 1.2.1). */
	fd = get_unused_fd_flags(flags);
	if (unlikely(fd < 0))
		return fd;

	sock_file = sock_alloc_file(sock, flags, NULL);
	if (unlikely(IS_ERR(sock_file))) {
		/* Nothing to install: hand the reserved slot back. */
		put_unused_fd(fd);
		return PTR_ERR(sock_file);
	}

	fd_install(fd, sock_file);
	return fd;
}

1.2.1   get_unused_fd_flags函数

get_unused_fd_flags()函数调用__alloc_fd分配一个新的可用的fd

/*
 * Allocate a free file descriptor in @files.
 *
 * @files: descriptor table owner to allocate from
 * @start: lowest descriptor number to consider
 * @end:   exclusive upper bound; a candidate >= end fails with -EMFILE
 * @flags: if O_CLOEXEC is set the new descriptor is marked close-on-exec,
 *         otherwise that mark is cleared
 *
 * The whole search runs under files->file_lock. Returns the allocated
 * descriptor, -EMFILE, or the negative error from expand_files().
 */
int __alloc_fd(struct files_struct *files,

unsigned start, unsigned end, unsigned flags)

{

unsigned int fd;

int error;

struct fdtable *fdt;

spin_lock(&files->file_lock);

repeat:

/* Fetch the file descriptor table of files. */

fdt = files_fdtable(files);

fd = start;// begin at start; on this call path start is 0

/* files->next_fd is the next free descriptor candidate remembered from the previous search, which speeds up the lookup; if fd is below files->next_fd, start directly from next_fd */

if (fd < files->next_fd)

fd = files->next_fd;

/* While fd is below the table's current capacity (max_fds), scan the open_fds bitmap from bit fd for the next zero bit, i.e. the next free descriptor. */

if (fd < fdt->max_fds)

fd = find_next_zero_bit(fdt->open_fds, fdt->max_fds, fd);

/*

* N.B. For clone tasks sharing a files structure, this test

* will limit the total number of files that can be opened.

*/

error = -EMFILE;

if (fd >= end)

goto out;

/* Grow the descriptor table if necessary */

error = expand_files(files, fd);

if (error < 0)

goto out;

/*

* If we needed to expand the fs array we

* might have blocked - try again.

*/

if (error)

goto repeat;

/*

Update next_fd to speed up the next search for a free fd.

When start is greater than next_fd, next_fd is left unchanged so that descriptor allocation does not become discontiguous.

*/

if (start <= files->next_fd)

files->next_fd = fd + 1;

/* Mark fd as in use in the open-descriptor bitmap */

__set_open_fd(fd, fdt);

if (flags & O_CLOEXEC)

__set_close_on_exec(fd, fdt);

else

__clear_close_on_exec(fd, fdt);

error = fd;

#if 1

/* Sanity check */

if (rcu_dereference_raw(fdt->fd[fd]) != NULL) {

printk(KERN_WARNING "alloc_fd: slot %d not NULL!\n", fd);

rcu_assign_pointer(fdt->fd[fd], NULL);

}

#endif

out:

spin_unlock(&files->file_lock);

return error;

}

1.2.2 sock_alloc_file函数

struct file *sock_alloc_file(struct socket *sock, int flags, const char *dname)

{

struct qstr name = { .name = "" };

struct path path;

struct file *file;

if (dname) {//这里的dname为空

name.name = dname;

name.len = strlen(name.name);

} else if (sock->sk) {

/*这里的name应该是TCP 根据struct proto tcp_prot */

name.name = sock->sk->sk_prot_creator->name;

name.len = strlen(name.name);

}

/*申请一个新的dentry,其中sock_mnt->mnt_sb在前面已经分析过了,是一个sock_fs_type文件系统挂载点,*/

path.dentry = d_alloc_pseudo(sock_mnt->mnt_sb, &name);

if (unlikely(!path.dentry))

return ERR_PTR(-ENOMEM);

path.mnt = mntget(sock_mnt);

/*将文件操作的函数绑定到inode,对于dentry是在sockfs_mount函数中sockfs_dentry_operations,该函数在sock_init是调用,在前面有分析 */

d_instantiate(path.dentry, SOCK_INODE(sock));

SOCK_INODE(sock)->i_fop = &socket_file_ops;

/*申请新的file,将path,file,关联起来*/

file = alloc_file(&path, FMODE_READ | FMODE_WRITE,

&socket_file_ops);

if (unlikely(IS_ERR(file))) {

/* drop dentry, keep inode */

ihold(path.dentry->d_inode);

path_put(&path);

return file;

}

sock->file = file;//sock->file和刚分配的file关联起来

file->f_flags = O_RDWR | (flags & O_NONBLOCK);//设置file的标志

file->private_data = sock;//file的私有数据指针指向sock.

return file;

}
Linux内核Socket实现之------Socket创建(2) 文件描述符

Socket创建流程图

附录:对于sk_alloc分配的内存大小问题分析

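在分析中经常看到 inet = inet_sk(sk); 这种类型的强制转换,其中inet被定义为 struct inet_sock *inet;。从结构体的定义来看,struct sock的大小小于struct inet_sock,按理说把struct sock *直接强制转换成struct inet_sock *是不安全的;但在实际分配时,并不是按sizeof(struct sock)分配,而是按prot->obj_size分配(对TCP而言就是sizeof(struct tcp_sock)),该结构足够大,因此这种转换是安全的。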

/*
 * Allocate and minimally initialise a struct sock.
 *
 * @net:      network namespace the socket belongs to
 * @family:   protocol family stored in sk->sk_family
 * @priority: allocation flags; __GFP_ZERO is always added
 * @prot:     protocol descriptor; it decides how much memory
 *            sk_prot_alloc() hands out
 *
 * Returns the new sock, or NULL if sk_prot_alloc() fails.
 */
struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
		      struct proto *prot)
{
	struct sock *new_sk;

	new_sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
	if (!new_sk)
		return NULL;

	new_sk->sk_family = family;
	/*
	 * See comment in struct sock definition to understand
	 * why we need sk_prot_creator -acme
	 */
	new_sk->sk_prot_creator = prot;
	new_sk->sk_prot = prot;
	sock_lock_init(new_sk);
	sock_net_set(new_sk, get_net(net));
	atomic_set(&new_sk->sk_wmem_alloc, 1);

	sock_update_classid(new_sk);
	sock_update_netprioidx(new_sk);

	return new_sk;
}

/*
 * Allocate the memory for a sock described by @prot.
 *
 * @prot:     protocol descriptor; prot->slab and prot->obj_size decide
 *            where the memory comes from and how large the object is
 * @priority: allocation flags
 * @family:   protocol family, passed to security_sk_alloc()
 *
 * Two allocation paths exist, referred to as (1) and (2) in the text
 * that follows this function:
 *   (1) prot->slab is set: allocate from that slab cache;
 *   (2) otherwise: plain kmalloc() of prot->obj_size bytes.
 *
 * Returns the new sock, or NULL on allocation failure, on
 * security_sk_alloc() failure, or if the owning module cannot be pinned.
 */
static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
		int family)
{
	struct sock *sk;
	struct kmem_cache *slab;

	slab = prot->slab;
	if (slab != NULL) {
		/*
		 * (1) Slab cache path. __GFP_ZERO is masked off for the
		 * allocator; when the caller asked for it, the object is
		 * cleared below by prot->clear_sk() or sk_prot_clear_nulls().
		 */
		sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
		if (!sk)
			return sk;
		if (priority & __GFP_ZERO) {
			if (prot->clear_sk)
				prot->clear_sk(sk, prot->obj_size);
			else
				sk_prot_clear_nulls(sk, prot->obj_size);
		}
	} else {
		/* (2) Plain allocation path, sized by prot->obj_size. */
		sk = kmalloc(prot->obj_size, priority);
	}

	if (sk != NULL) {
		kmemcheck_annotate_bitfield(sk, flags);

		if (security_sk_alloc(sk, family, priority))
			goto out_free;

		if (!try_module_get(prot->owner))
			goto out_free_sec;
		sk_tx_queue_clear(sk);
	}

	return sk;

out_free_sec:
	/* Undo security_sk_alloc() before releasing the memory. */
	security_sk_free(sk);
out_free:
	/* Free through the same allocator the object came from. */
	if (slab != NULL)
		kmem_cache_free(slab, sk);
	else
		kfree(sk);
	return NULL;
}

(1)第一种情况:sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO),这里的slab来自slab = prot->slab;,而prot就是函数参数传递过来的struct proto *prot。再看一下这个结构体是从哪里来的:在inet_create函数中有sk = sk_alloc(net, PF_INET, GFP_KERNEL, answer_prot);,这里的answer_prot来自answer_prot = answer->prot;。再看一下answer->prot是如何来的?

在inet_create函数中通过遍历inetsw[sock->type]链表获取到struct inet_protosw *answer;

/*
 * Excerpt from inet_create(): walk the protocol switch list for this
 * socket type and stop at the first entry that matches `protocol`.
 * err is reset to 0 at the top of every iteration and set to
 * -EPROTONOSUPPORT at the bottom, so a `break` leaves err == 0 while
 * running off the end of a non-empty list leaves -EPROTONOSUPPORT.
 */
list_for_each_entry_rcu(answer, &inetsw[sock->type], list) {

err = 0;

/* Check the non-wild match. */

if (protocol == answer->protocol) {

/* An exact match only ends the search when the request is not the IPPROTO_IP wildcard. */

if (protocol != IPPROTO_IP)

break;

} else {

/* Check for the two wild cases. */

/* Caller asked for the wildcard: adopt this entry's protocol. */

if (IPPROTO_IP == protocol) {

protocol = answer->protocol;

break;

}

/* The entry itself is a wildcard: it accepts the requested protocol. */

if (IPPROTO_IP == answer->protocol)

break;

}

err = -EPROTONOSUPPORT;

}

其中inetsw各链表中的表项来自下面的inetsw_array数组;如果是SOCK_STREAM类型的socket,这里的prot = &tcp_prot

/*
 * Built-in protocol switch entries. Each entry ties a socket type and
 * protocol number to the struct proto (.prot) and the socket-level
 * operations (.ops) used for it. inet_init() registers every entry
 * with inet_register_protosw().
 */
static struct inet_protosw inetsw_array[] =

{

/* SOCK_STREAM / IPPROTO_TCP: uses tcp_prot */

{

.type =       SOCK_STREAM,

.protocol =   IPPROTO_TCP,

.prot =       &tcp_prot,

.ops =        &inet_stream_ops,

.no_check =   0,

.flags =      INET_PROTOSW_PERMANENT |

INET_PROTOSW_ICSK,

},

/* SOCK_DGRAM / IPPROTO_UDP: uses udp_prot */

{

.type =       SOCK_DGRAM,

.protocol =   IPPROTO_UDP,

.prot =       &udp_prot,

.ops =        &inet_dgram_ops,

.no_check =   UDP_CSUM_DEFAULT,

.flags =      INET_PROTOSW_PERMANENT,

},

/* SOCK_DGRAM / IPPROTO_ICMP: uses ping_prot */

{

.type =       SOCK_DGRAM,

.protocol =   IPPROTO_ICMP,

.prot =       &ping_prot,

.ops =        &inet_dgram_ops,

.no_check =   UDP_CSUM_DEFAULT,

.flags =      INET_PROTOSW_REUSE,

},

/* SOCK_RAW with the IPPROTO_IP wildcard: uses raw_prot */

{

.type =       SOCK_RAW,

.protocol =   IPPROTO_IP,       /* wild card */

.prot =       &raw_prot,

.ops =        &inet_sockraw_ops,

.no_check =   UDP_CSUM_DEFAULT,

.flags =      INET_PROTOSW_REUSE,

}

};

再看一下

/*
 * Protocol descriptor for TCP. For the allocation-size discussion the
 * relevant fields are .name, .obj_size and .slab_flags, all of which
 * proto_register() passes to kmem_cache_create().
 */
struct proto tcp_prot = {

/* Used as the slab cache name, and as the dentry name in sock_alloc_file() */

.name                         = "TCP",

.owner                        = THIS_MODULE,

.close                          = tcp_close,

.connect            = tcp_v4_connect,

.disconnect                = tcp_disconnect,

.accept                       = inet_csk_accept,

.ioctl                            = tcp_ioctl,

.init                     = tcp_v4_init_sock,

.destroy            = tcp_v4_destroy_sock,

.shutdown                 = tcp_shutdown,

.setsockopt               = tcp_setsockopt,

.getsockopt               = tcp_getsockopt,

.recvmsg           = tcp_recvmsg,

.sendmsg                   = tcp_sendmsg,

.sendpage                  = tcp_sendpage,

.backlog_rcv              = tcp_v4_do_rcv,

.release_cb               = tcp_release_cb,

.mtu_reduced          = tcp_v4_mtu_reduced,

.hash                           = inet_hash,

.unhash                      = inet_unhash,

.get_port          = inet_csk_get_port,

.enter_memory_pressure       = tcp_enter_memory_pressure,

.stream_memory_free    = tcp_stream_memory_free,

.sockets_allocated  = &tcp_sockets_allocated,

.orphan_count                   = &tcp_orphan_count,

.memory_allocated = &tcp_memory_allocated,

.memory_pressure = &tcp_memory_pressure,

.sysctl_mem             = sysctl_tcp_mem,

.sysctl_wmem          = sysctl_tcp_wmem,

.sysctl_rmem            = sysctl_tcp_rmem,

.max_header            = MAX_TCP_HEADER,

/* Size of every object allocated for this protocol: a full struct tcp_sock */

.obj_size           = sizeof(struct tcp_sock),

/* Extra flags OR-ed into the slab cache flags by proto_register() */

.slab_flags                 = SLAB_DESTROY_BY_RCU,

.twsk_prot                 = &tcp_timewait_sock_ops,

.rsk_prot           = &tcp_request_sock_ops,

.h.hashinfo                = &tcp_hashinfo,

.no_autobind            = true,

#ifdef CONFIG_COMPAT

.compat_setsockopt        = compat_tcp_setsockopt,

.compat_getsockopt        = compat_tcp_getsockopt,

#endif

#ifdef CONFIG_MEMCG_KMEM

.init_cgroup               = tcp_init_cgroup,

.destroy_cgroup                = tcp_destroy_cgroup,

.proto_cgroup          = tcp_proto_cgroup,

#endif

};

tcp_prot的注册是在af_inet.c文件中的inet_init函数中完成的:

/*
 * Initialise the IPv4 protocol family.
 *
 * Registers the tcp/udp/raw/ping protos (each with a slab cache),
 * registers the inet socket family, adds the base protocols, fills the
 * inetsw lists from inetsw_array, and then brings up the ARP, IP, TCP,
 * UDP, UDP-Lite, ping and ICMP layers and related subsystems.
 *
 * Returns 0 on success. If a proto_register() call fails its error is
 * returned after unregistering the protos registered so far and freeing
 * sysctl_local_reserved_ports.
 * NOTE(review): a kzalloc() failure returns the initial rc, -EINVAL.
 */
static int __init inet_init(void)

{

struct inet_protosw *q;

struct list_head *r;

int rc = -EINVAL;

BUILD_BUG_ON(sizeof(struct inet_skb_parm) > FIELD_SIZEOF(struct sk_buff, cb));

sysctl_local_reserved_ports = kzalloc(65536 / 8, GFP_KERNEL);

if (!sysctl_local_reserved_ports)

goto out;

// Registers tcp_prot; with alloc_slab = 1, proto_register() creates tcp_prot's slab cache (prot->slab)

rc = proto_register(&tcp_prot, 1);

if (rc)

goto out_free_reserved_ports;

rc = proto_register(&udp_prot, 1);

if (rc)

goto out_unregister_tcp_proto;

rc = proto_register(&raw_prot, 1);

if (rc)

goto out_unregister_udp_proto;

rc = proto_register(&ping_prot, 1);

if (rc)

goto out_unregister_raw_proto;

/*

*     Tell SOCKET that we are alive...

*/

(void)sock_register(&inet_family_ops);

#ifdef CONFIG_SYSCTL

ip_static_sysctl_init();

#endif

/*

*     Add all the base protocols.

*/

if (inet_add_protocol(&icmp_protocol, IPPROTO_ICMP) < 0)

pr_crit("%s: Cannot add ICMP protocol\n", __func__);

if (inet_add_protocol(&udp_protocol, IPPROTO_UDP) < 0)

pr_crit("%s: Cannot add UDP protocol\n", __func__);

if (inet_add_protocol(&tcp_protocol, IPPROTO_TCP) < 0)

pr_crit("%s: Cannot add TCP protocol\n", __func__);

#ifdef CONFIG_IP_MULTICAST

if (inet_add_protocol(&igmp_protocol, IPPROTO_IGMP) < 0)

pr_crit("%s: Cannot add IGMP protocol\n", __func__);

#endif

/* Register the socket-side information for inet_create. Initialise every inetsw list head. */

for (r = &inetsw[0]; r < &inetsw[SOCK_MAX]; ++r)

INIT_LIST_HEAD(r);

/* Add each inetsw_array entry to the corresponding inetsw list so that inet_create can walk it */

for (q = inetsw_array; q < &inetsw_array[INETSW_ARRAY_LEN]; ++q)

inet_register_protosw(q);

/*

*     Set the ARP module up

*/

arp_init();

/*

*     Set the IP module up

*/

ip_init();

tcp_v4_init();

/* Setup TCP slab cache for open requests. */

tcp_init();

/* Setup UDP memory threshold */

udp_init();

/* Add UDP-Lite (RFC 3828) */

udplite4_register();

ping_init();

/*

*     Set the ICMP layer up

*/

if (icmp_init() < 0)

panic("Failed to create the ICMP control socket.\n");

/*

*     Initialise the multicast router

*/

#if defined(CONFIG_IP_MROUTE)

if (ip_mr_init())

pr_crit("%s: Cannot init ipv4 mroute\n", __func__);

#endif

/*

*     Initialise per-cpu ipv4 mibs

*/

if (init_ipv4_mibs())

pr_crit("%s: Cannot init ipv4 mibs\n", __func__);

ipv4_proc_init();

ipfrag_init();

dev_add_pack(&ip_packet_type);

rc = 0;

out:

return rc;

/* Error unwinding: each label undoes one earlier step and falls through to the next. */

out_unregister_raw_proto:

proto_unregister(&raw_prot);

out_unregister_udp_proto:

proto_unregister(&udp_prot);

out_unregister_tcp_proto:

proto_unregister(&tcp_prot);

out_free_reserved_ports:

kfree(sysctl_local_reserved_ports);

goto out;

}

在proto_register函数中,主要关注对prot->slab的初始化。

/*
 * Excerpt of proto_register() (the remainder is elided by the article).
 * When @alloc_slab is non-zero, a slab cache named prot->name with
 * objects of prot->obj_size bytes is created and stored in prot->slab.
 */
int proto_register(struct proto *prot, int alloc_slab)

{

if (alloc_slab) {

prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0,

SLAB_HWCACHE_ALIGN | prot->slab_flags,

NULL);// for tcp_prot, prot->obj_size is .obj_size = sizeof(struct tcp_sock)

if (prot->slab == NULL) {

pr_crit("%s: Can't create sock SLAB cache!\n",

prot->name);

goto out;

}

……………………..

}

(2)对于第二种情况,即sk = kmalloc(prot->obj_size, priority);,主要看prot->obj_size,它就是struct proto tcp_prot中初始化的.obj_size = sizeof(struct tcp_sock)。

下面是五个相关的数据结构,其中tcp_sock结构体占用的空间是最大的,并且较小的结构体依次作为较大结构体的第一个成员层层嵌套;对于TCP socket,分配内存空间时分配的是tcp_sock的大小,这样在后面进行强制转换的过程中可以保证正确。

Linux内核Socket实现之------Socket创建(2) 文件描述符

上一篇:Hibernate总结1(入门)


下一篇:App界面交互设计规范(转)