2010/6/18:还在分析linux/net代码中,觉得在2.6的内核用来分析代码流程的确很麻烦。原理的东西一般都很好理解,有的技术PPT可能几页就能讲得很清楚了。比如GRE,理论上就是圆环套圆环(记得好像是哪部电影?),可是分析2.6的代码就麻烦得很。还要理解tunnel。
所以突然想到,跟踪linux/net功能的发展历程来分析,应该还不错。
于是,先下载了0.96c的代码,发现这个版本的代码还只支持本地socket(AF_UNIX),还不算真正意义上的net。
找来找去,搞了一个0.99.15。还算完善些。
这个版本支持INET,也就是IPV4。所以有值得分析的部分。
先看看吧,有了心得再写下来。希望借这个版本能够把路由策略(route.c)和TCP协议(tcp.c)很好地掌握一下。
1. 路由策略
在ip报文的发送和转发过程中,都涉及到路由的问题。在ip_build_header和ip_forward中,都调用了rt_route()。rt_route()这个函数根据传入的目的ip,返回一个rtable结构。这个结构就是我们经常看到的路由表,route这个命令能够查看到。
/* A single routing-table entry (Linux 0.99.15, net/inet/route.c). */
struct rtable {
struct rtable *rt_next;//next entry: the table is a plain singly linked list -- no hashing, the simplest possible implementation
unsigned long rt_dst; //destination address
unsigned long rt_mask; //destination netmask
unsigned long rt_gateway;//gateway toward the destination
unsigned char rt_flags; //what kind of next hop this is
unsigned char rt_metric;//can be understood as the route cost
short rt_refcnt;//reference count
unsigned long rt_use;//usage counter -- presumably the number of lookup hits (rt_route increments it)
unsigned short rt_mss, rt_mtu;//per-route MSS/MTU -- inferred from the names, TODO confirm against route.c
struct device *rt_dev;//device for this route, i.e. which interface packets leave from
};
再看一下rt_route的实现,简直是爽死了。就这么几行就搞定了2.6内核那一坨东西,不过功能肯定是不及:)
/*
 * Look up the routing-table entry for destination daddr.
 * Returns the matching rtable entry (bumping its use count), or NULL
 * when no route exists.  'opt' is accepted but unused here.
 */
struct rtable * rt_route(unsigned long daddr, struct options *opt)
{
    struct rtable *rt;

    /* Walk the (singly linked) routing table.
     * NOTE(review): the quoted text had "rt != NULL || early_out" --
     * 'early_out' is an undeclared garbled token; the condition is
     * simply rt != NULL in the original source. */
    for (rt = rt_base; rt != NULL; rt = rt->rt_next) {
        /* Exact match or same subnet: (dst ^ daddr) has bits set only
         * where the addresses differ; masking with rt_mask ignores the
         * host part.  This is where the netmask earns its keep. */
        if (!((rt->rt_dst ^ daddr) & rt->rt_mask))
            break;
        /* broadcast addresses can be special cases.. */
        /* A directed broadcast to this entry's device also matches.
         * With several broadcast-capable interfaces only the first
         * matching entry is ever used, because we break here. */
        if ((rt->rt_dev->flags & IFF_BROADCAST) &&
            rt->rt_dev->pa_brdaddr == daddr)
            break;
    }

    /* No entry matched: no route to daddr.  (Without this check the
     * dereference below would crash on an empty/unmatched table.) */
    if (rt == NULL)
        goto no_route;

    /* Packets addressed to one of our own interfaces are rerouted
     * through the loopback entry, if one exists. */
    if (daddr == rt->rt_dev->pa_addr) {
        if ((rt = rt_loopback) == NULL)
            goto no_route;
    }
    rt->rt_use++; /* usage (hit) counter */
    return rt;
no_route:
    return NULL;
}
由此看来,早期内核对网络的支持非常简单,用来分析网络的实现也很容易。再看看路由表项是怎么添加进去的。rt_add()负责向路由表中添加表项,而调用rt_add的地方有rt_ioctl和icmp.c中icmp路由重定向的支持。
也就是说,添加路由表项有两种方式:
a) inet_ioctl/rt_ioctl用来添加删除。
b) icmp协议的重定向报文。
在rt_add中,指定类型(flags),目的地址,掩码,目的网关地址,对应的物理接口等参数。
如此,只要花费半小时,就分析完了route.c,效率高啊:)
2. TCP协议
linux内核再简单,在TCP协议的实现上也简单不了,所以看一下tcp.c。可以看到linux的tcp实现参考了BSD Socket,这在文件头中有说明。
先看一下tcp_prot结构
/* TCP's protocol-operations table (positional initializer for struct proto,
 * Linux 0.99.15 net/inet/tcp.c).  struct proto's declaration is not shown
 * here, so slot meanings below are inferred from the initializer names --
 * verify against protocol.h before relying on them. */
struct proto tcp_prot = {
sock_wmalloc,	/* write-side buffer allocation (generic socket helper) */
sock_rmalloc,	/* read-side buffer allocation */
sock_wfree,	/* free a write buffer */
sock_rfree,	/* free a read buffer */
sock_rspace,	/* available receive-buffer space */
sock_wspace,	/* available send-buffer space */
tcp_close,
tcp_read,
tcp_write,
tcp_sendto,
tcp_recvfrom,
ip_build_header,	/* TCP delegates header building to IP */
tcp_connect,
tcp_accept,
ip_queue_xmit,	/* ...and transmission queueing to IP */
tcp_retransmit,
tcp_write_wakeup,
tcp_read_wakeup,
tcp_rcv,	/* input entry point for received TCP segments */
tcp_select,
tcp_ioctl,
NULL,	/* unused slot -- presumably an optional init hook */
tcp_shutdown,
tcp_setsockopt,
tcp_getsockopt,
128,	/* numeric slot -- looks like max header size; TODO confirm */
0,	/* numeric slot -- TODO confirm meaning */
{NULL,},	/* sock_array: per-port hash of sockets (used by inet_bind below) */
"TCP"	/* protocol name */
};
2010/6/21:稍微考虑了一下,一个完整的C/S通信过程,最好先看Server端的处理流程,再分析Client端的处理。
Server端流程一般都是这样的情况
a)socket
b)bind
c)listen
d)accept //这个过程中一般会fork一个进程处理新建立的链接。
e)send/recv
f)close
好,先看一下socket()都做了哪些事情,sock_register函数注册了inet相关操作,对应函数inet_create()。代码就不贴了,这个函数实现比较简单,就是为sock结构分配内存并初始化。
大概过程是socket(user space)->sys_socketcall(kernel space)->sock_socket()->inet_create(前提是指定inet通信)。
这里在sock_socket中为socket结构分配内存和属于进程的fd,在inet_create中为sock分配内存。
接下来看一下inet_bind
/* Bind an INET socket to a local address/port (Linux 0.99.15 net/inet/sock.c).
 * NOTE(review): this quotation is truncated -- the function body continues
 * in the garbled text that follows in the original post. */
static int
inet_bind(struct socket *sock, struct sockaddr *uaddr,
int addr_len)
{
struct sockaddr_in addr;
struct sock *sk, *sk2;
unsigned short snum;
int err;
sk = (struct sock *) sock->data;
if (sk == NULL) {
printk("Warning: sock->data = NULL: %d\n" ,__LINE__);
return(0);
}
/* check this error. */
/* Only an unconnected, not-yet-bound socket may be bound. */
if (sk->state != TCP_CLOSE) return(-EIO);
if (sk->num != 0) return(-EINVAL);
/* Validate and copy the sockaddr from user space (memcpy_fromfs reads
 * through the user fs segment on this era of kernel). */
err=verify_area(VERIFY_READ, uaddr, addr_len);
if(err)
return err;
memcpy_fromfs(&addr, uaddr, min(sizeof(addr), addr_len));
snum = ntohs(addr.sin_port);
DPRINTF((DBG_INET, "bind sk =%X to port = %d\n", sk, snum));
sk = (struct sock *) sock->data;
/*
* We can't just leave the socket bound wherever it is, it might
* be bound to a privileged port. However, since there seems to
* be a bug here, we will leave it if the port is not privileged.
*/
if (snum == 0) { // if no port was specified, the kernel picks one for you
snum = get_new_socknum(sk->prot, 0);
}
if (snum < PROT_SOCK && !suser()) return(-EACCES); /* 注:原文此处被HTML吞掉了"<"后的内容,按0.99内核源码恢复 */
if (addr.sin_addr.s_addr != 0 && chk_addr(addr.sin_addr.s_addr) != IS_MYADDR)
return(-EADDRNOTAVAIL); /* Source address MUST be ours! */
if (chk_addr(addr.sin_addr.s_addr) || addr.sin_addr.s_addr == 0)
sk->saddr = addr.sin_addr.s_addr;
DPRINTF((DBG_INET, "sock_array[%d] = %X:\n", snum &(SOCK_ARRAY_SIZE -1),
sk->prot->sock_array[snum &(SOCK_ARRAY_SIZE -1)]));
/* Make sure we are allowed to bind here. */
cli();
outside_loop:
//这里以源端口号做了一个散列,在bind操作,找到符合要求的sk
for(sk2 = sk->prot->sock_array[snum & (SOCK_ARRAY_SIZE -1)];
sk2 != NULL; sk2 = sk2->next) {
#if 1 /* should be below! */
if (sk2->num != snum) continue;
/* if (sk2->saddr != sk->saddr) continue; */
#endif
if (sk2->dead) {
destroy_sock(sk2);
goto outside_loop;
}
if (!sk->reuse) {
sti();
return(-EADDRINUSE);
}
if (sk2->num != snum) continue; /* more than one */
if (sk2->saddr != sk->saddr) continue; /* socket per slot ! -FB */
if (!sk2->reuse) {
sti();
return(-EADDRINUSE);
}
}
sti();
remove_sock(sk);
put_sock(snum, sk);
sk->dummy_th.source = ntohs(sk->num);
sk->daddr = 0;
sk->dummy_th.dest = 0;
return(0);
}
再看listen
static int
inet_listen(struct socket *sock, int backlog)
{
struct sock *sk;
sk = (struct sock *) sock->data;
if (sk == NULL) {
printk("Warning: sock->data = NULL: %d\n" ,__LINE__);
return(0);
}
/* We may need to bind the socket. */
if (sk->num == 0) {
sk->num = get_new_socknum(sk->prot, 0);
if (sk->num == 0) return(-EAGAIN);
put_sock(sk->num, sk);
sk->dummy_th.source = ntohs(sk->num);
}
/* We might as well re use these.
*/
sk->max_ack_backlog = backlog;//比较感兴趣的就是这里和下面的sk->state, backlog这个值如果不分析socket实现的话,是不会理解这个含义的。先说一下,这个数值的功能是能够同时接受多少个sync报文,也算是为了防止sync攻击的初级防御吧,回想一下tcp的三次握手,先是client端发送sync报文,如果server端在接收这个报文后,还没有创建新的连接,那么最多可以缓存backlog个sync报文,多出的部分直接扔掉
if (sk->state != TCP_LISTEN) {
sk->ack_backlog = 0;
sk->state = TCP_LISTEN;//当设置这个值为listen的时候,对应端口的tcp报文就可以被内核处理了
}
return(0);
}
现在可以看一下tcp_recv
case TCP_LISTEN: //如果是listen状态才处理sync报文
if (th->rst) {//这时收到rst报文,直接丢弃,不处理
kfree_skb(skb, FREE_READ);
release_sock(sk);
return(0);
}
if (th->ack) {//如果收到ack报文,说明可能server端down过,通知对方链接已经被重置
tcp_reset(daddr, saddr, th, sk->prot, opt,dev,sk->ip_tos,sk->ip_ttl);
kfree_skb(skb, FREE_READ);
release_sock(sk);
return(0);
}
if (th->syn) {//这个才是正确的报文
#if 0
if (opt->security != 0 || opt->compartment != 0) {
tcp_reset(daddr, saddr, th, prot, opt,dev);
release_sock(sk);
return(0);
}
#endif
/*
* Now we just put the whole thing including
* the header and saddr, and protocol pointer
* into the buffer. We can't respond until the
* user tells us to accept the connection.
*/
tcp_conn_request(sk, skb, daddr, saddr, opt, dev);//这个函数处理sync报文,下面分析
release_sock(sk);
return(0);
}
kfree_skb(skb, FREE_READ);
release_sock(sk);
return(0);
default://缺省的报文丢弃
if (!tcp_sequence(sk, th, len, opt, saddr,dev)) {
kfree_skb(skb, FREE_READ);
release_sock(sk);
return(0);
}
static void
tcp_conn_request(struct sock *sk, struct sk_buff *skb,
unsigned long daddr, unsigned long saddr,
struct options *opt, struct device *dev)
{
struct sk_buff *buff;
struct tcphdr *t1;
unsigned char *ptr;
struct sock *newsk;
struct tcphdr *th;
int tmp;
DPRINTF((DBG_TCP, "tcp_conn_request(sk = %X, skb = %X, daddr = %X, sadd4= %X, \n"
" opt = %X, dev = %X)\n",
sk, skb, daddr, saddr, opt, dev));
th = skb->h.th;
/* If the socket is dead, don't accept the connection.
*/
if (!sk->dead) {
sk->data_ready(sk,0);
} else {
DPRINTF((DBG_TCP, "tcp_conn_request on dead socket\n"));
tcp_reset(daddr, saddr, th, sk->prot, opt, dev, sk->ip_tos,sk->ip_ttl);
kfree_skb(skb, FREE_READ);
return;
}
/*
* Make sure we can accept more. This will prevent a
* flurry of syns from eating up all our memory.
*/
if (sk->ack_backlog >= sk->max_ack_backlog) {//这里,达到max,丢弃报文
kfree_skb(skb, FREE_READ);
return;
}
/*
* We need to build a new sock struct.
* It is sort of bad to have a socket without an inode attached
* to it, but the wake_up's will just wake up the listening socket,
* and if the listening socket is destroyed before this is taken
* off of the queue, this will take care of it.
*/
//复制一份新的sock
newsk = (struct sock *) kmalloc(sizeof(struct sock), GFP_ATOMIC);
if (newsk == NULL) {
/* just ignore the syn. It will get retransmitted. */
kfree_skb(skb, FREE_READ);
return;
}
DPRINTF((DBG_TCP, "newsk = %X\n", newsk));
memcpy((void *)newsk,(void *)sk, sizeof(*newsk));
newsk->wback = NULL;
newsk->wfront = NULL;
newsk->rqueue = NULL;
newsk->send_head = NULL;
newsk->send_tail = NULL;
newsk->back_log = NULL;
newsk->rtt = TCP_CONNECT_TIME <<3;
newsk->rto = TCP_CONNECT_TIME;
newsk->mdev = 0;
newsk->max_window = 0;
newsk->cong_window = 1;
newsk->cong_count = 0;
newsk->ssthresh = 0;
newsk->backoff = 0;
newsk->blog = 0;
newsk->intr = 0;
newsk->proc = 0;
newsk->done = 0;
newsk->partial = NULL;
newsk->pair = NULL;
newsk->wmem_alloc = 0;
newsk->rmem_alloc = 0;
newsk->max_unacked = MAX_WINDOW - TCP_WINDOW_DIFF;
newsk->err = 0;
newsk->shutdown = 0;
newsk->ack_backlog = 0;
newsk->acked_seq = skb->h.th->seq+1;
newsk->fin_seq = skb->h.th->seq;
newsk->copied_seq = skb->h.th->seq;
newsk->state = TCP_SYN_RECV;//设置状态,注意这个是newsk,原来的sk状态仍为LISTEN,这样在下一个tcp_rcv的处理中,会做进一步的处理
newsk->timeout =
0;
newsk->send_seq = jiffies * SEQ_TICK - seq_offset;//随机生成序列号算法
newsk->window_seq = newsk->send_seq;
newsk->rcv_ack_seq = newsk->send_seq;
newsk->urg =0;
newsk->retransmits = 0;
newsk->destroy = 0;
newsk->timer.data = (unsigned long)newsk;
newsk->timer.function = &net_timer;//协议用到的timer
newsk->dummy_th.source = skb->h.th->dest;
newsk->dummy_th.dest = skb->h.th->source;
/* Swap these two, they are from our point of view. */
newsk->daddr = saddr;
newsk->saddr = daddr;
put_sock(newsk->num,newsk);
newsk->dummy_th.res1 = 0;
newsk->dummy_th.doff = 6;
newsk->dummy_th.fin = 0;
newsk->dummy_th.syn = 0;
newsk->dummy_th.rst = 0;
newsk->dummy_th.psh = 0;
newsk->dummy_th.ack = 0;
newsk->dummy_th.urg = 0;
newsk->dummy_th.res2 = 0;
newsk->acked_seq = skb->h.th->seq + 1;
newsk->copied_seq = skb->h.th->seq;
/* Grab the ttl and tos values and use them */
newsk->ip_ttl=sk->ip_ttl;
newsk->ip_tos=skb->ip_hdr->tos;
/* use 512 or whatever user asked for */
/* note use of sk->user_mss, since user has no direct access to newsk */
if (sk->user_mss)
newsk->mtu = sk->user_mss;
else {
#ifdef SUBNETSARELOCAL
if ((saddr ^ daddr) & default_mask(saddr))
#else
if ((saddr ^ daddr) & dev->pa_mask)
#endif
newsk->mtu = 576 - HEADER_SIZE;
else
newsk->mtu = MAX_WINDOW;
}
/* but not bigger than device MTU */
newsk->mtu = min(newsk->mtu, dev->mtu - HEADER_SIZE);
/* this will min with what arrived in the packet */
tcp_options(newsk,skb->h.th);
//准备发送sync&ack的报文
buff = newsk->prot->wmalloc(newsk, MAX_SYN_SIZE, 1, GFP_ATOMIC);
if (buff == NULL) {
sk->err = -ENOMEM;
newsk->dead = 1;
release_sock(newsk);
kfree_skb(skb, FREE_READ);
return;
}
buff->mem_addr = buff;
buff->mem_len = MAX_SYN_SIZE;
buff->len = sizeof(struct tcphdr)+4;
buff->sk = newsk;
t1 =(struct tcphdr *) buff->data;
/* Put in the IP header and routing stuff.
*/
tmp = sk->prot->build_header(buff, newsk->saddr, newsk->daddr, &dev,
IPPROTO_TCP, NULL, MAX_SYN_SIZE,sk->ip_tos,sk->ip_ttl);
/* Something went wrong. */
if (tmp < 0) {
sk->err = tmp;
buff->free=1;
kfree_skb(buff,FREE_WRITE);
newsk->dead = 1;
release_sock(newsk);
skb->sk = sk;
kfree_skb(skb, FREE_READ);
return;
}
buff->len += tmp;
t1 =(struct tcphdr *)((char *)t1 +tmp);
memcpy(t1, skb->h.th, sizeof(*t1));
buff->h.seq = newsk->send_seq;
/* Swap the send and the receive. */
t1->dest = skb->h.th->source;
t1->source = newsk->dummy_th.source;
t1->seq = ntohl(newsk->send_seq++);
t1->ack = 1;//设置ACK标志
newsk->window = tcp_select_window(newsk);/*newsk->prot->rspace(newsk);*/
t1->window = ntohs(newsk->window);
t1->res1 = 0;
t1->res2 = 0;
t1->rst = 0;
t1->urg = 0;
t1->psh = 0;
t1->syn = 1;//设置SYNC标志
t1->ack_seq = ntohl(skb->h.th->seq+1);
t1->doff = sizeof(*t1)/4+1;
ptr =(unsigned char *)(t1+1);
ptr[0] = 2;
ptr[1] = 4;
ptr[2] = ((newsk->mtu) >> 8) & 0xff;
ptr[3] =(newsk->mtu) & 0xff;
tcp_send_check(t1, daddr, saddr, sizeof(*t1)+4, newsk);
newsk->prot->queue_xmit(newsk, dev, buff, 0);//在这里发送SYNC/ACK报文
reset_timer(newsk, TIME_WRITE /* -1 ? FIXME ??? */, TCP_CONNECT_TIME);
skb->sk = newsk;
/* Charge the sock_buff to newsk. */
sk->rmem_alloc -= skb->mem_len;
newsk->rmem_alloc += skb->mem_len;
skb_queue_tail(&sk->rqueue,skb);//同时把接收的这个报文加入到sock接收队列中
sk->ack_backlog++;
release_sock(newsk);
}
这样,一个tcp握手就完成了,下面开始用户调用accept了
accept调用关系sock_accept->inet_accept->tcp_accept,先看一下tcp_accept
/* This will accept the next outstanding connection.
*/
static struct sock *
tcp_accept(struct sock *sk, int flags)
{
struct sock *newsk;
struct sk_buff *skb;
DPRINTF((DBG_TCP, "tcp_accept(sk=%X, flags=%X, addr=%s)\n",
sk, flags, in_ntoa(sk->saddr)));
/*
* We need to make sure that this socket is listening,
* and that it has something pending.
*/
if (sk->state != TCP_LISTEN) {
sk->err = EINVAL;
return(NULL);
}
/* avoid the race. */
cli();
sk->inuse = 1;
//等待接收的报文,当有报文时,就传给inet_accept,这个报文就是sync报文了,刚刚在tcp_recv函数中已经分析过了,同时state已经变为TCP_SYN_RECV了,再看inet_accept实现
while((skb = get_firstr(sk)) == NULL) {
if (flags & O_NONBLOCK) {
sti();
release_sock(sk);
sk->err = EAGAIN;
return(NULL);
}
release_sock(sk);
interruptible_sleep_on(sk->sleep);
if (current->signal & ~current->blocked) {
sti();
sk->err = ERESTARTSYS;
return(NULL);
}
sk->inuse = 1;
}
sti();
/* Now all we need to do is return skb->sk. */
newsk = skb->sk;
kfree_skb(skb, FREE_READ);
sk->ack_backlog--;
release_sock(sk);
return(newsk);
}
static int
inet_accept(struct socket *sock, struct socket *newsock, int flags)
{
struct sock *sk1, *sk2;
int err;
sk1 = (struct sock *) sock->data;
if (sk1 == NULL) {
printk("Warning: sock->data = NULL: %d\n" ,__LINE__);
return(0);
}
/*
* We've been passed an extra socket.
* We need to free it up because the tcp module creates
* it's own when it accepts one.
*/
if (newsock->data)
kfree_s(newsock->data, sizeof(struct sock));
newsock->data = NULL;
if (sk1->prot->accept == NULL) return(-EOPNOTSUPP);
/* Restore the state if we have been interrupted, and then returned. */
if (sk1->pair != NULL ) {
sk2 = sk1->pair;
sk1->pair = NULL;
} else {
sk2 = sk1->prot->accept(sk1,flags);
if (sk2 == NULL) {
if (sk1->err <= 0)
printk("Warning sock.c:sk1->err <= 0. Returning non-error.\n");
err=sk1->err;
sk1->err=0;
return(-err);
}
}
newsock->data = (void *)sk2;
sk2->sleep = newsock->wait;
newsock->conn = NULL;
if (flags & O_NONBLOCK) return(0);
cli(); /* avoid the race. */
while(sk2->state == TCP_SYN_RECV) {//由于这个条件肯定是成立的,所以进入等待状态
interruptible_sleep_on(sk2->sleep);
if (current->signal & ~current->blocked) {
sti();
sk1->pair = sk2;
sk2->sleep = NULL;
newsock->data = NULL;
return(-ERESTARTSYS);
}
}
sti();
//在tcp_rcv中,如果是TCP_SYN_RECV,在接收到ACK后,状态变为TCP_ESTABLISHED
if (sk2->state != TCP_ESTABLISHED && sk2->err > 0) {
err = -sk2->err;
sk2->err=0;
destroy_sock(sk2);
newsock->data = NULL;
return(err);
}
newsock->state = SS_CONNECTED;//新的链接状态为SS_CONNECTED,这样一个TCP链接就建立完成了,可以进行SEND/RECV了。
return(0);
}