|
|
|
|
|
|
|
框架如下 PING程序 A.使用的SOCKET接口 1. socket 2. sendto 3. recvfrom B.PING地址:127.0.0.1 TCP/IP協議棧: 1. IP層 2. ICMP層
很可惜,在這次學習中沒能深入路由表的檢索中~,感覺還需努力哈~ 希望大家推薦一些路由算法的資料或者書籍 T ^T 不要太深~ 夠PING使用就行了~
選擇PING本機是因為能了解收發的過程,同時也除去了對網卡硬件的了解的限制,在最小程度下了解TCP/IP協議棧的基本工作原理
在文中有對TCP/IP協議棧理解不足和錯誤的地方,請大家一定要拍磚指正 = 3= 萬分感謝
好~ = 3=)/ 首先來看看PING程序 下面這段PING程序來自網上,感謝這位梁生的無償奉獻 = 3=)/ 我稍微做了一下修改,可能不大美觀和嚴謹,C語言編程功夫還需提高啊
/*********************************************************** * 作者:梁俊輝 * * 時(shí)間:2001年10月 * * 名稱:myping.c * * 說(shuō)明:本程序用于演示ping命令的實(shí)現(xiàn)原理 * ***********************************************************/ #include <string.h> #include <stdio.h> #include <stdlib.h> #include <signal.h> #include <arpa/inet.h> #include <sys/types.h> #include <sys/socket.h> #include <unistd.h> #include <netinet/in.h> #include <netinet/ip.h> #include <netinet/ip_icmp.h> #include <netdb.h> #include <setjmp.h> #include <errno.h> #define PACKET_SIZE 4096 #define MAX_WAIT_TIME 5 #define MAX_NO_PACKETS 3 char sendpacket[PACKET_SIZE]; char recvpacket[PACKET_SIZE]; int sockfd,datalen=56; int nsend=0,nreceived=0; struct sockaddr_in dest_addr; pid_t pid; struct sockaddr_in from; void statistics(int signo); unsigned short cal_chksum(unsigned short *addr,int len); int pack(int pack_no); void send_packet(void); void recv_packet(void); int unpack(char *buf,int len); void tv_sub(struct timeval *out,struct timeval *in); void statistics(int signo) { printf("\n--------------------PING statistics-------------------\n"); printf("%d packets transmitted, %d received , %%%d lost\n",nsend,nreceived,(nsend-nreceived)/nsend*100); close(sockfd); exit(1); } /*校驗(yàn)和算法*/ unsigned short cal_chksum(unsigned short *addr,int len) { int nleft=len; int sum=0; unsigned short *w=addr; unsigned short answer=0; /*把ICMP報(bào)頭二進(jìn)制數(shù)據(jù)以2字節(jié)為單位累加起來(lái)*/ while(nleft>1) { sum+=*w++; nleft-=2; } /*若ICMP報(bào)頭為奇數(shù)個(gè)字節(jié),會(huì)剩下最后一字節(jié)。把最后一個(gè)字節(jié)視為一個(gè)2字節(jié)數(shù)據(jù)的高字節(jié),這個(gè)2字節(jié)數(shù)據(jù)的低字節(jié)為0,繼續(xù)累加*/ if( nleft==1) { *(unsigned char *)(&answer)=*(unsigned char *)w; sum+=answer; } sum=(sum>>16)+(sum&0xffff); sum+=(sum>>16); answer=~sum; return answer; } /*設(shè)置ICMP報(bào)頭*/ int pack(int pack_no) { int i,packsize; struct icmp *icmp; struct timeval * tval; //將sendpacket強(qiáng)制轉(zhuǎn)換成icmp結(jié)構(gòu) icmp = (struct icmp*)sendpacket; icmp->icmp_type = 
ICMP_ECHO; //設(shè)置ICMP報(bào)文類型 icmp->icmp_code = 0; icmp->icmp_cksum = 0; icmp->icmp_seq = pack_no; icmp->icmp_id = pid; packsize = 8 + datalen; tval = (struct timeval *)icmp->icmp_data; gettimeofday(tval,NULL); icmp->icmp_cksum = cal_chksum( (unsigned short *)icmp,packsize); /*校驗(yàn)算法*/ return packsize; } /*發(fā)送三個(gè)ICMP報(bào)文*/ void send_packet() { int packetsize; while( nsend < MAX_NO_PACKETS) { nsend++; packetsize = pack(nsend); /*設(shè)置ICMP報(bào)頭*/ //int sendto ( SOCKET s , const char FAR *buf , int len , int flags , const struct sockaddr FAR *to , int token ); //[參數(shù)] //s - 指向用Socket函數(shù)生成的Socket //buf - 接受數(shù)據(jù)的緩沖區(qū)(數(shù)組)的指針 //len - 緩沖區(qū)的大小 //flag - 調(diào)用方式(MSG_DONTROUTE , MSG_OOB) //to - 指向發(fā)送方SOCKET地址的指針 //token - 發(fā)送方SOCKET地址的大小 if( sendto(sockfd,sendpacket,packetsize,0,(struct sockaddr *)&dest_addr,sizeof(dest_addr) )<0 ) { perror("sendto error"); continue; } sleep(1); /*每隔一秒發(fā)送一個(gè)ICMP報(bào)文*/ } } /*接收所有ICMP報(bào)文*/ void recv_packet() { int n,fromlen; extern int errno; signal(SIGALRM,statistics); fromlen=sizeof(from); while( nreceived<nsend) { alarm(MAX_WAIT_TIME); //recvfrom()返回讀入的字節(jié)數(shù) if( (n = recvfrom(sockfd,recvpacket,sizeof(recvpacket),0,(struct sockaddr *)&from,&fromlen)) <0) { if(errno==EINTR) continue; perror("recvfrom error"); continue; } //解讀收到的icmp包 if(unpack(recvpacket,n) == -1) continue; nreceived++; } } /*剝?nèi)CMP報(bào)頭*/ int unpack(char *buf,int len) { int i,iphdrlen; struct ip *ip; struct icmp *icmp; ip = (struct ip *)buf; iphdrlen = ip->ip_hl << 2; /*求ip報(bào)頭長(zhǎng)度,即ip報(bào)頭的長(zhǎng)度標(biāo)志乘4*/ icmp = (struct icmp *)(buf+iphdrlen); /*越過ip報(bào)頭,指向ICMP報(bào)頭*/ len -= iphdrlen; /*ICMP報(bào)頭及ICMP數(shù)據(jù)報(bào)的總長(zhǎng)度*/ if( len < 8) /*小于ICMP報(bào)頭長(zhǎng)度則不合理*/ { printf("ICMP packets\'s length is less than 8\n"); return -1; } /*確保所接收的是自己發(fā)的ICMP的回應(yīng)*/ if( (icmp->icmp_type == ICMP_ECHOREPLY) && (icmp->icmp_id == pid) ) { /*顯示相關(guān)信息*/ printf("%d byte 
from %s: icmp_seq=%u ttl=%d \n", len, inet_ntoa(from.sin_addr), icmp->icmp_seq, ip->ip_ttl ); } else return -1; } int main(int argc,char *argv[]) { struct hostent *host; struct protoent *protocol; unsigned long int inaddr = 0; int waittime=MAX_WAIT_TIME; int size=50*1024; //檢測(cè)參數(shù)是否過少 if(argc<2) { printf("usage:%s hostname/IP address\n",argv[0]); exit(1); } //getprotobyname()返回對(duì)應(yīng)于給定協(xié)議名的包含名字和協(xié)議號(hào)的protoent結(jié)構(gòu)指針 //結(jié)構(gòu)的成員有: //成員 用途 //p_name 正規(guī)的協(xié)議名。 //p_aliases 一個(gè)以空指針結(jié)尾的可選協(xié)議名隊(duì)列。 //p_proto 以主機(jī)字節(jié)順序排列的協(xié)議號(hào) if( (protocol=getprotobyname("icmp") )==NULL) { perror("getprotobyname"); exit(1); } /*生成使用ICMP的原始套接字,這種套接字只有root用戶才能生成*/ if( (sockfd = socket(AF_INET,SOCK_RAW,protocol->p_proto) ) < 0) { perror("socket error"); exit(1); } /* 回收root權(quán)限,設(shè)置當(dāng)前用戶權(quán)限*/ setuid(getuid()); //初始化dest_addr bzero(&dest_addr,sizeof(dest_addr)); //設(shè)置協(xié)議家族類型為 AF_INET dest_addr.sin_family = AF_INET; /*判斷是主機(jī)名還是ip地址*/ if( inaddr = inet_addr(argv[1]) == INADDR_NONE) { //通過dns取得ip地址 if((host = gethostbyname(argv[1]) )==NULL) /*是主機(jī)名*/ { perror("gethostbyname error"); exit(1); } memcpy( (char *)&dest_addr.sin_addr,host->h_addr,host->h_length); } else { /*是ip地址*/ inaddr = inet_addr(argv[1]); memcpy( (char *)&dest_addr.sin_addr,(char *)&inaddr,sizeof(inaddr)); } /*獲取main的進(jìn)程id,用于設(shè)置ICMP的標(biāo)志符*/ pid=getpid(); printf("PING %s(%s): %d bytes data in ICMP packets.\n",argv[1],inet_ntoa(dest_addr.sin_addr),datalen); send_packet(); /*發(fā)送所有ICMP報(bào)文*/ recv_packet(); /*接收所有ICMP報(bào)文*/ statistics(SIGALRM); /*進(jìn)行統(tǒng)計(jì)*/ return 0; }
|
PING的流程在上面已經有詳細的注釋了,我就不說了 PING程序的主要流程分為3個步驟 1. 建立一個socket結構 ->socket 2. 用這個socket發送ICMP包 ->sendto 3. 用這個socket接收ICMP包 ->recvfrom
由于是PING本機,所以在TCP/IP協議棧中會有4個部分的內容 1. 建立socket 2. 通過socket發送ICMP包 3. 本機收到ICMP包后發送應答 4. 通過socket接收ICMP包
下面我們就來進入TCP/IP協議棧來看看這3個系統調用如何為我們的PING程序服務的
首先是第1部分,建立一個(gè)socket結(jié)構(gòu)
sockfd = socket(AF_INET,SOCK_RAW,protocol->p_proto) 這個(gè)函數(shù)會(huì)執(zhí)行系統(tǒng)調(diào)用sys_socketcall sys_socketcall在/net/socket.c中
/*
 * sys_socketcall - single entry point that dispatches socket operations.
 * @call: operation selector; values outside 1..SYS_RECVMSG are rejected
 * @args: user-space array holding the arguments for the selected operation
 *
 * Copies the arguments into a[] and forwards them to the matching sys_*
 * function. Returns that function's result, -EINVAL for an unknown
 * selector, -EFAULT when the copy fails, or the non-zero result of
 * audit_socketcall().
 */
asmlinkage long sys_socketcall(int call, unsigned long __user *args)
{
	unsigned long a[6];
	unsigned long a0, a1;
	int err;

	/* Reject selectors outside the supported range. */
	if (call < 1 || call > SYS_RECVMSG)
		return -EINVAL;

	/* copy_from_user should be SMP safe. */
	/* Copy nargs[call] bytes of arguments from user space into a[]. */
	if (copy_from_user(a, args, nargs[call]))
		return -EFAULT;

	/* Pass the argument count and values to audit_socketcall(); stop if it objects. */
	err = audit_socketcall(nargs[call] / sizeof(unsigned long), a);
	if (err)
		return err;

	a0 = a[0];
	a1 = a[1];

	/* The dotted lines below are the article's own markers for cases it leaves out. */
	switch (call) {
	case SYS_SOCKET:
		err = sys_socket(a0, a1, a[2]);
		break;
	.........................
	case SYS_SENDTO:
		err = sys_sendto(a0, (void __user *)a1, a[2], a[3],
				 (struct sockaddr __user *)a[4], a[5]);
		break;
	...............................
	case SYS_RECVFROM:
		err = sys_recvfrom(a0, (void __user *)a1, a[2], a[3],
				   (struct sockaddr __user *)a[4],
				   (int __user *)a[5]);
		break;
	default:
		err = -EINVAL;
		break;
	}
	return err;
}
|
上面只列出了我們所用到的3個(gè)case 現(xiàn)在我們的目標(biāo)是case SYS_SOCKET,也就是要?jiǎng)?chuàng)建一個(gè)socket了
sys_socket在/net/socket.c中
/*
 * sys_socket - create a socket and return a file descriptor for it.
 *
 * Returns the value produced by sock_map_fd() on success. A negative
 * result from either step is returned unchanged; when the mapping step
 * fails, the freshly created socket is released first.
 */
asmlinkage long sys_socket(int family, int type, int protocol)
{
	struct socket *new_sock;
	int rc;

	/* Step 1: build the socket object. */
	rc = sock_create(family, type, protocol, &new_sock);
	if (rc < 0)
		return rc;

	/* Step 2: attach it to a file descriptor. */
	rc = sock_map_fd(new_sock);
	if (rc < 0)
		sock_release(new_sock);

	/* It may be already another descriptor 8) Not kernel problem. */
	return rc;
}
|
很簡(jiǎn)單的調(diào)用
sock_create在/net/socket.c中
/*
 * sock_create - create a socket in the calling task's network namespace.
 *
 * Thin wrapper around __sock_create() that supplies current's net
 * namespace and passes 0 for the kern argument.
 */
int sock_create(int family, int type, int protocol, struct socket **res)
{
	struct net *caller_ns = current->nsproxy->net_ns;

	return __sock_create(caller_ns, family, type, protocol, res, 0);
}
|
繼續(xù),進(jìn)入到__sock_create中
/*
 * __sock_create - allocate a struct socket and let the address family set it up.
 * @net:      network namespace handed to the family's ->create hook
 * @family:   protocol family index; must satisfy 0 <= family < NPROTO
 * @type:     socket type; must satisfy 0 <= type < SOCK_MAX
 * @protocol: protocol number, passed to the security hooks and to ->create
 * @res:      receives the new socket on success
 * @kern:     forwarded to the security hooks
 *
 * Returns 0 on success. Failures are reported as -EAFNOSUPPORT, -EINVAL,
 * -ENFILE, or the error returned by a security hook or by ->create.
 */
static int __sock_create(struct net *net, int family, int type, int protocol,
			 struct socket **res, int kern)
{
	int err;
	struct socket *sock;
	const struct net_proto_family *pf;

	/*
	 *      Check protocol is in range
	 */
	/* The family must be a valid index into net_families[]. */
	if (family < 0 || family >= NPROTO)
		return -EAFNOSUPPORT;
	/* The socket type must be within the known range. */
	if (type < 0 || type >= SOCK_MAX)
		return -EINVAL;

	/* Compatibility.

	   This uglymoron is moved from INET layer to here to avoid
	   deadlock in module load.
	 */
	/* (PF_INET, SOCK_PACKET) is remapped to PF_PACKET, with a one-time log message. */
	if (family == PF_INET && type == SOCK_PACKET) {
		static int warned;
		if (!warned) {
			warned = 1;
			printk(KERN_INFO "%s uses obsolete (PF_INET,SOCK_PACKET)\n",
			       current->comm);
		}
		family = PF_PACKET;
	}

	err = security_socket_create(family, type, protocol, kern);
	if (err)
		return err;

	/*
	 *	Allocate the socket and allow the family to set things up. if
	 *	the protocol is 0, the family is instructed to select an appropriate
	 *	default.
	 */
	/* Allocate the socket object. */
	sock = sock_alloc();
	/* Allocation failed: log (rate limited) and give up. */
	if (!sock) {
		if (net_ratelimit())
			printk(KERN_WARNING "socket: no more sockets\n");
		return -ENFILE;	/* Not exactly a match, but its the
				   closest posix thing */
	}

	/* Record the requested socket type. */
	sock->type = type;

#if defined(CONFIG_KMOD)
	/* Attempt to load a protocol module if the find failed.
	 *
	 * 12/09/1996 Marcin: But! this makes REALLY only sense, if the user
	 * requested real, full-featured networking support upon configuration.
	 * Otherwise module support will break!
	 */
	if (net_families[family] == NULL)
		request_module("net-pf-%d", family);
#endif

	rcu_read_lock();
	/* Fetch the family descriptor registered for this family number. */
	pf = rcu_dereference(net_families[family]);
	err = -EAFNOSUPPORT;
	/* No descriptor registered: release the socket and fail. */
	if (!pf)
		goto out_release;

	/*
	 * We will call the ->create function, that possibly is in a loadable
	 * module, so we have to bump that loadable module refcnt first.
	 */
	/* Take a reference on the module that owns the family descriptor. */
	if (!try_module_get(pf->owner))
		goto out_release;

	/* Now protected by module ref count */
	rcu_read_unlock();

	/* Run the family's socket initialisation hook. */
	err = pf->create(net, sock, protocol);
	/* The hook failed: drop the module reference and release the socket. */
	if (err < 0)
		goto out_module_put;

	/*
	 * Now to bump the refcnt of the [loadable] module that owns this
	 * socket at sock_release time we decrement its refcnt.
	 */
	/* Take a reference on the module that owns the socket's ops table. */
	if (!try_module_get(sock->ops->owner))
		goto out_module_busy;

	/*
	 * Now that we're done with the ->create function, the [loadable]
	 * module can have its refcnt decremented
	 */
	/* Drop the reference taken on the family descriptor's module. */
	module_put(pf->owner);
	err = security_socket_post_create(sock, family, type, protocol, kern);
	if (err)
		goto out_sock_release;
	/* Hand the fully initialised socket back to the caller. */
	*res = sock;

	return 0;

out_module_busy:
	err = -EAFNOSUPPORT;
out_module_put:
	sock->ops = NULL;
	module_put(pf->owner);
out_sock_release:
	sock_release(sock);
	return err;

out_release:
	rcu_read_unlock();
	goto out_sock_release;
}
|
security_socket_create,關(guān)于security的內(nèi)容我們都略過,一來(lái)減少框架的復(fù)雜度,二來(lái)我也不知道security主要做的是啥 哈哈 不過可以肯定的是不會(huì)妨礙TCP/IP協(xié)議棧的正常運(yùn)行
首先是sock_alloc sock_alloc在/net/socket.c中
static struct socket *sock_alloc(void) { struct inode *inode; struct socket *sock; inode = new_inode(sock_mnt->mnt_sb); if (!inode) return NULL; sock = SOCKET_I(inode); inode->i_mode = S_IFSOCK | S_IRWXUGO; inode->i_uid = current->fsuid; inode->i_gid = current->fsgid; get_cpu_var(sockets_in_use)++; put_cpu_var(sockets_in_use); return sock; }
|
主要是申請一個新的socket,并對他的文件屬性進行初始化,socket是屬于虛擬文件系統的一部分,我們暫時只要知道這一點就好了
回到__sock_create中,然后到 pf = rcu_dereference(net_families[family]); net_families的初始化我們也不分析,因?yàn)樯婕暗拿嫣珡V,為了緊扣PING,我們只需要知道得到了inet_family_ops這個(gè)結(jié)構(gòu)就可以了,詳細(xì)的初始化部分在/net/ipv4/af_inet.c中,大家有興趣的可以看看 inet_family_ops的結(jié)構(gòu)如下
/* Family descriptor for PF_INET: socket creation is delegated to inet_create. */
static struct net_proto_family inet_family_ops = {
	.owner	= THIS_MODULE,
	.create	= inet_create,
	.family	= PF_INET,
};
|
緊接著我們就到了 err = pf->create(net, sock, protocol); 調(diào)用inet_family_ops的create函數(shù)
inet_create在/net/ipv4/af_inet.c中
/*
 * inet_create - PF_INET implementation of the family ->create hook.
 * @net:      network namespace the socket is created in
 * @sock:     socket to initialise; sock->type selects the inetsw[] list searched
 * @protocol: requested protocol; IPPROTO_IP acts as a wildcard and is
 *            replaced by the protocol of the entry that is chosen
 *
 * Finds the inet_protosw entry for the type/protocol pair, allocates the
 * struct sock and initialises it. Returns 0 on success. Failures are
 * -ESOCKTNOSUPPORT or -EPROTONOSUPPORT (no matching entry), -EPERM,
 * -EAFNOSUPPORT, -ENOBUFS, or the error from the protocol's init hook.
 */
static int inet_create(struct net *net, struct socket *sock, int protocol)
{
	struct sock *sk;
	struct list_head *p;
	struct inet_protosw *answer;
	struct inet_sock *inet;
	struct proto *answer_prot;
	unsigned char answer_flags;
	char answer_no_check;
	int try_loading_module = 0;
	int err;

	/*
	 * For socket types other than SOCK_RAW and SOCK_DGRAM, call
	 * build_ehash_secret() while inet_ehash_secret is still zero.
	 */
	if (sock->type != SOCK_RAW &&
	    sock->type != SOCK_DGRAM &&
	    !inet_ehash_secret)
		build_ehash_secret();

	/* A new socket starts out unconnected. */
	sock->state = SS_UNCONNECTED;

	/* Look for the requested type/protocol pair. */
	/* No entry selected yet. */
	answer = NULL;
lookup_protocol:
	err = -ESOCKTNOSUPPORT;
	rcu_read_lock();
	/* Walk the entries registered for this socket type. */
	list_for_each_rcu(p, &inetsw[sock->type]) {
		/* Recover the inet_protosw that contains this list node. */
		answer = list_entry(p, struct inet_protosw, list);

		/* Check the non-wild match. */
		/* Exact match on the protocol number. */
		if (protocol == answer->protocol) {
			/* Only accepted here when the request is not the wildcard. */
			if (protocol != IPPROTO_IP)
				/* Found it. */
				break;
		} else {
			/* Check for the two wild cases. */
			/* The caller asked for the wildcard protocol. */
			if (IPPROTO_IP == protocol) {
				/* Adopt this entry's protocol number. */
				protocol = answer->protocol;
				/* Found it. */
				break;
			}
			/* The entry itself is a wildcard and accepts any protocol. */
			if (IPPROTO_IP == answer->protocol)
				/* Found it. */
				break;
		}
		err = -EPROTONOSUPPORT;
		/* This entry did not match; forget it. */
		answer = NULL;
	}

	/* Nothing matched: try loading a module, at most twice, then search again. */
	if (unlikely(answer == NULL)) {
		if (try_loading_module < 2) {
			rcu_read_unlock();
			/*
			 * Be more specific, e.g. net-pf-2-proto-132-type-1
			 * (net-pf-PF_INET-proto-IPPROTO_SCTP-type-SOCK_STREAM)
			 */
			if (++try_loading_module == 1)
				request_module("net-pf-%d-proto-%d-type-%d",
					       PF_INET, protocol, sock->type);
			/*
			 * Fall back to generic, e.g. net-pf-2-proto-132
			 * (net-pf-PF_INET-proto-IPPROTO_SCTP)
			 */
			else
				request_module("net-pf-%d-proto-%d",
					       PF_INET, protocol);
			goto lookup_protocol;
		} else
			goto out_rcu_unlock;
	}

	err = -EPERM;
	if (answer->capability > 0 && !capable(answer->capability))
		goto out_rcu_unlock;

	err = -EAFNOSUPPORT;
	if (!inet_netns_ok(net, protocol))
		goto out_rcu_unlock;

	/* Copy what is needed from the entry before leaving the RCU section. */
	sock->ops = answer->ops;
	answer_prot = answer->prot;
	answer_no_check = answer->no_check;
	answer_flags = answer->flags;
	rcu_read_unlock();

	BUG_TRAP(answer_prot->slab != NULL);

	err = -ENOBUFS;
	/* Allocate the struct sock for the selected protocol. */
	sk = sk_alloc(net, PF_INET, GFP_KERNEL, answer_prot);
	/* Allocation failed. */
	if (sk == NULL)
		goto out;

	err = 0;
	sk->sk_no_check = answer_no_check;
	if (INET_PROTOSW_REUSE & answer_flags)
		sk->sk_reuse = 1;

	/* View the sock as an inet_sock. */
	inet = inet_sk(sk);
	inet->is_icsk = (INET_PROTOSW_ICSK & answer_flags) != 0;

	/* Raw sockets only. */
	if (SOCK_RAW == sock->type) {
		/* The protocol number is stored in inet->num. */
		inet->num = protocol;
		/* IPPROTO_RAW sockets get hdrincl set. */
		if (IPPROTO_RAW == protocol)
			inet->hdrincl = 1;
	}

	if (ipv4_config.no_pmtu_disc)
		inet->pmtudisc = IP_PMTUDISC_DONT;
	else
		inet->pmtudisc = IP_PMTUDISC_WANT;

	inet->id = 0;

	/* Generic sock initialisation; also links sock and sk to each other. */
	sock_init_data(sock, sk);

	/* Replace the destructor installed by sock_init_data(). */
	sk->sk_destruct	   = inet_sock_destruct;
	/* Record the family. */
	sk->sk_family	   = PF_INET;
	/* Record the protocol number. */
	sk->sk_protocol	   = protocol;
	sk->sk_backlog_rcv = sk->sk_prot->backlog_rcv;

	inet->uc_ttl	= -1;
	inet->mc_loop	= 1;
	inet->mc_ttl	= 1;
	inet->mc_index	= 0;
	inet->mc_list	= NULL;

	sk_refcnt_debug_inc(sk);

	/* A number was already assigned above (the raw socket case). */
	if (inet->num) {
		/* It assumes that any protocol which allows
		 * the user to assign a number at socket
		 * creation time automatically
		 * shares.
		 */
		/* Store the number, converted with htons(), in inet->sport. */
		inet->sport = htons(inet->num);
		/* Add to protocol hash chains. */
		sk->sk_prot->hash(sk);
	}

	/* Run the protocol's own init hook when it provides one. */
	if (sk->sk_prot->init) {
		/* A non-zero result releases the sock again. */
		err = sk->sk_prot->init(sk);
		if (err)
			sk_common_release(sk);
	}
out:
	return err;
out_rcu_unlock:
	rcu_read_unlock();
	goto out;
}
|
inetsw結(jié)構(gòu)的注冊(cè)不關(guān)心,我們看結(jié)果
answer就是其中的第二項(xiàng)
這里我們的protocol為IPPROTO_ICMP answer->protocol為IPPROTO_IP
所以是進(jìn)入了if (IPPROTO_IP == answer->protocol)后break跳出了循環(huán)
之后到inet_netns_ok inet_netns_ok在/net/ipv4/af_inet.c中
static inline int inet_netns_ok(struct net *net, int protocol) { int hash; struct net_protocol *ipprot; if (net == &init_net) return 1; //取得哈希值 hash = protocol & (MAX_INET_PROTOS - 1); //取得哈希值對(duì)應(yīng)的協(xié)議 ipprot = rcu_dereference(inet_protos[hash]); //檢測(cè)協(xié)議是否為空 if (ipprot == NULL) /* raw IP is OK */ return 1; return ipprot->netns_ok; }
|
由于在__sock_create中我們傳入的net類型為init_net,所以這里是返回1,不會(huì)goto out_rcu_unlock結(jié)束的
繼續(xù)在inet_create中向下走,來(lái)到了sk_alloc sk_alloc在/net/core/sock.c中
/*
 * sk_alloc - allocate a zeroed struct sock for @prot and do the basic setup.
 *
 * Records the family and the protocol, initialises the sock lock and binds
 * the sock to a reference on @net. Returns NULL when allocation fails.
 */
struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
		      struct proto *prot)
{
	struct sock *sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);

	if (!sk)
		return NULL;

	sk->sk_family = family;
	/*
	 * See comment in struct sock definition to understand
	 * why we need sk_prot_creator -acme
	 */
	sk->sk_prot_creator = prot;
	sk->sk_prot = prot;
	sock_lock_init(sk);
	sock_net_set(sk, get_net(net));

	return sk;
}
|
sk_prot_alloc在協(xié)議結(jié)構(gòu)的高速緩存中分配一個(gè)sock結(jié)構(gòu),分配成功后進(jìn)行一些簡(jiǎn)單的初始化操作便退出了
繼續(xù)向下走,到sock_init_data sock_init_data在/net/core/sock.c中
/*
 * sock_init_data - fill in the generic fields of a freshly allocated sock.
 * @sock: socket to link with @sk; may be NULL
 * @sk:   sock to initialise
 *
 * Sets up the queues, timer, buffer sizes, locks, default callbacks,
 * timeouts and counters. When @sock is given, the two objects are linked
 * to each other and the sock inherits the socket's type and wait queue.
 */
void sock_init_data(struct socket *sock, struct sock *sk)
{
	/* Receive queue. */
	skb_queue_head_init(&sk->sk_receive_queue);
	/* Write queue. */
	skb_queue_head_init(&sk->sk_write_queue);
	/* Error queue. */
	skb_queue_head_init(&sk->sk_error_queue);
#ifdef CONFIG_NET_DMA
	skb_queue_head_init(&sk->sk_async_wait_queue);
#endif

	sk->sk_send_head	=	NULL;

	init_timer(&sk->sk_timer);

	sk->sk_allocation	=	GFP_KERNEL;
	sk->sk_rcvbuf		=	sysctl_rmem_default;
	sk->sk_sndbuf		=	sysctl_wmem_default;
	sk->sk_state		=	TCP_CLOSE;
	/* Link sk to its socket (NULL when no socket was supplied). */
	sk->sk_socket		=	sock;

	sock_set_flag(sk, SOCK_ZAPPED);

	/* With a socket: inherit its type and wait queue, and link it back to sk. */
	if (sock) {
		/* Socket type. */
		sk->sk_type	=	sock->type;
		/* Wait queue. */
		sk->sk_sleep	=	&sock->wait;
		/* Back pointer from the socket to the sock. */
		sock->sk	=	sk;
	} else
		/* Without a socket there is no wait queue. */
		sk->sk_sleep	=	NULL;

	rwlock_init(&sk->sk_dst_lock);
	rwlock_init(&sk->sk_callback_lock);
	/* The lockdep class and name are selected by sk->sk_family. */
	lockdep_set_class_and_name(&sk->sk_callback_lock,
			af_callback_keys + sk->sk_family,
			af_family_clock_key_strings[sk->sk_family]);

	/* Default callbacks. */
	sk->sk_state_change	=	sock_def_wakeup;
	sk->sk_data_ready	=	sock_def_readable;
	sk->sk_write_space	=	sock_def_write_space;
	sk->sk_error_report	=	sock_def_error_report;
	/* Default destructor. */
	sk->sk_destruct		=	sock_def_destruct;

	/* Page used for buffered sendmsg data, and the offset into it. */
	sk->sk_sndmsg_page	=	NULL;
	sk->sk_sndmsg_off	=	0;

	sk->sk_peercred.pid 	=	0;
	sk->sk_peercred.uid	=	-1;
	sk->sk_peercred.gid	=	-1;
	sk->sk_write_pending	=	0;
	sk->sk_rcvlowat		=	1;
	sk->sk_rcvtimeo		=	MAX_SCHEDULE_TIMEOUT;
	sk->sk_sndtimeo		=	MAX_SCHEDULE_TIMEOUT;

	sk->sk_stamp = ktime_set(-1L, 0);

	/* The reference count starts at one; the drop counter at zero. */
	atomic_set(&sk->sk_refcnt, 1);
	atomic_set(&sk->sk_drops, 0);
}
|
這是個(gè)大家伙,負(fù)責(zé)sock結(jié)構(gòu)的詳細(xì)初始化 初始化完成后繼續(xù)inet_create的執(zhí)行 由于之前設(shè)置了inet->num為協(xié)議號(hào),這里會(huì)執(zhí)行sk->sk_prot->hash 在進(jìn)入這個(gè)函數(shù)之前讓我們先來(lái)看一下目前sock的結(jié)構(gòu)
sk_prot為一個(gè)宏 #define sk_prot __sk_common.skc_prot 指向了raw_prot,所以sk->sk_prot->hash就是執(zhí)行了raw_hash_sk
raw_hash_sk在/net/ipv4/raw.c中
/*
 * raw_hash_sk - add a raw sock to its protocol's hash table.
 * @sk: sock to insert
 *
 * The bucket is chosen from the low bits of inet_sk(sk)->num. The insert
 * and the in-use accounting are done under the table's write lock.
 */
void raw_hash_sk(struct sock *sk)
{
	/* Hash table reached through the sock's protocol descriptor. */
	struct raw_hashinfo *h = sk->sk_prot->h.raw_hash;
	struct hlist_head *head;

	/* Pick the bucket for this sock's number. */
	head = &h->ht[inet_sk(sk)->num & (RAW_HTABLE_SIZE - 1)];

	write_lock_bh(&h->lock);
	/* Link the sock into the bucket. */
	sk_add_node(sk, head);
	/* Count one more sock in use for this protocol in the sock's namespace. */
	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
	write_unlock_bh(&h->lock);
}
|
主要是將sock加入到raw_prot的哈希表(h.raw_hash)中對應的鏈表里,如下圖
因?yàn)閞aw_prot是有raw_init這個(gè)函數(shù)的,所以我們進(jìn)入到sk->sk_prot->init raw_init在/net/ipv4/raw.c中
static int raw_init(struct sock *sk) { //把sock結(jié)構(gòu)強(qiáng)制轉(zhuǎn)換為raw_sock結(jié)構(gòu) struct raw_sock *rp = raw_sk(sk); //檢測(cè)端口號(hào)是否為ICMP if (inet_sk(sk)->num == IPPROTO_ICMP) //清空icmp_filter結(jié)構(gòu) memset(&rp->filter, 0, sizeof(rp->filter)); return 0; }
|
結(jié)構(gòu)圖如下
為什么能一直這樣強制轉換下去,就不怕結構超界么? 其實這是一早有預謀的,在raw_prot中有一個成員為 .obj_size = sizeof(struct raw_sock) 而在協議中分配空間的時候就已經分配了raw_sock所需要的空間,我們一直在用他的一部分而已
好, 到這里inet_create就完成了,一路返回到sys_socket中 執(zhí)行最后一步,把初始化好的socket結(jié)構(gòu)映射到一個(gè)文件描述符中,并返回這個(gè)文件描述符 這樣,我們的ping程序的sockfd就拿到了一個(gè)按要求初始化好的socket結(jié)構(gòu)索引號(hào)了 在之后的sendto和recvfrom操作中就能夠使用這個(gè)索引號(hào)進(jìn)行發(fā)送和接收了
然后到第2部分,發(fā)送初始化好的icmp結(jié)構(gòu) sendto(sockfd,sendpacket,packetsize,0,(struct sockaddr *)&dest_addr,sizeof(dest_addr)
繼續(xù)來(lái)到系統(tǒng)調(diào)用sys_socketcall中 這次我們的目標(biāo)是case SYS_SENDTO sys_sendto在/net/socket.c中
/*
 * sys_sendto - send one user buffer on a socket, optionally to an explicit address.
 * @fd:       file descriptor of the socket
 * @buff:     user-space data to send
 * @len:      number of bytes at @buff
 * @flags:    message flags; MSG_DONTWAIT is added for O_NONBLOCK files
 * @addr:     optional user-space destination address, may be NULL
 * @addr_len: length of @addr
 *
 * Wraps the buffer in a one-element iovec inside a msghdr and passes it to
 * sock_sendmsg(). Returns that call's result, or the error from the
 * descriptor lookup or from copying the address.
 */
asmlinkage long sys_sendto(int fd, void __user *buff, size_t len,
			   unsigned flags, struct sockaddr __user *addr,
			   int addr_len)
{
	struct socket *sock;
	char address[MAX_SOCK_ADDR];
	int err;
	struct msghdr msg;
	struct iovec iov;
	int fput_needed;

	/* Resolve the descriptor to its socket; err is filled in on failure. */
	sock = sockfd_lookup_light(fd, &err, &fput_needed);
	if (!sock)
		goto out;

	/* Start of the data to send. */
	iov.iov_base = buff;
	/* Length of the data to send. */
	iov.iov_len = len;
	msg.msg_name = NULL;
	/* Attach the single iovec to the message. */
	msg.msg_iov = &iov;
	msg.msg_iovlen = 1;
	msg.msg_control = NULL;
	msg.msg_controllen = 0;
	msg.msg_namelen = 0;
	/* A destination address was supplied. */
	if (addr) {
		/* Copy it from user space into the local buffer. */
		err = move_addr_to_kernel(addr, addr_len, address);
		if (err < 0)
			goto out_put;
		/* Point the message at the kernel copy. */
		msg.msg_name = address;
		/* Record its length. */
		msg.msg_namelen = addr_len;
	}
	if (sock->file->f_flags & O_NONBLOCK)
		flags |= MSG_DONTWAIT;
	msg.msg_flags = flags;
	err = sock_sendmsg(sock, &msg, len);

out_put:
	fput_light(sock->file, fput_needed);
out:
	return err;
}
|
初始化好的msg結(jié)構(gòu)如下

在iovec結(jié)構(gòu)中保存了我們要發(fā)送數(shù)據(jù)的首地址和大小
然后進(jìn)入到sock_sendmsg sock_sendmsg在/net/socket.c中
/*
 * sock_sendmsg - send a message through a synchronous kiocb.
 *
 * Sets up an on-stack kiocb whose private pointer refers to an on-stack
 * sock_iocb, then calls __sock_sendmsg(). A result of -EIOCBQUEUED is
 * replaced by the value of wait_on_sync_kiocb(); any other result is
 * returned as is.
 */
int sock_sendmsg(struct socket *sock, struct msghdr *msg, size_t size)
{
	struct sock_iocb sock_ctx;
	struct kiocb sync_req;
	int rc;

	init_sync_kiocb(&sync_req, NULL);
	sync_req.private = &sock_ctx;

	rc = __sock_sendmsg(&sync_req, sock, msg, size);
	if (rc != -EIOCBQUEUED)
		return rc;

	return wait_on_sync_kiocb(&sync_req);
}
|
我不大明白kiocb的用處,google也不是說(shuō)得很清楚,大概就是說(shuō)關(guān)于文件同步操作方面上的,請(qǐng)明白的同學(xué)們指教一下 = 3=)/ 感謝 這里就不把kiocb的結(jié)構(gòu)畫進(jìn)來(lái)了
然后進(jìn)入到__sock_sendmsg __sock_sendmsg在/net/socket.c中
/*
 * __sock_sendmsg - record the request in the sock_iocb and call the socket's sendmsg op.
 *
 * The security hook is consulted first; a non-zero answer from it is
 * returned without calling sock->ops->sendmsg.
 */
static inline int __sock_sendmsg(struct kiocb *iocb, struct socket *sock,
				 struct msghdr *msg, size_t size)
{
	struct sock_iocb *ctx = kiocb_to_siocb(iocb);
	int denied;

	/* Remember the socket, the message and its size for this request. */
	ctx->size = size;
	ctx->msg = msg;
	ctx->scm = NULL;
	ctx->sock = sock;

	denied = security_socket_sendmsg(sock, msg, size);

	return denied ? denied : sock->ops->sendmsg(iocb, sock, msg, size);
}
|
連接完成后的結(jié)構(gòu)圖如下
sock->ops->sendmsg調(diào)用的為inet_sockraw_ops中的sendmsg操作,也就是inet_sendmsg函數(shù)
inet_sendmsg在/net/ipv4/af_inet.c中
/*
 * inet_sendmsg - PF_INET sendmsg op: hand the message to the protocol's sendmsg.
 *
 * A sock that has no number yet is auto-bound first; -EAGAIN is returned
 * when that fails.
 */
int inet_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
		 size_t size)
{
	struct sock *sk = sock->sk;

	/* We may need to bind the socket. */
	if (inet_sk(sk)->num == 0) {
		if (inet_autobind(sk) != 0)
			return -EAGAIN;
	}

	return sk->sk_prot->sendmsg(iocb, sk, msg, size);
}
|
我們在之前已經設置了端口號,所以這里直接來到了sk->sk_prot->sendmsg sk->sk_prot->sendmsg調用的是raw_prot中的sendmsg操作,也就是raw_sendmsg函數
raw_sendmsg在/net/ipv4/raw.c中
/*
 * raw_sendmsg - sendmsg implementation for raw IPv4 socks.
 * @iocb: request control block (not used in this function body)
 * @sk:   sending sock
 * @msg:  message: optional destination, control data, flags and the iovec
 * @len:  number of payload bytes; more than 0xFFFF is rejected
 *
 * Validates the request, works out source and destination addresses,
 * resolves a route and then either sends a caller-built packet
 * (inet->hdrincl set) or appends the data and pushes the pending frames.
 * Returns @len on success or a negative errno.
 */
static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
		       size_t len)
{
	struct inet_sock *inet = inet_sk(sk);
	struct ipcm_cookie ipc;
	struct rtable *rt = NULL;
	int free = 0;
	__be32 daddr;
	__be32 saddr;
	u8  tos;
	int err;

	err = -EMSGSIZE;
	/* Refuse payloads longer than 0xFFFF bytes. */
	if (len > 0xFFFF)
		goto out;

	/*
	 *	Check the flags.
	 */

	err = -EOPNOTSUPP;
	if (msg->msg_flags & MSG_OOB)	/* Mirror BSD error message */
		goto out;               /* compatibility */

	/*
	 *	Get and verify the address.
	 */

	/* A destination address came with the message. */
	if (msg->msg_namelen) {
		/* Interpret it as a sockaddr_in. */
		struct sockaddr_in *usin = (struct sockaddr_in*)msg->msg_name;
		err = -EINVAL;
		/* It must be at least as large as a sockaddr_in. */
		if (msg->msg_namelen < sizeof(*usin))
			goto out;
		/* The family should be AF_INET; complain once if it is not. */
		if (usin->sin_family != AF_INET) {
			static int complained;
			if (!complained++)
				printk(KERN_INFO "%s forgot to set AF_INET in "
						 "raw sendmsg. Fix it!\n",
						 current->comm);
			err = -EAFNOSUPPORT;
			/* A zero family is tolerated; any other value fails. */
			if (usin->sin_family)
				goto out;
		}
		/* Destination IP address. */
		daddr = usin->sin_addr.s_addr;
		/* ANK: I did not forget to get protocol from port field.
		 * I just do not know, who uses this weirdness.
		 * IP_HDRINCL is much more convenient.
		 */
	} else {
		/* No address given: only allowed in the TCP_ESTABLISHED state. */
		err = -EDESTADDRREQ;
		if (sk->sk_state != TCP_ESTABLISHED)
			goto out;
		daddr = inet->daddr;
	}

	ipc.addr = inet->saddr;
	ipc.opt = NULL;
	ipc.oif = sk->sk_bound_dev_if;

	/* Control data is present: let ip_cmsg_send() fill in the cookie. */
	if (msg->msg_controllen) {
		err = ip_cmsg_send(sock_net(sk), msg, &ipc);
		if (err)
			goto out;
		/* Options set by ip_cmsg_send() are freed at 'done'. */
		if (ipc.opt)
			free = 1;
	}

	saddr = ipc.addr;
	ipc.addr = daddr;

	/* No options from control data: fall back to the sock's own options. */
	if (!ipc.opt)
		ipc.opt = inet->opt;

	/* Options are in effect. */
	if (ipc.opt) {
		err = -EINVAL;
		/* Linux does not mangle headers on raw sockets,
		 * so that IP options + IP_HDRINCL is non-sense.
		 */
		if (inet->hdrincl)
			goto done;
		if (ipc.opt->srr) {
			if (!daddr)
				goto done;
			daddr = ipc.opt->faddr;
		}
	}
	/* Type of service for the route lookup. */
	tos = RT_CONN_FLAGS(sk);
	if (msg->msg_flags & MSG_DONTROUTE)
		tos |= RTO_ONLINK;

	/* Multicast destination: default the interface and source from the sock. */
	if (ipv4_is_multicast(daddr)) {
		if (!ipc.oif)
			ipc.oif = inet->mc_index;
		if (!saddr)
			saddr = inet->mc_addr;
	}

	/* Route lookup. */
	{
		struct flowi fl = { .oif = ipc.oif,
				    .mark = sk->sk_mark,
				    .nl_u = { .ip4_u =
					      { .daddr = daddr,
						.saddr = saddr,
						.tos = tos } },
				    .proto = inet->hdrincl ? IPPROTO_RAW :
							     sk->sk_protocol,
				  };
		if (!inet->hdrincl) {
			err = raw_probe_proto_opt(&fl, msg);
			if (err)
				goto done;
		}

		security_sk_classify_flow(sk, &fl);
		err = ip_route_output_flow(sock_net(sk), &rt, &fl, sk, 1);
	}
	if (err)
		goto done;

	err = -EACCES;
	/* Broadcast routes require SOCK_BROADCAST on the sock. */
	if (rt->rt_flags & RTCF_BROADCAST && !sock_flag(sk, SOCK_BROADCAST))
		goto done;

	if (msg->msg_flags & MSG_CONFIRM)
		goto do_confirm;
back_from_confirm:

	if (inet->hdrincl) {
		err = raw_send_hdrinc(sk, msg->msg_iov, len,rt, msg->msg_flags);
	} else {
		if (!ipc.addr)
			ipc.addr = rt->rt_dst;
		lock_sock(sk);
		/* Queue the data to send on the sock. */
		err = ip_append_data(sk, ip_generic_getfrag, msg->msg_iov, len, 0,
					&ipc, rt, msg->msg_flags);
		/* Appending failed: discard whatever is pending on the sock. */
		if (err)
			ip_flush_pending_frames(sk);
		else if (!(msg->msg_flags & MSG_MORE))
			/* No more data announced: send what is pending. */
			err = ip_push_pending_frames(sk);
		release_sock(sk);
	}
done:
	if (free)
		kfree(ipc.opt);
	ip_rt_put(rt);

out:
	if (err < 0)
		return err;
	return len;

do_confirm:
	dst_confirm(&rt->u.dst);
	/* Continue sending unless this is a zero-length MSG_PROBE request. */
	if (!(msg->msg_flags & MSG_PROBE) || len)
		goto back_from_confirm;
	err = 0;
	goto done;
}
|
這里最關(guān)鍵的就是 err = ip_route_output_flow(sock_net(sk), &rt, &fl, sk, 1); 這是一個(gè)路由表查詢函數(shù)
無(wú)能為力........
不過我根據(jù)DEBUG的信息把查詢結(jié)果畫了出來(lái),分別為ipcm_cookie和rtable兩個(gè)結(jié)構(gòu),其中最關(guān)鍵的為rtable中的dst_entry
rtable中的idev連接lo這個(gè)環(huán)回虛擬網(wǎng)卡設(shè)備 lo網(wǎng)卡的注冊(cè)在/drivers/net/loopback.c中 由于牽涉到路由表的添加問題,我這里就不介紹他的注冊(cè)了
現在回到raw_sendmsg,進入ip_append_data, ip_append_data負責將要發送的數據組裝到sk_buff結構中
|
|
| |
|
|
|
|
 |
|
| |
|