日韩黑丝制服一区视频播放|日韩欧美人妻丝袜视频在线观看|九九影院一级蜜桃|亚洲中文在线导航|青草草视频在线观看|婷婷五月色伊人网站|日本一区二区在线|国产AV一二三四区毛片|正在播放久草视频|亚洲色图精品一区

分享

LINUX下PING與TCP_IP協議棧學習筆記(1) - TCP/IP - Linux

 jijo 2009-04-13





 

框架如下
PING程序
A.使用的SOCKET接口
1. socket
2. sendto
3. recvfrom
B.PING地址:127.0.0.1
TCP/IP協議棧:
1. IP層
2. ICMP層

很可惜,在這次學習中沒能深入路由表的檢索中~,感覺還需努力哈~ 希望大家推薦一些路由算法的資料或者書籍 T ^T 不要太深~ 夠PING使用就行了~

選擇PING本機是因為能了解收發的過程,同時也除去了對網卡硬件的了解的限制,在最小程度下了解TCP/IP協議棧的基本工作原理

在文中有對TCP/IP協議棧理解不足和錯誤的地方,請大家一定要拍磚指正 = 3= 萬分感謝

好~ = 3=)/ 首先來看看PING程序
下面這段PING程序來自網上,感謝這位梁生的無償奉獻 = 3=)/
我稍微做了一下修改,可能不大美觀和嚴謹,C語言編程功夫還需提高啊

 

/***********************************************************
 * 作者:梁俊輝 *
 * 時間:2001年10月 *
 * 名稱:myping.c *
 * 說明:本程序用于演示ping命令的實現原理 *
 ***********************************************************/

#include <errno.h>
#include <setjmp.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

#include <sys/types.h>
#include <sys/socket.h>
#include <sys/time.h>

#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/ip_icmp.h>
#include <arpa/inet.h>
#include <netdb.h>
/* Size in bytes of each of the two packet buffers below. */
#define PACKET_SIZE 4096
/* Seconds passed to alarm() while recv_packet() waits for a reply. */
#define MAX_WAIT_TIME 5
/* Upper bound on the number of echo requests send_packet() issues. */
#define MAX_NO_PACKETS 3
/* Outgoing ICMP message, filled in by pack(). */
char sendpacket[PACKET_SIZE];
/* Incoming datagram as returned by recvfrom(); parsed by unpack(). */
char recvpacket[PACKET_SIZE];
/* Raw socket descriptor; datalen is the ICMP payload size added to the 8-byte header in pack(). */
int sockfd,datalen=56;
/* Running counters of echo requests sent and echo replies accepted. */
int nsend=0,nreceived=0;
/* Destination address filled in by main() and used by sendto(). */
struct sockaddr_in dest_addr;
/* Process id obtained in main(); used as the ICMP identifier. */
pid_t pid;
/* Source address of the most recently received datagram. */
struct sockaddr_in from;
void statistics(int signo);
unsigned short cal_chksum(unsigned short *addr,int len);
int pack(int pack_no);
void send_packet(void);
void recv_packet(void);
int unpack(char *buf,int len);
/* NOTE(review): tv_sub is declared here but no definition is visible in this listing. */
void tv_sub(struct timeval *out,struct timeval *in);
/*
 * Print the transmit/receive summary, close the socket and terminate the
 * process with exit status 1.
 *
 * Installed as the SIGALRM handler by recv_packet() and also called directly
 * at the end of main(), so it never returns to its caller.
 *
 * signo: signal number delivered to the handler (unused).
 *
 * NOTE(review): printf() and exit() are not async-signal-safe; this is kept
 * because the handler only runs while the program is blocked in recvfrom().
 */
void statistics(int signo)
{
    int lost_percent = 0;

    (void)signo;

    /*
     * Multiply before dividing: the integer expression
     * (nsend-nreceived)/nsend*100 truncates to 0 for any partial loss.
     * Guard nsend == 0 so that a run in which nothing was sent does not
     * divide by zero.
     */
    if (nsend > 0)
        lost_percent = (nsend - nreceived) * 100 / nsend;

    printf("\n--------------------PING statistics-------------------\n");
    printf("%d packets transmitted, %d received , %%%d lost\n",nsend,nreceived,lost_percent);
    close(sockfd);
    exit(1);
}
/*
 * Compute the 16-bit one's-complement checksum used by ICMP.
 *
 * addr: start of the data, read as native 16-bit words.
 * len:  number of bytes to cover; may be odd.
 *
 * Returns the bitwise complement of the folded 16-bit sum.
 */
unsigned short cal_chksum(unsigned short *addr,int len)
{
    /*
     * A 64-bit unsigned accumulator cannot overflow for any int-sized len,
     * unlike a signed int, which overflows once more than 64 KiB of 0xFFFF
     * words have been added.
     */
    unsigned long long sum = 0;
    const unsigned short *word = addr;
    int remaining = len;

    /* Add the data up two bytes at a time. */
    while (remaining > 1)
    {
        sum += *word++;
        remaining -= 2;
    }

    /*
     * Odd length: the last byte becomes the first byte (in memory order) of
     * a 16-bit word whose other byte is zero, and that word is added too.
     */
    if (remaining == 1)
    {
        unsigned short tail = 0;
        *(unsigned char *)&tail = *(const unsigned char *)word;
        sum += tail;
    }

    /* Fold the carries back in until the sum fits in 16 bits. */
    while (sum >> 16)
        sum = (sum & 0xffff) + (sum >> 16);

    return (unsigned short)~sum;
}
/*
 * Build an ICMP echo request in the global sendpacket buffer.
 *
 * pack_no: sequence number stored in the icmp_seq field.
 *
 * The identifier is the low 16 bits of the global pid, and the payload starts
 * with the current time of day.
 *
 * Returns the total ICMP message length in bytes (8-byte header + datalen).
 */
int pack(int pack_no)
{
    struct icmp *icmp = (struct icmp *)sendpacket;
    struct timeval now;
    int packsize = 8 + datalen;

    icmp->icmp_type = ICMP_ECHO;
    icmp->icmp_code = 0;
    icmp->icmp_cksum = 0;   /* must be zero while the checksum is computed */
    icmp->icmp_seq = pack_no;
    /* The identifier field is 16 bits wide; make the truncation explicit. */
    icmp->icmp_id = (unsigned short)pid;

    /*
     * Copy the timestamp in rather than storing through a struct timeval
     * pointer into the char buffer, which is not guaranteed to be suitably
     * aligned for that type.
     */
    gettimeofday(&now,NULL);
    memcpy(icmp->icmp_data,&now,sizeof(now));

    icmp->icmp_cksum = cal_chksum((unsigned short *)icmp,packsize);
    return packsize;
}
/*發(fā)送三個(gè)ICMP報(bào)文*/
void send_packet()
{
    int packetsize;
    while( nsend < MAX_NO_PACKETS)
    {
        nsend++;
        packetsize = pack(nsend); /*設(shè)置ICMP報(bào)頭*/
        //int sendto ( SOCKET s , const char FAR *buf , int len , int flags , const struct sockaddr FAR *to , int token );
        //[參數(shù)]
        //s - 指向用Socket函數(shù)生成的Socket
        //buf - 接受數(shù)據(jù)的緩沖區(qū)(數(shù)組)的指針
        //len - 緩沖區(qū)的大小
        //flag - 調(diào)用方式(MSG_DONTROUTE , MSG_OOB)
        //to - 指向發(fā)送方SOCKET地址的指針
        //token - 發(fā)送方SOCKET地址的大小 
        if( sendto(sockfd,sendpacket,packetsize,0,(struct sockaddr *)&dest_addr,sizeof(dest_addr) )<0 )
        {
            perror("sendto error");
            continue;
        }
        sleep(1); /*每隔一秒發(fā)送一個(gè)ICMP報(bào)文*/
    }
}
/*
 * Receive datagrams on sockfd until nreceived echo replies have been
 * accepted for the nsend requests that were sent.
 *
 * A SIGALRM is armed for MAX_WAIT_TIME seconds before every recvfrom();
 * if it fires, statistics() prints the summary and exits the process.
 */
void recv_packet(void)
{
    signal(SIGALRM,statistics);

    while (nreceived < nsend)
    {
        /* recvfrom() overwrites the length, so reset it for every call. */
        socklen_t fromlen = sizeof(from);
        int n;

        alarm(MAX_WAIT_TIME);
        /* recvfrom() returns the number of bytes read */
        n = recvfrom(sockfd,recvpacket,sizeof(recvpacket),0,
                     (struct sockaddr *)&from,&fromlen);
        if (n < 0)
        {
            if (errno == EINTR)
                continue;
            /*
             * Any other error is treated as fatal for the receive loop:
             * retrying would re-arm the alarm each time and spin forever.
             */
            perror("recvfrom error");
            break;
        }
        /* parse the datagram; ignore anything that is not our echo reply */
        if (unpack(recvpacket,n) == -1)
            continue;
        nreceived++;
    }

    /* Cancel the pending alarm so it cannot fire after we return. */
    alarm(0);
}
/*剝?nèi)CMP報(bào)頭*/
int unpack(char *buf,int len)
{
    int i,iphdrlen;
    struct ip *ip;
    struct icmp *icmp;
    ip = (struct ip *)buf;
    iphdrlen = ip->ip_hl << 2; /*求ip報(bào)頭長(zhǎng)度,即ip報(bào)頭的長(zhǎng)度標(biāo)志乘4*/
    icmp = (struct icmp *)(buf+iphdrlen); /*越過ip報(bào)頭,指向ICMP報(bào)頭*/
    len -= iphdrlen; /*ICMP報(bào)頭及ICMP數(shù)據(jù)報(bào)的總長(zhǎng)度*/
    if( len < 8) /*小于ICMP報(bào)頭長(zhǎng)度則不合理*/
    {
        printf("ICMP packets\'s length is less than 8\n");
        return -1;
    }
    /*確保所接收的是自己發(fā)的ICMP的回應(yīng)*/
    if( (icmp->icmp_type == ICMP_ECHOREPLY) && (icmp->icmp_id == pid) )
    {
        /*顯示相關(guān)信息*/
        printf("%d byte from %s: icmp_seq=%u ttl=%d \n",
            len,
            inet_ntoa(from.sin_addr),
            icmp->icmp_seq,
            ip->ip_ttl
            );
    }
    else
        return -1;
}
int main(int argc,char *argv[])
{
    struct hostent *host;
    struct protoent *protocol;
    unsigned long int inaddr = 0;
    int waittime=MAX_WAIT_TIME;
    int size=50*1024;
    //檢測(cè)參數(shù)是否過少
    if(argc<2)
    {
        printf("usage:%s hostname/IP address\n",argv[0]);
        exit(1);
    }
    //getprotobyname()返回對(duì)應(yīng)于給定協(xié)議名的包含名字和協(xié)議號(hào)的protoent結(jié)構(gòu)指針    
    //結(jié)構(gòu)的成員有: 
    //成員 用途 
    //p_name 正規(guī)的協(xié)議名。 
    //p_aliases 一個(gè)以空指針結(jié)尾的可選協(xié)議名隊(duì)列。 
    //p_proto 以主機(jī)字節(jié)順序排列的協(xié)議號(hào) 
    if( (protocol=getprotobyname("icmp") )==NULL)
    {
        perror("getprotobyname");
        exit(1);
    }
    /*生成使用ICMP的原始套接字,這種套接字只有root用戶才能生成*/
    if( (sockfd = socket(AF_INET,SOCK_RAW,protocol->p_proto) ) < 0)
    {
        perror("socket error");
        exit(1);
    }
    /* 回收root權(quán)限,設(shè)置當(dāng)前用戶權(quán)限*/
    setuid(getuid());
    //初始化dest_addr
    bzero(&dest_addr,sizeof(dest_addr));
    //設(shè)置協(xié)議家族類型為    AF_INET    
    dest_addr.sin_family = AF_INET;
    /*判斷是主機(jī)名還是ip地址*/
    if( inaddr = inet_addr(argv[1]) == INADDR_NONE)
    {
        //通過dns取得ip地址
        if((host = gethostbyname(argv[1]) )==NULL) /*是主機(jī)名*/
        {
            perror("gethostbyname error");
            exit(1);
        }
        memcpy( (char *)&dest_addr.sin_addr,host->h_addr,host->h_length);
    }
    else
    { /*是ip地址*/
        inaddr = inet_addr(argv[1]);
        memcpy( (char *)&dest_addr.sin_addr,(char *)&inaddr,sizeof(inaddr));
    }
    /*獲取main的進(jìn)程id,用于設(shè)置ICMP的標(biāo)志符*/
    pid=getpid();
    printf("PING %s(%s): %d bytes data in ICMP packets.\n",argv[1],inet_ntoa(dest_addr.sin_addr),datalen);
    send_packet(); /*發(fā)送所有ICMP報(bào)文*/
    recv_packet(); /*接收所有ICMP報(bào)文*/
    statistics(SIGALRM); /*進(jìn)行統(tǒng)計(jì)*/
    return 0;
}

 

PING的流程在上面已經有詳細的注釋了,我就不說了
PING程序的主要流程分為3個步驟
1. 建立一個socket結構                 ->socket
2. 用這個socket發送ICMP包           ->sendto
3. 用這個socket接收ICMP包           ->recvfrom

由于是PING本機,所以在TCP/IP協議棧中會有4個部分的內容
1. 建立socket
2. 通過socket發送ICMP包
3. 本機收到ICMP包后發送應答
4. 通過socket接收ICMP包

下面我們就來進入TCP/IP協議棧來看看這3個系統調用如何為我們的PING程序服務的

首先是第1部分,建立一個socket結構

sockfd = socket(AF_INET,SOCK_RAW,protocol->p_proto)
這個函數會執行系統調用sys_socketcall
sys_socketcall在/net/socket.c中

/*
 * Multiplexer for the socket system calls: 'call' selects the operation and
 * 'args' points to that operation's arguments in user space.
 * Only the three cases used by the ping program are shown; the dotted lines
 * mark cases elided from this excerpt.
 * Returns the selected sys_* handler's result, -EINVAL for an unknown call
 * number, or -EFAULT when the arguments cannot be copied from user space.
 */
asmlinkage long sys_socketcall(int call, unsigned long __user *args)
{
    unsigned long a[6];
    unsigned long a0, a1;
    int err;

    // Reject call numbers outside the range 1..SYS_RECVMSG
    if (call < 1 || call > SYS_RECVMSG)
        return -EINVAL;
    /* copy_from_user should be SMP safe. */
    // Copy nargs[call] bytes of arguments from user space into a[]
    if (copy_from_user(a, args, nargs[call]))
        return -EFAULT;
    // Pass the argument count and values to audit_socketcall(); stop on error
    err = audit_socketcall(nargs[call] / sizeof(unsigned long), a);
    if (err)
        return err;
    a0 = a[0];
    a1 = a[1];
    switch (call) {
    case SYS_SOCKET:
        err = sys_socket(a0, a1, a[2]);
        break;
    .........................
    case SYS_SENDTO:
        err = sys_sendto(a0, (void __user *)a1, a[2], a[3],
                 (struct sockaddr __user *)a[4], a[5]);
        break;
    ...............................
    case SYS_RECVFROM:
        err = sys_recvfrom(a0, (void __user *)a1, a[2], a[3],
                 (struct sockaddr __user *)a[4],
                 (int __user *)a[5]);
        break;
    default:
        err = -EINVAL;
        break;
    }
    return err;
}

上面只列出了我們所用到的3個case
現在我們的目標是case SYS_SOCKET,也就是要創建一個socket了

sys_socket在/net/socket.c中

/*
 * Create a socket and attach it to a file descriptor.
 * Returns the result of sock_map_fd() on success, or the negative error
 * from sock_create()/sock_map_fd(); the socket is released if mapping fails.
 */
asmlinkage long sys_socket(int family, int type, int protocol)
{
    int retval;
    struct socket *sock;
    // Create the socket
    retval = sock_create(family, type, protocol, &sock);
    if (retval < 0)
        goto out;
    // Map the socket to a file descriptor
    retval = sock_map_fd(sock);
    if (retval < 0)
        goto out_release;
out:
    /* It may be already another descriptor 8) Not kernel problem. */
    return retval;
out_release:
    sock_release(sock);
    return retval;
}

很簡單的調用

sock_create在/net/socket.c中

/*
 * Thin wrapper around __sock_create(): supplies the network namespace of the
 * current task (current->nsproxy->net_ns) and passes 0 as the 'kern' argument.
 */
int sock_create(int family, int type, int protocol, struct socket **res)
{
    return __sock_create(current->nsproxy->net_ns, family, type, protocol, res, 0);
}

繼續,進入到__sock_create中

 

/*
 * Allocate a struct socket and let the protocol family registered for
 * 'family' initialise it through its ->create hook.
 * On success stores the socket in *res and returns 0; otherwise returns a
 * negative errno value, releasing the socket if it had been allocated.
 */
static int __sock_create(struct net *net, int family, int type, int protocol,
             struct socket **res, int kern)
{
    int err;
    struct socket *sock;
    const struct net_proto_family *pf;
    /*
     * Check protocol is in range
     */

    // Reject a protocol family outside 0..NPROTO-1
    if (family < 0 || family >= NPROTO)
        return -EAFNOSUPPORT;
    // Reject a socket type outside 0..SOCK_MAX-1
    if (type < 0 || type >= SOCK_MAX)
        return -EINVAL;
    /* Compatibility.
     This uglymoron is moved from INET layer to here to avoid
     deadlock in module load.
     */

     // The obsolete (PF_INET, SOCK_PACKET) pair is rewritten to PF_PACKET,
     // with a one-time log message
    if (family == PF_INET && type == SOCK_PACKET)
    {
        static int warned;
        if (!warned)
        {
            warned = 1;
            printk(KERN_INFO "%s uses obsolete (PF_INET,SOCK_PACKET)\n",
             current->comm);
        }
        family = PF_PACKET;
    }
    err = security_socket_create(family, type, protocol, kern);
    if (err)
        return err;
    /*
     *    Allocate the socket and allow the family to set things up. if
     *    the protocol is 0, the family is instructed to select an appropriate
     *    default.
     */

     // Allocate a socket
    sock = sock_alloc();
    // Allocation failed?
    if (!sock)
    {
        if (net_ratelimit())
            printk(KERN_WARNING "socket: no more sockets\n");
        return -ENFILE;    /* Not exactly a match, but its the
                 closest posix thing */

    }
    // Record the socket type
    sock->type = type;
#if defined(CONFIG_KMOD)
    /* Attempt to load a protocol module if the find failed.
     *
     * 12/09/1996 Marcin: But! this makes REALLY only sense, if the user
     * requested real, full-featured networking support upon configuration.
     * Otherwise module support will break!
     */

    if (net_families[family] == NULL)
        request_module("net-pf-%d", family);
#endif
    rcu_read_lock();
    // Look up the protocol-family structure registered for this family
    pf = rcu_dereference(net_families[family]);
    err = -EAFNOSUPPORT;
    // No family registered
    if (!pf)
        goto out_release;
    /*
     * We will call the ->create function, that possibly is in a loadable
     * module, so we have to bump that loadable module refcnt first.
     */

     // Take a reference on the module that owns the family
    if (!try_module_get(pf->owner))
        goto out_release;
    /* Now protected by module ref count */
    rcu_read_unlock();
    // Run the family's socket initialisation hook
    err = pf->create(net, sock, protocol);
    // Initialisation failed?
    if (err < 0)
        goto out_module_put;
    /*
     * Now to bump the refcnt of the [loadable] module that owns this
     * socket at sock_release time we decrement its refcnt.
     */

     // Take a reference on the module that owns the socket's operations
    if (!try_module_get(sock->ops->owner))
        goto out_module_busy;
    /*
     * Now that we're done with the ->create function, the [loadable]
     * module can have its refcnt decremented
     */

     // Drop the reference on the family's module taken above
    module_put(pf->owner);
    err = security_socket_post_create(sock, family, type, protocol, kern);
    if (err)
        goto out_sock_release;
    // Hand the initialised socket back to the caller
    *res = sock;
    return 0;
out_module_busy:
    err = -EAFNOSUPPORT;
out_module_put:
    sock->ops = NULL;
    module_put(pf->owner);
out_sock_release:
    sock_release(sock);
    return err;
out_release:
    rcu_read_unlock();
    goto out_sock_release;
}

security_socket_create,關于security的內容我們都略過,一來減少框架的復雜度,二來我也不知道security主要做的是啥 哈哈 不過可以肯定的是不會妨礙TCP/IP協議棧的正常運行

首先是sock_alloc
sock_alloc在/net/socket.c中

/*
 * Allocate a new inode on the sock_mnt superblock and return the struct
 * socket associated with it (via SOCKET_I), or NULL if no inode is available.
 * The inode is marked as a socket with rwx permissions for user, group and
 * other, and is owned by the current task's fsuid/fsgid.
 */
static struct socket *sock_alloc(void)
{
    struct inode *inode;
    struct socket *sock;
    inode = new_inode(sock_mnt->mnt_sb);
    if (!inode)
        return NULL;
    sock = SOCKET_I(inode);
    inode->i_mode = S_IFSOCK | S_IRWXUGO;
    inode->i_uid = current->fsuid;
    inode->i_gid = current->fsgid;
    /* Bump the per-CPU sockets_in_use counter. */
    get_cpu_var(sockets_in_use)++;
    put_cpu_var(sockets_in_use);
    return sock;
}

主要是申請一個新的socket,并對他的文件屬性進行初始化,socket是屬于虛擬文件系統的一部分,我們暫時只要知道這一點就好了

回到__sock_create中,然后到
pf = rcu_dereference(net_families[family]);
net_families的初始化我們也不分析,因為涉及的面太廣,為了緊扣PING,我們只需要知道得到了inet_family_ops這個結構就可以了,詳細的初始化部分在/net/ipv4/af_inet.c中,大家有興趣的可以看看
inet_family_ops的結構如下

/*
 * Protocol-family descriptor for PF_INET: its ->create hook is inet_create,
 * which __sock_create() invokes through pf->create.
 */
static struct net_proto_family inet_family_ops = {
    .family = PF_INET,
    .create = inet_create,
    .owner    = THIS_MODULE,
};

緊接著我們就到了
err = pf->create(net, sock, protocol);
調用inet_family_ops的create函數

inet_create在/net/ipv4/af_inet.c中

/*
 * ->create hook of inet_family_ops: find the inet_protosw entry matching the
 * socket's type and the requested protocol, allocate the struct sock for it
 * and initialise the inet-specific fields.
 * Returns 0 on success or a negative errno value.
 */
static int inet_create(struct net *net, struct socket *sock, int protocol)
{
    struct sock *sk;
    struct list_head *p;
    struct inet_protosw *answer;
    struct inet_sock *inet;
    struct proto *answer_prot;
    unsigned char answer_flags;
    char answer_no_check;
    int try_loading_module = 0;
    int err;

    // For socket types other than SOCK_RAW and SOCK_DGRAM, call
    // build_ehash_secret() if inet_ehash_secret is still zero
    if (sock->type != SOCK_RAW &&
     sock->type != SOCK_DGRAM &&
     !inet_ehash_secret)
        build_ehash_secret();
    // Mark the socket as unconnected
    sock->state = SS_UNCONNECTED;
    /* Look for the requested type/protocol pair. */
    // No matching entry found yet
    answer = NULL;
lookup_protocol:
    err = -ESOCKTNOSUPPORT;
    rcu_read_lock();
    // Walk the protocol entries registered for this socket type
    list_for_each_rcu(p, &inetsw[sock->type])
    {
        // Get the inet_protosw that embeds this list node
        answer = list_entry(p, struct inet_protosw, list);
        /* Check the non-wild match. */
        // Requested protocol equals this entry's protocol
        if (protocol == answer->protocol)
        {
            // ...and it is not the IPPROTO_IP wildcard
            if (protocol != IPPROTO_IP)
                // Use this entry
                break;
        }
        else
        {
            /* Check for the two wild cases. */
            // The caller asked for the wildcard IPPROTO_IP
            if (IPPROTO_IP == protocol)
            {
                // Adopt this entry's protocol as the requested one
                protocol = answer->protocol;
                // Use this entry
                break;
            }
            // This entry is itself registered with IPPROTO_IP
            if (IPPROTO_IP == answer->protocol)
                // Use this entry
                break;
        }
        err = -EPROTONOSUPPORT;
        // No match: reset before looking at the next entry
        answer = NULL;
    }
    // Nothing matched
    if (unlikely(answer == NULL))
    {
        if (try_loading_module < 2)
        {
            rcu_read_unlock();
            /*
             * Be more specific, e.g. net-pf-2-proto-132-type-1
             * (net-pf-PF_INET-proto-IPPROTO_SCTP-type-SOCK_STREAM)
             */

            if (++try_loading_module == 1)
                request_module("net-pf-%d-proto-%d-type-%d",
                     PF_INET, protocol, sock->type);
            /*
             * Fall back to generic, e.g. net-pf-2-proto-132
             * (net-pf-PF_INET-proto-IPPROTO_SCTP)
             */

            else
                request_module("net-pf-%d-proto-%d",
                     PF_INET, protocol);
            goto lookup_protocol;
        }
        else
            goto out_rcu_unlock;
    }
    err = -EPERM;
    if (answer->capability > 0 && !capable(answer->capability))
        goto out_rcu_unlock;
    err = -EAFNOSUPPORT;
    if (!inet_netns_ok(net, protocol))
        goto out_rcu_unlock;
    // Use the matched entry's socket-level operations
    sock->ops = answer->ops;
    answer_prot = answer->prot;
    answer_no_check = answer->no_check;
    answer_flags = answer->flags;
    rcu_read_unlock();
    BUG_TRAP(answer_prot->slab != NULL);
    err = -ENOBUFS;
    // Allocate the struct sock
    sk = sk_alloc(net, PF_INET, GFP_KERNEL, answer_prot);
    // Allocation failed?
    if (sk == NULL)
        goto out;
    err = 0;
    sk->sk_no_check = answer_no_check;
    if (INET_PROTOSW_REUSE & answer_flags)
        sk->sk_reuse = 1;
    // View the sock as an inet_sock
    inet = inet_sk(sk);
    inet->is_icsk = (INET_PROTOSW_ICSK & answer_flags) != 0;
    // Raw socket?
    if (SOCK_RAW == sock->type)
    {
        // For raw sockets, num holds the protocol number
        inet->num = protocol;
        // IPPROTO_RAW sets hdrincl (presumably: the caller supplies the IP header)
        if (IPPROTO_RAW == protocol)
            inet->hdrincl = 1;
    }
    if (ipv4_config.no_pmtu_disc)
        inet->pmtudisc = IP_PMTUDISC_DONT;
    else
        inet->pmtudisc = IP_PMTUDISC_WANT;

    inet->id = 0;
    // Initialise the sock and link it with the socket
    sock_init_data(sock, sk);
    // Destructor callback for the sock
    sk->sk_destruct     = inet_sock_destruct;
    // Protocol family of the sock
    sk->sk_family     = PF_INET;
    // Protocol of the sock
    sk->sk_protocol     = protocol;
    sk->sk_backlog_rcv = sk->sk_prot->backlog_rcv;
    inet->uc_ttl    = -1;
    inet->mc_loop    = 1;
    inet->mc_ttl    = 1;
    inet->mc_index    = 0;
    inet->mc_list    = NULL;
    sk_refcnt_debug_inc(sk);
    // num is non-zero (it was set above for raw sockets)
    if (inet->num)
    {
        /* It assumes that any protocol which allows
         * the user to assign a number at socket
         * creation time automatically
         * shares.
         */

         // Store num in network byte order as sport
        inet->sport = htons(inet->num);
        /* Add to protocol hash chains. */
        sk->sk_prot->hash(sk);
    }
    // The protocol provides an init hook
    if (sk->sk_prot->init)
    {
        // Run it; release the sock again if it fails
        err = sk->sk_prot->init(sk);
        if (err)
            sk_common_release(sk);
    }
out:
    return err;
out_rcu_unlock:
    rcu_read_unlock();
    goto out;
}

inetsw結構的注冊不關心,我們看結果

 

answer就是其中的第二項

這里我們的protocol為IPPROTO_ICMP
answer->protocol為IPPROTO_IP

所以是進入了if (IPPROTO_IP == answer->protocol)后break跳出了循環

之后到inet_netns_ok
inet_netns_ok在/net/ipv4/af_inet.c中

 

/*
 * Decide whether 'protocol' may be used in network namespace 'net'.
 * Always returns 1 for init_net and for protocols with no inet_protos entry;
 * otherwise returns that entry's netns_ok flag.
 */
static inline int inet_netns_ok(struct net *net, int protocol)
{
    int hash;
    struct net_protocol *ipprot;
    if (net == &init_net)
        return 1;
    // Index into inet_protos: the protocol number masked to the table size
    hash = protocol & (MAX_INET_PROTOS - 1);
    // Fetch the protocol handler stored in that slot
    ipprot = rcu_dereference(inet_protos[hash]);
    // No handler registered
    if (ipprot == NULL)
        /* raw IP is OK */
        return 1;
    return ipprot->netns_ok;
}

由于在__sock_create中我們傳入的net類型為init_net,所以這里是返回1,不會goto out_rcu_unlock結束的

繼續在inet_create中向下走,來到了sk_alloc
sk_alloc在/net/core/sock.c中

/*
 * Allocate a zeroed struct sock for protocol 'prot' and perform the basic
 * initialisation: family, protocol pointers, lock and network namespace.
 * Returns the new sock, or NULL if sk_prot_alloc() fails.
 */
struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
         struct proto *prot)
{
    struct sock *sk;
    // Allocate the sock, requesting zeroed memory
    sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
    // Allocation succeeded
    if (sk)
    {
        // Record the protocol family
        sk->sk_family = family;
        /*
         * See comment in struct sock definition to understand
         * why we need sk_prot_creator -acme
         */

         // Record the protocol operations (also remembered as the creator)
        sk->sk_prot = sk->sk_prot_creator = prot;
        sock_lock_init(sk);
        sock_net_set(sk, get_net(net));
    }
    return sk;
}

sk_prot_alloc在協議結構的高速緩存中分配一個sock結構,分配成功后進行一些簡單的初始化操作便退出了

繼續向下走,到sock_init_data
sock_init_data在/net/core/sock.c中

/*
 * Set the generic fields of 'sk' to their defaults and, when 'sock' is not
 * NULL, link the socket and the sock to each other.
 */
void sock_init_data(struct socket *sock, struct sock *sk)
{
    // Initialise the skb receive queue
    skb_queue_head_init(&sk->sk_receive_queue);
    // Initialise the skb write (send) queue
    skb_queue_head_init(&sk->sk_write_queue);
    // Initialise the skb error queue
    skb_queue_head_init(&sk->sk_error_queue);
#ifdef CONFIG_NET_DMA
    skb_queue_head_init(&sk->sk_async_wait_queue);
#endif
    sk->sk_send_head    =    NULL;
    init_timer(&sk->sk_timer);
    sk->sk_allocation    =    GFP_KERNEL;
    sk->sk_rcvbuf        =    sysctl_rmem_default;
    sk->sk_sndbuf        =    sysctl_wmem_default;
    sk->sk_state        =    TCP_CLOSE;
    // Point the sock at its socket
    sk->sk_socket        =    sock;
    sock_set_flag(sk, SOCK_ZAPPED);
    // A socket was supplied
    if (sock)
    {
        // Copy the socket type into the sock
        sk->sk_type    =    sock->type;
        // Use the socket's wait queue
        sk->sk_sleep    =    &sock->wait;
        // Point the socket at its sock
        sock->sk    =    sk;
    }
    else
        // No socket: no wait queue
        sk->sk_sleep    =    NULL;
    rwlock_init(&sk->sk_dst_lock);
    rwlock_init(&sk->sk_callback_lock);
    lockdep_set_class_and_name(&sk->sk_callback_lock,
            af_callback_keys + sk->sk_family,
            af_family_clock_key_strings[sk->sk_family]);
    // Default callback for state changes
    sk->sk_state_change    =    sock_def_wakeup;
    // Default callback for "data ready"
    sk->sk_data_ready    =    sock_def_readable;
    sk->sk_write_space    =    sock_def_write_space;
    // Default callback for error reports
    sk->sk_error_report    =    sock_def_error_report;
    // Default destructor
    sk->sk_destruct        =    sock_def_destruct;
    // Page used to buffer data being sent
    sk->sk_sndmsg_page    =    NULL;
    // Offset into that page
    sk->sk_sndmsg_off    =    0;
    sk->sk_peercred.pid     =    0;
    sk->sk_peercred.uid    =    -1;
    sk->sk_peercred.gid    =    -1;
    sk->sk_write_pending    =    0;
    sk->sk_rcvlowat        =    1;
    sk->sk_rcvtimeo        =    MAX_SCHEDULE_TIMEOUT;
    sk->sk_sndtimeo        =    MAX_SCHEDULE_TIMEOUT;
    sk->sk_stamp = ktime_set(-1L, 0);
    atomic_set(&sk->sk_refcnt, 1);
    atomic_set(&sk->sk_drops, 0);
}

這是個大家伙,負責sock結構的詳細初始化
初始化完成后繼續inet_create的執行
由于之前設置了inet->num為協議號,這里會執行sk->sk_prot->hash
在進入這個函數之前讓我們先來看一下目前sock的結構
 
sk_prot為一個宏  #define sk_prot __sk_common.skc_prot
指向了raw_prot,所以sk->sk_prot->hash就是執行了raw_hash_sk
raw_hash_sk在/net/ipv4/raw.c中

/*
 * Insert 'sk' into the raw-socket hash table of its protocol
 * (sk->sk_prot->h.raw_hash). The bucket is chosen from the sock's num
 * masked to the table size, and the insertion runs under the table's
 * write lock.
 */
void raw_hash_sk(struct sock *sk)
{
    struct raw_hashinfo *h = sk->sk_prot->h.raw_hash;
    struct hlist_head *head;
    head = &h->ht[inet_sk(sk)->num & (RAW_HTABLE_SIZE - 1)];
    write_lock_bh(&h->lock);
    sk_add_node(sk, head);
    sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
    write_unlock_bh(&h->lock);
}

主要是將sock掛到raw_prot的哈希表(h.raw_hash)對應的鏈表中,如下圖

 
因為raw_prot是有raw_init這個函數的,所以我們進入到sk->sk_prot->init
raw_init在/net/ipv4/raw.c中

/*
 * Protocol init hook for raw sockets: for an ICMP raw socket, clear the
 * socket's filter member. Always returns 0.
 */
static int raw_init(struct sock *sk)
{
    // View the sock as a raw_sock
    struct raw_sock *rp = raw_sk(sk);
    // num holds the protocol number for raw sockets; is it ICMP?
    if (inet_sk(sk)->num == IPPROTO_ICMP)
        // Zero the filter structure
        memset(&rp->filter, 0, sizeof(rp->filter));
    return 0;
}

結構圖如下

 
為什么能一直這樣強制轉換下去,就不怕結構超界么?
其實這是一早有預謀的,在raw_prot中有一個成員為
.obj_size    = sizeof(struct raw_sock)
而在協議中分配空間的時候就已經分配了raw_sock所需要的空間,我們一直在用他的一部分而已
好, 到這里inet_create就完成了,一路返回到sys_socket中
執行最后一步,把初始化好的socket結構映射到一個文件描述符中,并返回這個文件描述符
這樣,我們的ping程序的sockfd就拿到了一個按要求初始化好的socket結構索引號了
在之后的sendto和recvfrom操作中就能夠使用這個索引號進行發送和接收了
然后到第2部分,發送初始化好的icmp結構
sendto(sockfd,sendpacket,packetsize,0,(struct sockaddr *)&dest_addr,sizeof(dest_addr))
繼續來到系統調用sys_socketcall中
這次我們的目標是case SYS_SENDTO
sys_sendto在/net/socket.c中

/*
 * sendto() system call: wrap the user buffer and the optional destination
 * address in a struct msghdr and pass it to sock_sendmsg().
 * Returns the result of sock_sendmsg(), or a negative error if the socket
 * lookup or the address copy fails.
 */
asmlinkage long sys_sendto(int fd, void __user *buff, size_t len,
             unsigned flags, struct sockaddr __user *addr,
             int addr_len)
{
    struct socket *sock;
    char address[MAX_SOCK_ADDR];
    int err;
    struct msghdr msg;
    struct iovec iov;
    int fput_needed;
    // Look up the socket behind the file descriptor
    sock = sockfd_lookup_light(fd, &err, &fput_needed);
    if (!sock)
        goto out;
    // Start address of the data to send
    iov.iov_base = buff;
    // Length of the data to send
    iov.iov_len = len;
    msg.msg_name = NULL;
    // Attach the single iovec to the message
    msg.msg_iov = &iov;
    msg.msg_iovlen = 1;
    msg.msg_control = NULL;
    msg.msg_controllen = 0;
    msg.msg_namelen = 0;
    // A destination address was supplied
    if (addr)
    {
        // Copy the address from user space into the kernel buffer
        err = move_addr_to_kernel(addr, addr_len, address);
        if (err < 0)
            goto out_put;
        // Destination address
        msg.msg_name = address;
        // Destination address length
        msg.msg_namelen = addr_len;
    }
    // A non-blocking file makes this send non-blocking as well
    if (sock->file->f_flags & O_NONBLOCK)
        flags |= MSG_DONTWAIT;
    msg.msg_flags = flags;
    err = sock_sendmsg(sock, &msg, len);
out_put:
    fput_light(sock->file, fput_needed);
out:
    return err;
}

初始化好的msg結構如下

 

 

在iovec結構中保存了我們要發送數據的首地址和大小

然后進入到sock_sendmsg
sock_sendmsg在/net/socket.c中

/*
 * Send 'msg' on 'sock' using an on-stack kiocb, waiting for completion when
 * __sock_sendmsg() reports the request as queued (-EIOCBQUEUED).
 * Returns the result of the send.
 */
int sock_sendmsg(struct socket *sock, struct msghdr *msg, size_t size)
{
    struct kiocb iocb;
    struct sock_iocb siocb;
    int ret;

    init_sync_kiocb(&iocb, NULL);
    /* Attach the socket-specific control block to the kiocb. */
    iocb.private = &siocb;
    ret = __sock_sendmsg(&iocb, sock, msg, size);
    if (-EIOCBQUEUED == ret)
        ret = wait_on_sync_kiocb(&iocb);
    return ret;
}

我不大明白kiocb的用處,google也不是說得很清楚,大概就是說關于文件同步操作方面上的,請明白的同學們指教一下 = 3=)/ 感謝  這里就不把kiocb的結構畫進來了

然后進入到__sock_sendmsg
__sock_sendmsg在/net/socket.c中

/*
 * Record the socket, message and size in the sock_iocb attached to 'iocb',
 * run the security hook, then dispatch to the socket's ->sendmsg operation.
 * Returns the security hook's error if it is non-zero, otherwise the result
 * of ->sendmsg.
 */
static inline int __sock_sendmsg(struct kiocb *iocb, struct socket *sock,
                 struct msghdr *msg, size_t size)
{
    struct sock_iocb *si = kiocb_to_siocb(iocb);
    int err;

    // Remember the socket
    si->sock = sock;
    si->scm = NULL;
    // Remember the message
    si->msg = msg;
    // Remember the number of bytes to send
    si->size = size;
    err = security_socket_sendmsg(sock, msg, size);
    if (err)
        return err;
    return sock->ops->sendmsg(iocb, sock, msg, size);
}

連接完成后的結構圖如下

 
sock->ops->sendmsg調用的為inet_sockraw_ops中的sendmsg操作,也就是inet_sendmsg函數
inet_sendmsg在/net/ipv4/af_inet.c中

/*
 * Socket-level sendmsg for inet sockets: make sure the sock has a local
 * number, then delegate to the protocol's ->sendmsg.
 * Returns -EAGAIN if inet_autobind() reports failure, otherwise the
 * protocol's result.
 */
int inet_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
         size_t size)
{
    struct sock *sk = sock->sk;
    /* We may need to bind the socket. */
    // If num is still zero, try inet_autobind(); a non-zero result aborts the send
    if (!inet_sk(sk)->num && inet_autobind(sk))
        return -EAGAIN;
    return sk->sk_prot->sendmsg(iocb, sk, msg, size);
}

我們?cè)谥耙呀?jīng)設(shè)置了端口號(hào),所以這里直接來(lái)到了sk->sk_prot->sendmsg
sk->sk_prot->sendmsg調(diào)用的是raw_prot中的sendmsg操作,也就是raw_setsockopt函數(shù)

raw_setsockopt在/net/ipv4/raw.c中

/*
 * Protocol-level sendmsg for raw sockets: validate the request, determine
 * the destination, look up a route and queue/transmit the data.
 * Returns 'len' on success or a negative errno value.
 */
static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
         size_t len)
{
    struct inet_sock *inet = inet_sk(sk);
    struct ipcm_cookie ipc;
    struct rtable *rt = NULL;
    int free = 0;
    __be32 daddr;
    __be32 saddr;
    u8 tos;
    int err;

    err = -EMSGSIZE;
    // Reject data longer than 0xFFFF bytes
    if (len > 0xFFFF)
        goto out;
    /*
     *    Check the flags.
     */

    err = -EOPNOTSUPP;
    if (msg->msg_flags & MSG_OOB)    /* Mirror BSD error message */
        goto out; /* compatibility */
    /*
     *    Get and verify the address.
     */

    // A destination address was supplied with the message
    if (msg->msg_namelen)
    {
        // Interpret the address as a sockaddr_in
        struct sockaddr_in *usin = (struct sockaddr_in*)msg->msg_name;
        err = -EINVAL;
        // Address too short to be a sockaddr_in
        if (msg->msg_namelen < sizeof(*usin))
            goto out;
        // Family is not AF_INET
        if (usin->sin_family != AF_INET)
        {
            static int complained;
            if (!complained++)
                printk(KERN_INFO "%s forgot to set AF_INET in "
                         "raw sendmsg. Fix it!\n",
                         current->comm);
            err = -EAFNOSUPPORT;
            // A zero family is tolerated; any other non-AF_INET value is an error
            if (usin->sin_family)
                goto out;
        }
        // Destination IP address
        daddr = usin->sin_addr.s_addr;
        /* ANK: I did not forget to get protocol from port field.
         * I just do not know, who uses this weirdness.
         * IP_HDRINCL is much more convenient.
         */

    }
    else
    {
        // No address given: only allowed in the TCP_ESTABLISHED state,
        // in which case the sock's stored daddr is used
        err = -EDESTADDRREQ;
        if (sk->sk_state != TCP_ESTABLISHED)
            goto out;
        daddr = inet->daddr;
    }
    ipc.addr = inet->saddr;
    ipc.opt = NULL;
    ipc.oif = sk->sk_bound_dev_if;
    // Control (ancillary) data was supplied
    if (msg->msg_controllen)
    {
        err = ip_cmsg_send(sock_net(sk), msg, &ipc);
        if (err)
            goto out;
        // Remember to kfree() the options at 'done'
        if (ipc.opt)
            free = 1;
    }
    saddr = ipc.addr;
    ipc.addr = daddr;
    // No options came with the message
    if (!ipc.opt)
        // Fall back to the options stored in the inet_sock
        ipc.opt = inet->opt;
    // Options are present
    if (ipc.opt)
    {
        err = -EINVAL;
        /* Linux does not mangle headers on raw sockets,
         * so that IP options + IP_HDRINCL is non-sense.
         */

        if (inet->hdrincl)
            goto done;
        if (ipc.opt->srr)
        {
            if (!daddr)
                goto done;
            daddr = ipc.opt->faddr;
        }
    }
    // Type-of-service / routing flags for the route lookup
    tos = RT_CONN_FLAGS(sk);
    if (msg->msg_flags & MSG_DONTROUTE)
        tos |= RTO_ONLINK;
    // Multicast destination: default the interface and source from the mc_* fields
    if (ipv4_is_multicast(daddr))
    {
        if (!ipc.oif)
            ipc.oif = inet->mc_index;
        if (!saddr)
            saddr = inet->mc_addr;
    }
    // Route lookup
    {
        struct flowi fl = { .oif = ipc.oif,
                 .mark = sk->sk_mark,
                 .nl_u = { .ip4_u =
                     { .daddr = daddr,
                        .saddr = saddr,
                        .tos = tos } },
                 .proto = inet->hdrincl ? IPPROTO_RAW :
                             sk->sk_protocol,
                 };
        if (!inet->hdrincl)
        {
            err = raw_probe_proto_opt(&fl, msg);
            if (err)
                goto done;
        }
        security_sk_classify_flow(sk, &fl);
        err = ip_route_output_flow(sock_net(sk), &rt, &fl, sk, 1);
    }
    if (err)
        goto done;
    err = -EACCES;
    // A broadcast route requires SOCK_BROADCAST to be set on the sock
    if (rt->rt_flags & RTCF_BROADCAST && !sock_flag(sk, SOCK_BROADCAST))
        goto done;
    if (msg->msg_flags & MSG_CONFIRM)
        goto do_confirm;
back_from_confirm:
    if (inet->hdrincl)
    {
        err = raw_send_hdrinc(sk, msg->msg_iov, len,rt, msg->msg_flags);
    }
    else
    {
        if (!ipc.addr)
            ipc.addr = rt->rt_dst;
        lock_sock(sk);
        // Copy the data to be sent into skbs
        err = ip_append_data(sk, ip_generic_getfrag, msg->msg_iov, len, 0,
                    &ipc, rt, msg->msg_flags);
        // Copy failed?
        if (err)
            // Drop every skb pending on the sock
            ip_flush_pending_frames(sk);
        else if (!(msg->msg_flags & MSG_MORE))
            // Transmit the pending skbs
            err = ip_push_pending_frames(sk);
        release_sock(sk);
    }
done:
    if (free)
        kfree(ipc.opt);
    ip_rt_put(rt);
out:
    if (err < 0)
        return err;
    return len;
do_confirm:
    dst_confirm(&rt->u.dst);
    if (!(msg->msg_flags & MSG_PROBE) || len)
        goto back_from_confirm;
    err = 0;
    goto done;
}

這里最關鍵的就是
err = ip_route_output_flow(sock_net(sk), &rt, &fl, sk, 1);
這是一個路由表查詢函數

無能為力........

不過我根據DEBUG的信息把查詢結果畫了出來,分別為ipcm_cookie和rtable兩個結構,其中最關鍵的為rtable中的dst_entry

 
rtable中的idev連接lo這個環回虛擬網卡設備
lo網卡的注冊在/drivers/net/loopback.c中
由于牽涉到路由表的添加問題,我這里就不介紹他的注冊了
現在回到raw_sendmsg,進入ip_append_data, ip_append_data負責將要發送的數據組裝到sk_buff結構中










    本站是提供個(gè)人知識(shí)管理的網(wǎng)絡(luò)存儲(chǔ)空間,所有內(nèi)容均由用戶發(fā)布,不代表本站觀點(diǎn)。請(qǐng)注意甄別內(nèi)容中的聯(lián)系方式、誘導(dǎo)購(gòu)買等信息,謹(jǐn)防詐騙。如發(fā)現(xiàn)有害或侵權(quán)內(nèi)容,請(qǐng)點(diǎn)擊一鍵舉報(bào)。
    轉(zhuǎn)藏 分享 獻(xiàn)花(0

    0條評(píng)論

    發(fā)表

    請(qǐng)遵守用戶 評(píng)論公約

    類似文章 更多