在ULNI上圖中寫得清楚,此函數一般由tcp或sctp調用。上層工作都已經做好了,只差ip頭及以下部分的填充。tcp可以做到這一點,因為有mss的限制以及自己的一些控制包大小的算法;
而udp不一樣,它的數據包分段沒有完成,需要下層幫忙,因此udp使用的ip_append_data函數要複雜得多。下面簡單注釋ip_queue_xmit函數:
- int ip_queue_xmit(struct sk_buff *skb, int ipfragok)
- { /* Route an outgoing skb on behalf of its owning socket (unless it is
-    * already routed), build the IPv4 header in front of the payload, and
-    * hand the packet to ip_local_out().
-    *
-    *   skb:      buffer to send; skb->sk is read to find the owning socket.
-    *   ipfragok: when non-zero, the DF flag is left clear in the header.
-    *
-    * Returns the result of ip_local_out(), or -EHOSTUNREACH (after freeing
-    * skb and bumping IPSTATS_MIB_OUTNOROUTES) when no usable route exists.
-    */
- struct sock *sk = skb->sk;
- struct inet_sock *inet = inet_sk(sk);
- struct ip_options *opt = inet->opt;
- struct rtable *rt;
- struct iphdr *iph;
- /* Skip all of this if the packet is already routed,
- * f.e. by something like SCTP.
- */
- // First check whether skb->rtable is NULL; if it is not, a route has already been assigned, so jump to packet_routed.
- // Going by the comment above, SCTP may assign the route ahead of time.
- rt = skb->rtable;
- if (rt != NULL)
- goto packet_routed;
- /* Make sure we can route this packet. */
- // Check the socket's cached route; if the check yields NULL, a fresh route lookup is required.
- rt = (struct rtable *)__sk_dst_check(sk, 0);
- if (rt == NULL) {
- __be32 daddr;
- /* Use correct destination address if we have options. */
- daddr = inet->daddr;
- if(opt && opt->srr)
- daddr = opt->faddr;
- {
- struct flowi fl = { .oif = sk->sk_bound_dev_if,
- .nl_u = { .ip4_u =
- { .daddr = daddr,
- .saddr = inet->saddr,
- .tos = RT_CONN_FLAGS(sk) } },
- .proto = sk->sk_protocol,
- .flags = inet_sk_flowi_flags(sk),
- .uli_u = { .ports =
- { .sport = inet->sport,
- .dport = inet->dport } } };
- /* If this fails, retransmit mechanism of transport layer will
- * keep trying until route appears or the connection times
- * itself out.
- */
- security_sk_classify_flow(sk, &fl);
- // Main output-route lookup; a non-zero return means no route was found. (Author's note: details deferred to the routing chapter.)
- if (ip_route_output_flow(sock_net(sk), &rt, &fl, sk, 0))
- goto no_route;
- }
- // Author's note: among other things, this call sets sk->sk_dst_cache = dst and releases the old dst cache entry (not visible here; verify in sk_setup_caps).
- sk_setup_caps(sk, &rt->u.dst);
- }
- // Attach the route to the skb; per the author, dst_clone increments the route cache entry's reference count.
- skb->dst = dst_clone(&rt->u.dst);
- packet_routed:
- // If the socket's options request strict source routing and the route's
- // destination address differs from its gateway address, go to no_route:
- // the strict source route cannot be honoured.
- if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway)
- goto no_route;
- /* OK, we know where to send it, allocate and build IP header. */
- // Reserve room at the front of the skb data for the IP header plus any options,
- // and make the skb's network header point at it.
- skb_push(skb, sizeof(struct iphdr) + (opt ? opt->optlen : 0));
- skb_reset_network_header(skb);
- iph = ip_hdr(skb);
- // Write the first 16 bits of the header in one store: version 4, header length 5
- // (20 bytes; increased below by the option length), and the type of service.
- *((__be16 *)iph) = htons((4 << 12) | (5 << 8) | (inet->tos & 0xff));
- // If ip_dont_fragment() says the socket must not fragment (author's note: decided from the
- // socket's pmtudisc setting, i.e. path-MTU discovery implies no fragmentation) and ipfragok is 0,
- // set the DF flag; otherwise clear the fragment field.
- if (ip_dont_fragment(sk, &rt->u.dst) && !ipfragok)
- iph->frag_off = htons(IP_DF);
- else
- iph->frag_off = 0;
- // Fill in ttl (author's note: taken from the socket's uc_ttl, or from the route metrics if that is negative),
- // protocol (from sk->sk_protocol), and source/destination addresses (both taken from the route entry).
- iph->ttl = ip_select_ttl(inet, &rt->u.dst);
- iph->protocol = sk->sk_protocol;
- iph->saddr = rt->rt_src;
- iph->daddr = rt->rt_dst;
- /* Transport layer set skb->h.foo itself. */
- // If options are present with non-zero length, add their length (in 32-bit words) to ihl
- // and call ip_options_build to write the options into the IP header.
- if (opt && opt->optlen) {
- iph->ihl += opt->optlen >> 2;
- // Worth reading (author's note); opt here is the one taken from the inet_sock.
- ip_options_build(skb, opt, inet->daddr, rt, 0);
- }
- // Call ip_select_ident_more to fill in the header's id field.
- // Author's note (per ULNI): to limit id wraparound, Linux keeps an inet_peer
- // structure per peer IP and tracks the id counter for that IP in it,
- // which greatly slows wraparound but cannot rule it out entirely.
- ip_select_ident_more(iph, &rt->u.dst, sk,
- (skb_shinfo(skb)->gso_segs ?: 1) - 1);
- skb->priority = sk->sk_priority;
- skb->mark = sk->sk_mark;
- return ip_local_out(skb);
- no_route:
- IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
- kfree_skb(skb);
- return -EHOSTUNREACH;
- }
|