ipv4: Adjust semantics of rt->rt_gateway.
[linux-3.10.git] / net/ipv4/ip_output.c
index 167da8ba416ac3ea4e83ef9135225970daa73ad9..4494015f7e32d8c00df9923cf8478812ae2d7c50 100644
@@ -43,7 +43,6 @@
  */
 
 #include <asm/uaccess.h>
-#include <asm/system.h>
 #include <linux/module.h>
 #include <linux/types.h>
 #include <linux/kernel.h>
@@ -114,18 +113,6 @@ int ip_local_out(struct sk_buff *skb)
 }
 EXPORT_SYMBOL_GPL(ip_local_out);
 
-/* dev_loopback_xmit for use with netfilter. */
-static int ip_dev_loopback_xmit(struct sk_buff *newskb)
-{
-       skb_reset_mac_header(newskb);
-       __skb_pull(newskb, skb_network_offset(newskb));
-       newskb->pkt_type = PACKET_LOOPBACK;
-       newskb->ip_summed = CHECKSUM_UNNECESSARY;
-       WARN_ON(!skb_dst(newskb));
-       netif_rx_ni(newskb);
-       return 0;
-}
-
 static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst)
 {
        int ttl = inet->uc_ttl;
@@ -182,6 +169,8 @@ static inline int ip_finish_output2(struct sk_buff *skb)
        struct rtable *rt = (struct rtable *)dst;
        struct net_device *dev = dst->dev;
        unsigned int hh_len = LL_RESERVED_SPACE(dev);
+       struct neighbour *neigh;
+       u32 nexthop;
 
        if (rt->rt_type == RTN_MULTICAST) {
                IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTMCAST, skb->len);
@@ -199,17 +188,25 @@ static inline int ip_finish_output2(struct sk_buff *skb)
                }
                if (skb->sk)
                        skb_set_owner_w(skb2, skb->sk);
-               kfree_skb(skb);
+               consume_skb(skb);
                skb = skb2;
        }
 
-       if (dst->hh)
-               return neigh_hh_output(dst->hh, skb);
-       else if (dst->neighbour)
-               return dst->neighbour->output(skb);
+       rcu_read_lock_bh();
+       nexthop = rt->rt_gateway ? rt->rt_gateway : ip_hdr(skb)->daddr;
+       neigh = __ipv4_neigh_lookup_noref(dev, nexthop);
+       if (unlikely(!neigh))
+               neigh = __neigh_create(&arp_tbl, &nexthop, dev, false);
+       if (neigh) {
+               int res = dst_neigh_output(dst, neigh, skb);
 
-       if (net_ratelimit())
-               printk(KERN_DEBUG "ip_finish_output2: No header cache and no neighbour!\n");
+               rcu_read_unlock_bh();
+               return res;
+       }
+       rcu_read_unlock_bh();
+
+       net_dbg_ratelimited("%s: No header cache and no neighbour!\n",
+                           __func__);
        kfree_skb(skb);
        return -EINVAL;
 }
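
This hunk is where the commit subject's new rt_gateway semantics land: rt_gateway is now zero for an on-link destination rather than always duplicating the next hop, so under rcu_read_lock_bh() the output path resolves a neighbour for "the gateway if one is set, otherwise the packet's own daddr", creating the ARP entry on demand via __neigh_create(). A minimal sketch of the selection rule (the helper name is illustrative, not a kernel function):

/* New rt_gateway semantics: zero means the destination is on-link,
 * so we ARP for the destination itself; nonzero names the gateway.
 */
static __be32 select_nexthop(__be32 rt_gateway, __be32 daddr)
{
        return rt_gateway ? rt_gateway : daddr;
}
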
@@ -275,7 +272,7 @@ int ip_mc_output(struct sk_buff *skb)
                        if (newskb)
                                NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING,
                                        newskb, NULL, newskb->dev,
-                                       ip_dev_loopback_xmit);
+                                       dev_loopback_xmit);
                }
 
                /* Multicasts with ttl 0 must not go beyond the host */
@@ -290,7 +287,7 @@ int ip_mc_output(struct sk_buff *skb)
                struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
                if (newskb)
                        NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING, newskb,
-                               NULL, newskb->dev, ip_dev_loopback_xmit);
+                               NULL, newskb->dev, dev_loopback_xmit);
        }
 
        return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, skb, NULL,
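
Both multicast loopback call sites now use the generic dev_loopback_xmit() in place of the file-local ip_dev_loopback_xmit() removed earlier in this patch. Judging from the removed body, the generic helper presumably performs the same steps; an annotated copy for reference (details of the net/core version may differ):

static int loopback_xmit_sketch(struct sk_buff *skb)
{
        skb_reset_mac_header(skb);                /* MAC header sits at data */
        __skb_pull(skb, skb_network_offset(skb)); /* start at the IP header */
        skb->pkt_type = PACKET_LOOPBACK;
        skb->ip_summed = CHECKSUM_UNNECESSARY;    /* locally generated: trust it */
        WARN_ON(!skb_dst(skb));                   /* route must already be attached */
        netif_rx_ni(skb);                         /* feed the local receive path */
        return 0;
}
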
@@ -312,6 +309,20 @@ int ip_output(struct sk_buff *skb)
                            !(IPCB(skb)->flags & IPSKB_REROUTED));
 }
 
+/*
+ * copy saddr and daddr, possibly using 64bit load/stores
+ * Equivalent to :
+ *   iph->saddr = fl4->saddr;
+ *   iph->daddr = fl4->daddr;
+ */
+static void ip_copy_addrs(struct iphdr *iph, const struct flowi4 *fl4)
+{
+       BUILD_BUG_ON(offsetof(typeof(*fl4), daddr) !=
+                    offsetof(typeof(*fl4), saddr) + sizeof(fl4->saddr));
+       memcpy(&iph->saddr, &fl4->saddr,
+              sizeof(fl4->saddr) + sizeof(fl4->daddr));
+}
+
 int ip_queue_xmit(struct sk_buff *skb, struct flowi *fl)
 {
        struct sock *sk = skb->sk;
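
ip_copy_addrs() depends on saddr and daddr being adjacent, in that order, in both struct flowi4 (which the BUILD_BUG_ON enforces at compile time) and struct iphdr, so one fixed-size memcpy covers both fields and the compiler is free to lower it to a single 64-bit load/store. A standalone sketch of the same trick with stand-in types:

#include <stdint.h>
#include <string.h>

struct two_addrs { uint32_t saddr, daddr; };  /* stand-in for the adjacent fields */

/* One 8-byte memcpy fills both addresses at once; with a constant size
 * the compiler typically emits a single 64-bit move.
 */
static void copy_addrs(struct two_addrs *dst, const struct two_addrs *src)
{
        memcpy(&dst->saddr, &src->saddr,
               sizeof(src->saddr) + sizeof(src->daddr));
}
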
@@ -360,7 +371,7 @@ int ip_queue_xmit(struct sk_buff *skb, struct flowi *fl)
        skb_dst_set_noref(skb, &rt->dst);
 
 packet_routed:
-       if (inet_opt && inet_opt->opt.is_strictroute && fl4->daddr != rt->rt_gateway)
+       if (inet_opt && inet_opt->opt.is_strictroute && rt->rt_gateway)
                goto no_route;
 
        /* OK, we know where to send it, allocate and build IP header. */
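
The strict-source-route test is the other face of the rt_gateway change: previously rt_gateway always held the next hop, so the code had to compare it with fl4->daddr; now a nonzero rt_gateway by itself means a gateway stands between us and the destination, which strict source routing forbids. In short:

/* Strict source routing requires direct delivery to fl4->daddr:
 *   old: violated when fl4->daddr != rt->rt_gateway
 *        (rt_gateway always held the next hop)
 *   new: violated whenever rt->rt_gateway != 0
 *        (the field is only set when a gateway is in the path)
 */
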
@@ -374,8 +385,8 @@ packet_routed:
                iph->frag_off = 0;
        iph->ttl      = ip_select_ttl(inet, &rt->dst);
        iph->protocol = sk->sk_protocol;
-       iph->saddr    = fl4->saddr;
-       iph->daddr    = fl4->daddr;
+       ip_copy_addrs(iph, fl4);
+
        /* Transport layer set skb->h.foo itself. */
 
        if (inet_opt && inet_opt->opt.optlen) {
@@ -689,7 +700,7 @@ slow_path:
 
                IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES);
        }
-       kfree_skb(skb);
+       consume_skb(skb);
        IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS);
        return err;
 
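
kfree_skb() is accounted as a packet drop and is what drop-monitoring hooks watch, while consume_skb() quietly frees an skb that finished its job; once the slow fragmentation path has queued every fragment, releasing the original skb is normal completion rather than a drop. A sketch of the convention (the helper itself is hypothetical):

/* Hypothetical helper illustrating the drop-vs-consume convention. */
static void free_original_skb(struct sk_buff *skb, int err)
{
        if (err)
                kfree_skb(skb);    /* failure: report as a drop */
        else
                consume_skb(skb);  /* success: unremarkable free */
}
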
@@ -734,7 +745,7 @@ static inline int ip_ufo_append_data(struct sock *sk,
                        int getfrag(void *from, char *to, int offset, int len,
                               int odd, struct sk_buff *skb),
                        void *from, int length, int hh_len, int fragheaderlen,
-                       int transhdrlen, int mtu, unsigned int flags)
+                       int transhdrlen, int maxfraglen, unsigned int flags)
 {
        struct sk_buff *skb;
        int err;
@@ -767,7 +778,7 @@ static inline int ip_ufo_append_data(struct sock *sk,
                skb->csum = 0;
 
                /* specify the length of each IP datagram fragment */
-               skb_shinfo(skb)->gso_size = mtu - fragheaderlen;
+               skb_shinfo(skb)->gso_size = maxfraglen - fragheaderlen;
                skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
                __skb_queue_tail(queue, skb);
        }
@@ -802,8 +813,6 @@ static int __ip_append_data(struct sock *sk,
        skb = skb_peek_tail(queue);
 
        exthdrlen = !skb ? rt->dst.header_len : 0;
-       length += exthdrlen;
-       transhdrlen += exthdrlen;
        mtu = cork->fragsize;
 
        hh_len = LL_RESERVED_SPACE(rt->dst.dev);
@@ -830,10 +839,10 @@ static int __ip_append_data(struct sock *sk,
        cork->length += length;
        if (((length > mtu) || (skb && skb_is_gso(skb))) &&
            (sk->sk_protocol == IPPROTO_UDP) &&
-           (rt->dst.dev->features & NETIF_F_UFO)) {
+           (rt->dst.dev->features & NETIF_F_UFO) && !rt->dst.header_len) {
                err = ip_ufo_append_data(sk, queue, getfrag, from, length,
                                         hh_len, fragheaderlen, transhdrlen,
-                                        mtu, flags);
+                                        maxfraglen, flags);
                if (err)
                        goto error;
                return 0;
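
Passing maxfraglen instead of the raw mtu matters because IP fragment offsets are carried in 8-byte units: gso_size must be the 8-byte-aligned per-fragment payload, or the UFO-generated fragments would have illegal offsets. A worked example, assuming the usual __ip_append_data definition of maxfraglen:

/* maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen
 *
 * mtu = 1500, fragheaderlen = 20 (IPv4 header, no options):
 *   maxfraglen = ((1500 - 20) & ~7) + 20 = 1500
 *   gso_size   = maxfraglen - fragheaderlen = 1480   (8-aligned)
 *
 * mtu = 1006, fragheaderlen = 20:
 *   maxfraglen = ((1006 - 20) & ~7) + 20 = 1004
 *   gso_size   = 984, while mtu - fragheaderlen = 986 (not 8-aligned)
 */
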
@@ -883,17 +892,16 @@ alloc_new_skb:
                        else
                                alloclen = fraglen;
 
+                       alloclen += exthdrlen;
+
                        /* The last fragment gets additional space at tail.
                         * Note, with MSG_MORE we overallocate on fragments,
                         * because we have no idea what fragment will be
                         * the last.
                         */
-                       if (datalen == length + fraggap) {
+                       if (datalen == length + fraggap)
                                alloclen += rt->dst.trailer_len;
-                               /* make sure mtu is not reached */
-                               if (datalen > mtu - fragheaderlen - rt->dst.trailer_len)
-                                       datalen -= ALIGN(rt->dst.trailer_len, 8);
-                       }
+
                        if (transhdrlen) {
                                skb = sock_alloc_send_skb(sk,
                                                alloclen + hh_len + 15,
@@ -926,11 +934,11 @@ alloc_new_skb:
                        /*
                         *      Find where to start putting bytes.
                         */
-                       data = skb_put(skb, fraglen);
+                       data = skb_put(skb, fraglen + exthdrlen);
                        skb_set_network_header(skb, exthdrlen);
                        skb->transport_header = (skb->network_header +
                                                 fragheaderlen);
-                       data += fragheaderlen;
+                       data += fragheaderlen + exthdrlen;
 
                        if (fraggap) {
                                skb->csum = skb_copy_and_csum_bits(
@@ -985,13 +993,13 @@ alloc_new_skb:
                        if (page && (left = PAGE_SIZE - off) > 0) {
                                if (copy >= left)
                                        copy = left;
-                               if (page != frag->page) {
+                               if (page != skb_frag_page(frag)) {
                                        if (i == MAX_SKB_FRAGS) {
                                                err = -EMSGSIZE;
                                                goto error;
                                        }
-                                       get_page(page);
                                        skb_fill_page_desc(skb, i, page, off, 0);
+                                       skb_frag_ref(skb, i);
                                        frag = &skb_shinfo(skb)->frags[i];
                                }
                        } else if (i < MAX_SKB_FRAGS) {
@@ -1011,12 +1019,13 @@ alloc_new_skb:
                                err = -EMSGSIZE;
                                goto error;
                        }
-                       if (getfrag(from, page_address(frag->page)+frag->page_offset+frag->size, offset, copy, skb->len, skb) < 0) {
+                       if (getfrag(from, skb_frag_address(frag)+skb_frag_size(frag),
+                                   offset, copy, skb->len, skb) < 0) {
                                err = -EFAULT;
                                goto error;
                        }
                        cork->off += copy;
-                       frag->size += copy;
+                       skb_frag_size_add(frag, copy);
                        skb->len += copy;
                        skb->data_len += copy;
                        skb->truesize += copy;
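
These hunks replace direct pokes at skb_frag_t fields (frag->page, frag->size) with the accessor helpers skb_frag_page(), skb_frag_address(), skb_frag_ref() and skb_frag_size_add(), insulating callers from the fragment layout. A small sketch of the idiom, mirroring the size update above (the helper name is illustrative):

/* Grow the i-th page fragment by 'copy' bytes via the accessor API. */
static void grow_frag(struct sk_buff *skb, int i, int copy)
{
        skb_frag_t *frag = &skb_shinfo(skb)->frags[i];

        skb_frag_size_add(frag, copy);  /* was: frag->size += copy */
        skb->len      += copy;
        skb->data_len += copy;
        skb->truesize += copy;
}
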
@@ -1064,7 +1073,7 @@ static int ip_setup_cork(struct sock *sk, struct inet_cork *cork,
         */
        *rtp = NULL;
        cork->fragsize = inet->pmtudisc == IP_PMTUDISC_PROBE ?
-                        rt->dst.dev->mtu : dst_mtu(rt->dst.path);
+                        rt->dst.dev->mtu : dst_mtu(&rt->dst);
        cork->dst = &rt->dst;
        cork->length = 0;
        cork->tx_flags = ipc->tx_flags;
@@ -1225,7 +1234,7 @@ ssize_t   ip_append_page(struct sock *sk, struct flowi4 *fl4, struct page *page,
                if (len > size)
                        len = size;
                if (skb_can_coalesce(skb, i, page, offset)) {
-                       skb_shinfo(skb)->frags[i-1].size += len;
+                       skb_frag_size_add(&skb_shinfo(skb)->frags[i-1], len);
                } else if (i < MAX_SKB_FRAGS) {
                        get_page(page);
                        skb_fill_page_desc(skb, i, page, offset, len);
@@ -1332,8 +1341,7 @@ struct sk_buff *__ip_make_skb(struct sock *sk,
        ip_select_ident(iph, &rt->dst, sk);
        iph->ttl = ttl;
        iph->protocol = sk->sk_protocol;
-       iph->saddr = fl4->saddr;
-       iph->daddr = fl4->daddr;
+       ip_copy_addrs(iph, fl4);
 
        if (opt) {
                iph->ihl += opt->optlen>>2;
@@ -1455,19 +1463,33 @@ static int ip_reply_glue_bits(void *dptr, char *to, int offset,
 
 /*
  *     Generic function to send a packet as reply to another packet.
- *     Used to send TCP resets so far. ICMP should use this function too.
+ *     Used to send some TCP resets/acks so far.
  *
- *     Should run single threaded per socket because it uses the sock
- *             structure to pass arguments.
+ *     Use a fake percpu inet socket to avoid false sharing and contention.
  */
-void ip_send_reply(struct sock *sk, struct sk_buff *skb, __be32 daddr,
-                  struct ip_reply_arg *arg, unsigned int len)
+static DEFINE_PER_CPU(struct inet_sock, unicast_sock) = {
+       .sk = {
+               .__sk_common = {
+                       .skc_refcnt = ATOMIC_INIT(1),
+               },
+               .sk_wmem_alloc  = ATOMIC_INIT(1),
+               .sk_allocation  = GFP_ATOMIC,
+               .sk_flags       = (1UL << SOCK_USE_WRITE_QUEUE),
+       },
+       .pmtudisc = IP_PMTUDISC_WANT,
+};
+
+void ip_send_unicast_reply(struct net *net, struct sk_buff *skb, __be32 daddr,
+                          __be32 saddr, const struct ip_reply_arg *arg,
+                          unsigned int len)
 {
-       struct inet_sock *inet = inet_sk(sk);
        struct ip_options_data replyopts;
        struct ipcm_cookie ipc;
        struct flowi4 fl4;
        struct rtable *rt = skb_rtable(skb);
+       struct sk_buff *nskb;
+       struct sock *sk;
+       struct inet_sock *inet;
 
        if (ip_options_echo(&replyopts.opt.opt, skb))
                return;
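
Rather than serializing on the caller's socket with bh_lock_sock(), the reply path now borrows a pre-initialized per-CPU inet_sock: get_cpu_var() disables preemption, giving the running CPU exclusive use of its instance until put_cpu_var(). A minimal sketch of the per-CPU pattern (struct my_scratch and the function are illustrative):

struct my_scratch { int in_use; };

static DEFINE_PER_CPU(struct my_scratch, reply_scratch);

static void with_scratch(void)
{
        struct my_scratch *s = &get_cpu_var(reply_scratch); /* preempt off */

        s->in_use = 1;
        /* ... exclusive use of this CPU's instance ... */
        s->in_use = 0;

        put_cpu_var(reply_scratch);                         /* preempt on */
}
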
@@ -1484,39 +1506,40 @@ void ip_send_reply(struct sock *sk, struct sk_buff *skb, __be32 daddr,
        }
 
        flowi4_init_output(&fl4, arg->bound_dev_if, 0,
-                          RT_TOS(ip_hdr(skb)->tos),
-                          RT_SCOPE_UNIVERSE, sk->sk_protocol,
+                          RT_TOS(arg->tos),
+                          RT_SCOPE_UNIVERSE, ip_hdr(skb)->protocol,
                           ip_reply_arg_flowi_flags(arg),
-                          daddr, rt->rt_spec_dst,
+                          daddr, saddr,
                           tcp_hdr(skb)->source, tcp_hdr(skb)->dest);
        security_skb_classify_flow(skb, flowi4_to_flowi(&fl4));
-       rt = ip_route_output_key(sock_net(sk), &fl4);
+       rt = ip_route_output_key(net, &fl4);
        if (IS_ERR(rt))
                return;
 
-       /* And let IP do all the hard work.
+       inet = &get_cpu_var(unicast_sock);
 
-          This chunk is not reenterable, hence spinlock.
-          Note that it uses the fact, that this function is called
-          with locally disabled BH and that sk cannot be already spinlocked.
-        */
-       bh_lock_sock(sk);
-       inet->tos = ip_hdr(skb)->tos;
+       inet->tos = arg->tos;
+       sk = &inet->sk;
        sk->sk_priority = skb->priority;
        sk->sk_protocol = ip_hdr(skb)->protocol;
        sk->sk_bound_dev_if = arg->bound_dev_if;
+       sock_net_set(sk, net);
+       __skb_queue_head_init(&sk->sk_write_queue);
+       sk->sk_sndbuf = sysctl_wmem_default;
        ip_append_data(sk, &fl4, ip_reply_glue_bits, arg->iov->iov_base, len, 0,
                       &ipc, &rt, MSG_DONTWAIT);
-       if ((skb = skb_peek(&sk->sk_write_queue)) != NULL) {
+       nskb = skb_peek(&sk->sk_write_queue);
+       if (nskb) {
                if (arg->csumoffset >= 0)
-                       *((__sum16 *)skb_transport_header(skb) +
-                         arg->csumoffset) = csum_fold(csum_add(skb->csum,
+                       *((__sum16 *)skb_transport_header(nskb) +
+                         arg->csumoffset) = csum_fold(csum_add(nskb->csum,
                                                                arg->csum));
-               skb->ip_summed = CHECKSUM_NONE;
+               nskb->ip_summed = CHECKSUM_NONE;
+               skb_set_queue_mapping(nskb, skb_get_queue_mapping(skb));
                ip_push_pending_frames(sk, &fl4);
        }
 
-       bh_unlock_sock(sk);
+       put_cpu_var(unicast_sock);
 
        ip_rt_put(rt);
 }