ipv4: Adjust semantics of rt->rt_gateway.
[linux-3.10.git] / net/ipv4/ip_output.c
index 167da8ba416ac3ea4e83ef9135225970daa73ad9..4494015f7e32d8c00df9923cf8478812ae2d7c50 100644
@@ -43,7 +43,6 @@
  */
 
 #include <asm/uaccess.h>
-#include <asm/system.h>
 #include <linux/module.h>
 #include <linux/types.h>
 #include <linux/kernel.h>
@@ -114,18 +113,6 @@ int ip_local_out(struct sk_buff *skb)
 }
 EXPORT_SYMBOL_GPL(ip_local_out);
 
-/* dev_loopback_xmit for use with netfilter. */
-static int ip_dev_loopback_xmit(struct sk_buff *newskb)
-{
-       skb_reset_mac_header(newskb);
-       __skb_pull(newskb, skb_network_offset(newskb));
-       newskb->pkt_type = PACKET_LOOPBACK;
-       newskb->ip_summed = CHECKSUM_UNNECESSARY;
-       WARN_ON(!skb_dst(newskb));
-       netif_rx_ni(newskb);
-       return 0;
-}
-
 static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst)
 {
        int ttl = inet->uc_ttl;
@@ -182,6 +169,8 @@ static inline int ip_finish_output2(struct sk_buff *skb)
        struct rtable *rt = (struct rtable *)dst;
        struct net_device *dev = dst->dev;
        unsigned int hh_len = LL_RESERVED_SPACE(dev);
+       struct neighbour *neigh;
+       u32 nexthop;
 
        if (rt->rt_type == RTN_MULTICAST) {
                IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTMCAST, skb->len);
@@ -199,17 +188,25 @@ static inline int ip_finish_output2(struct sk_buff *skb)
                }
                if (skb->sk)
                        skb_set_owner_w(skb2, skb->sk);
-               kfree_skb(skb);
+               consume_skb(skb);
                skb = skb2;
        }
 
-       if (dst->hh)
-               return neigh_hh_output(dst->hh, skb);
-       else if (dst->neighbour)
-               return dst->neighbour->output(skb);
+       rcu_read_lock_bh();
+       nexthop = rt->rt_gateway ? rt->rt_gateway : ip_hdr(skb)->daddr;
+       neigh = __ipv4_neigh_lookup_noref(dev, nexthop);
+       if (unlikely(!neigh))
+               neigh = __neigh_create(&arp_tbl, &nexthop, dev, false);
+       if (neigh) {
+               int res = dst_neigh_output(dst, neigh, skb);
 
-       if (net_ratelimit())
-               printk(KERN_DEBUG "ip_finish_output2: No header cache and no neighbour!\n");
+               rcu_read_unlock_bh();
+               return res;
+       }
+       rcu_read_unlock_bh();
+
+       net_dbg_ratelimited("%s: No header cache and no neighbour!\n",
+                           __func__);
        kfree_skb(skb);
        return -EINVAL;
 }
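
This hunk is where the commit subject's new rt_gateway semantics land: rt_gateway is now zero for an on-link destination rather than always duplicating the next hop, so under rcu_read_lock_bh() the output path resolves a neighbour for "the gateway if one is set, otherwise the packet's own daddr", creating the ARP entry on demand via __neigh_create(). A minimal sketch of the selection rule (the helper name is illustrative, not a kernel function):

/* New rt_gateway semantics: zero means the destination is on-link,
 * so we ARP for the destination itself; nonzero names the gateway.
 */
static __be32 select_nexthop(__be32 rt_gateway, __be32 daddr)
{
        return rt_gateway ? rt_gateway : daddr;
}
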
@@ -275,7 +272,7 @@ int ip_mc_output(struct sk_buff *skb)
                        if (newskb)
                                NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING,
                                        newskb, NULL, newskb->dev,
-                                       ip_dev_loopback_xmit);
+                                       dev_loopback_xmit);
                }
 
                /* Multicasts with ttl 0 must not go beyond the host */
@@ -290,7 +287,7 @@ int ip_mc_output(struct sk_buff *skb)
                struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
                if (newskb)
                        NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING, newskb,
-                               NULL, newskb->dev, ip_dev_loopback_xmit);
+                               NULL, newskb->dev, dev_loopback_xmit);
        }
 
        return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, skb, NULL,
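
Both multicast loopback call sites now use the generic dev_loopback_xmit() in place of the file-local ip_dev_loopback_xmit() removed earlier in this patch. Judging from the removed body, the generic helper presumably performs the same steps; an annotated copy for reference (details of the net/core version may differ):

static int loopback_xmit_sketch(struct sk_buff *skb)
{
        skb_reset_mac_header(skb);                /* MAC header sits at data */
        __skb_pull(skb, skb_network_offset(skb)); /* start at the IP header */
        skb->pkt_type = PACKET_LOOPBACK;
        skb->ip_summed = CHECKSUM_UNNECESSARY;    /* locally generated: trust it */
        WARN_ON(!skb_dst(skb));                   /* route must already be attached */
        netif_rx_ni(skb);                         /* feed the local receive path */
        return 0;
}
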
@@ -312,6 +309,20 @@ int ip_output(struct sk_buff *skb)
                            !(IPCB(skb)->flags & IPSKB_REROUTED));
 }
 
+/*
+ * copy saddr and daddr, possibly using 64bit load/stores
+ * Equivalent to :
+ *   iph->saddr = fl4->saddr;
+ *   iph->daddr = fl4->daddr;
+ */
+static void ip_copy_addrs(struct iphdr *iph, const struct flowi4 *fl4)
+{
+       BUILD_BUG_ON(offsetof(typeof(*fl4), daddr) !=
+                    offsetof(typeof(*fl4), saddr) + sizeof(fl4->saddr));
+       memcpy(&iph->saddr, &fl4->saddr,
+              sizeof(fl4->saddr) + sizeof(fl4->daddr));
+}
+
 int ip_queue_xmit(struct sk_buff *skb, struct flowi *fl)
 {
        struct sock *sk = skb->sk;
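
ip_copy_addrs() depends on saddr and daddr being adjacent, in that order, in both struct flowi4 (which the BUILD_BUG_ON enforces at compile time) and struct iphdr, so one fixed-size memcpy covers both fields and the compiler is free to lower it to a single 64-bit load/store. A standalone sketch of the same trick with stand-in types:

#include <stdint.h>
#include <string.h>

struct two_addrs { uint32_t saddr, daddr; };  /* stand-in for the adjacent fields */

/* One 8-byte memcpy fills both addresses at once; with a constant size
 * the compiler typically emits a single 64-bit move.
 */
static void copy_addrs(struct two_addrs *dst, const struct two_addrs *src)
{
        memcpy(&dst->saddr, &src->saddr,
               sizeof(src->saddr) + sizeof(src->daddr));
}
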
@@ -360,7 +371,7 @@ int ip_queue_xmit(struct sk_buff *skb, struct flowi *fl)
        skb_dst_set_noref(skb, &rt->dst);
 
 packet_routed:
-       if (inet_opt && inet_opt->opt.is_strictroute && fl4->daddr != rt->rt_gateway)
+       if (inet_opt && inet_opt->opt.is_strictroute && rt->rt_gateway)
                goto no_route;
 
        /* OK, we know where to send it, allocate and build IP header. */
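
The strict-source-route test is the other face of the rt_gateway change: previously rt_gateway always held the next hop, so the code had to compare it with fl4->daddr; now a nonzero rt_gateway by itself means a gateway stands between us and the destination, which strict source routing forbids. In short:

/* Strict source routing requires direct delivery to fl4->daddr:
 *   old: violated when fl4->daddr != rt->rt_gateway
 *        (rt_gateway always held the next hop)
 *   new: violated whenever rt->rt_gateway != 0
 *        (the field is only set when a gateway is in the path)
 */
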
@@ -374,8 +385,8 @@ packet_routed:
                iph->frag_off = 0;
        iph->ttl      = ip_select_ttl(inet, &rt->dst);
        iph->protocol = sk->sk_protocol;
-       iph->saddr    = fl4->saddr;
-       iph->daddr    = fl4->daddr;
+       ip_copy_addrs(iph, fl4);
+
        /* Transport layer set skb->h.foo itself. */
 
        if (inet_opt && inet_opt->opt.optlen) {
@@ -689,7 +700,7 @@ slow_path:
 
                IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES);
        }
-       kfree_skb(skb);
+       consume_skb(skb);
        IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS);
        return err;
 
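
kfree_skb() is accounted as a packet drop and is what drop-monitoring hooks watch, while consume_skb() quietly frees an skb that finished its job; once the slow fragmentation path has queued every fragment, releasing the original skb is normal completion rather than a drop. A sketch of the convention (the helper itself is hypothetical):

/* Hypothetical helper illustrating the drop-vs-consume convention. */
static void free_original_skb(struct sk_buff *skb, int err)
{
        if (err)
                kfree_skb(skb);    /* failure: report as a drop */
        else
                consume_skb(skb);  /* success: unremarkable free */
}
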
@@ -734,7 +745,7 @@ static inline int ip_ufo_append_data(struct sock *sk,
                        int getfrag(void *from, char *to, int offset, int len,
                               int odd, struct sk_buff *skb),
                        void *from, int length, int hh_len, int fragheaderlen,
-                       int transhdrlen, int mtu, unsigned int flags)
+                       int transhdrlen, int maxfraglen, unsigned int flags)
 {
        struct sk_buff *skb;
        int err;
@@ -767,7 +778,7 @@ static inline int ip_ufo_append_data(struct sock *sk,
                skb->csum = 0;
 
                /* specify the length of each IP datagram fragment */
-               skb_shinfo(skb)->gso_size = mtu - fragheaderlen;
+               skb_shinfo(skb)->gso_size = maxfraglen - fragheaderlen;
                skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
                __skb_queue_tail(queue, skb);
        }
@@ -802,8 +813,6 @@ static int __ip_append_data(struct sock *sk,
        skb = skb_peek_tail(queue);
 
        exthdrlen = !skb ? rt->dst.header_len : 0;
-       length += exthdrlen;
-       transhdrlen += exthdrlen;
        mtu = cork->fragsize;
 
        hh_len = LL_RESERVED_SPACE(rt->dst.dev);
@@ -830,10 +839,10 @@ static int __ip_append_data(struct sock *sk,
        cork->length += length;
        if (((length > mtu) || (skb && skb_is_gso(skb))) &&
            (sk->sk_protocol == IPPROTO_UDP) &&
-           (rt->dst.dev->features & NETIF_F_UFO)) {
+           (rt->dst.dev->features & NETIF_F_UFO) && !rt->dst.header_len) {
                err = ip_ufo_append_data(sk, queue, getfrag, from, length,
                                         hh_len, fragheaderlen, transhdrlen,
-                                        mtu, flags);
+                                        maxfraglen, flags);
                if (err)
                        goto error;
                return 0;
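
Passing maxfraglen instead of the raw mtu matters because IP fragment offsets are carried in 8-byte units: gso_size must be the 8-byte-aligned per-fragment payload, or the UFO-generated fragments would have illegal offsets. A worked example, assuming the usual __ip_append_data definition of maxfraglen:

/* maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen
 *
 * mtu = 1500, fragheaderlen = 20 (IPv4 header, no options):
 *   maxfraglen = ((1500 - 20) & ~7) + 20 = 1500
 *   gso_size   = maxfraglen - fragheaderlen = 1480   (8-aligned)
 *
 * mtu = 1006, fragheaderlen = 20:
 *   maxfraglen = ((1006 - 20) & ~7) + 20 = 1004
 *   gso_size   = 984, while mtu - fragheaderlen = 986 (not 8-aligned)
 */
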
@@ -883,17 +892,16 @@ alloc_new_skb:
                        else
                                alloclen = fraglen;
 
+                       alloclen += exthdrlen;
+
                        /* The last fragment gets additional space at tail.
                         * Note, with MSG_MORE we overallocate on fragments,
                         * because we have no idea what fragment will be
                         * the last.
                         */
-                       if (datalen == length + fraggap) {
+                       if (datalen == length + fraggap)
                                alloclen += rt->dst.trailer_len;
-                               /* make sure mtu is not reached */
-                               if (datalen > mtu - fragheaderlen - rt->dst.trailer_len)
-                                       datalen -= ALIGN(rt->dst.trailer_len, 8);
-                       }
+
                        if (transhdrlen) {
                                skb = sock_alloc_send_skb(sk,
                                                alloclen + hh_len + 15,
@@ -926,11 +934,11 @@ alloc_new_skb:
                        /*
                         *      Find where to start putting bytes.
                         */
-                       data = skb_put(skb, fraglen);
+                       data = skb_put(skb, fraglen + exthdrlen);
                        skb_set_network_header(skb, exthdrlen);
                        skb->transport_header = (skb->network_header +
                                                 fragheaderlen);
-                       data += fragheaderlen;
+                       data += fragheaderlen + exthdrlen;
 
                        if (fraggap) {
                                skb->csum = skb_copy_and_csum_bits(
@@ -985,13 +993,13 @@ alloc_new_skb:
                        if (page && (left = PAGE_SIZE - off) > 0) {
                                if (copy >= left)
                                        copy = left;
-                               if (page != frag->page) {
+                               if (page != skb_frag_page(frag)) {
                                        if (i == MAX_SKB_FRAGS) {
                                                err = -EMSGSIZE;
                                                goto error;
                                        }
-                                       get_page(page);
                                        skb_fill_page_desc(skb, i, page, off, 0);
+                                       skb_frag_ref(skb, i);
                                        frag = &skb_shinfo(skb)->frags[i];
                                }
                        } else if (i < MAX_SKB_FRAGS) {
@@ -1011,12 +1019,13 @@ alloc_new_skb:
                                err = -EMSGSIZE;
                                goto error;
                        }
-                       if (getfrag(from, page_address(frag->page)+frag->page_offset+frag->size, offset, copy, skb->len, skb) < 0) {
+                       if (getfrag(from, skb_frag_address(frag)+skb_frag_size(frag),
+                                   offset, copy, skb->len, skb) < 0) {
                                err = -EFAULT;
                                goto error;
                        }
                        cork->off += copy;
-                       frag->size += copy;
+                       skb_frag_size_add(frag, copy);
                        skb->len += copy;
                        skb->data_len += copy;
                        skb->truesize += copy;
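
These hunks replace direct pokes at skb_frag_t fields (frag->page, frag->size) with the accessor helpers skb_frag_page(), skb_frag_address(), skb_frag_ref() and skb_frag_size_add(), insulating callers from the fragment layout. A small sketch of the idiom, mirroring the size update above (the helper name is illustrative):

/* Grow the i-th page fragment by 'copy' bytes via the accessor API. */
static void grow_frag(struct sk_buff *skb, int i, int copy)
{
        skb_frag_t *frag = &skb_shinfo(skb)->frags[i];

        skb_frag_size_add(frag, copy);  /* was: frag->size += copy */
        skb->len      += copy;
        skb->data_len += copy;
        skb->truesize += copy;
}
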
@@ -1064,7 +1073,7 @@ static int ip_setup_cork(struct sock *sk, struct inet_cork *cork,
         */
        *rtp = NULL;
        cork->fragsize = inet->pmtudisc == IP_PMTUDISC_PROBE ?
-                        rt->dst.dev->mtu : dst_mtu(rt->dst.path);
+                        rt->dst.dev->mtu : dst_mtu(&rt->dst);
        cork->dst = &rt->dst;
        cork->length = 0;
        cork->tx_flags = ipc->tx_flags;
@@ -1225,7 +1234,7 @@ ssize_t   ip_append_page(struct sock *sk, struct flowi4 *fl4, struct page *page,
                if (len > size)
                        len = size;
                if (skb_can_coalesce(skb, i, page, offset)) {
-                       skb_shinfo(skb)->frags[i-1].size += len;
+                       skb_frag_size_add(&skb_shinfo(skb)->frags[i-1], len);
                } else if (i < MAX_SKB_FRAGS) {
                        get_page(page);
                        skb_fill_page_desc(skb, i, page, offset, len);
@@ -1332,8 +1341,7 @@ struct sk_buff *__ip_make_skb(struct sock *sk,
        ip_select_ident(iph, &rt->dst, sk);
        iph->ttl = ttl;
        iph->protocol = sk->sk_protocol;
-       iph->saddr = fl4->saddr;
-       iph->daddr = fl4->daddr;
+       ip_copy_addrs(iph, fl4);
 
        if (opt) {
                iph->ihl += opt->optlen>>2;
@@ -1455,19 +1463,33 @@ static int ip_reply_glue_bits(void *dptr, char *to, int offset,
 
 /*
  *     Generic function to send a packet as reply to another packet.
- *     Used to send TCP resets so far. ICMP should use this function too.
+ *     Used to send some TCP resets/acks so far.
  *
- *     Should run single threaded per socket because it uses the sock
- *             structure to pass arguments.
+ *     Use a fake percpu inet socket to avoid false sharing and contention.
  */
-void ip_send_reply(struct sock *sk, struct sk_buff *skb, __be32 daddr,
-                  struct ip_reply_arg *arg, unsigned int len)
+static DEFINE_PER_CPU(struct inet_sock, unicast_sock) = {
+       .sk = {
+               .__sk_common = {
+                       .skc_refcnt = ATOMIC_INIT(1),
+               },
+               .sk_wmem_alloc  = ATOMIC_INIT(1),
+               .sk_allocation  = GFP_ATOMIC,
+               .sk_flags       = (1UL << SOCK_USE_WRITE_QUEUE),
+       },
+       .pmtudisc = IP_PMTUDISC_WANT,
+};
+
+void ip_send_unicast_reply(struct net *net, struct sk_buff *skb, __be32 daddr,
+                          __be32 saddr, const struct ip_reply_arg *arg,
+                          unsigned int len)
 {
-       struct inet_sock *inet = inet_sk(sk);
        struct ip_options_data replyopts;
        struct ipcm_cookie ipc;
        struct flowi4 fl4;
        struct rtable *rt = skb_rtable(skb);
+       struct sk_buff *nskb;
+       struct sock *sk;
+       struct inet_sock *inet;
 
        if (ip_options_echo(&replyopts.opt.opt, skb))
                return;
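
Rather than serializing on the caller's socket with bh_lock_sock(), the reply path now borrows a pre-initialized per-CPU inet_sock: get_cpu_var() disables preemption, giving the running CPU exclusive use of its instance until put_cpu_var(). A minimal sketch of the per-CPU pattern (struct my_scratch and the function are illustrative):

struct my_scratch { int in_use; };

static DEFINE_PER_CPU(struct my_scratch, reply_scratch);

static void with_scratch(void)
{
        struct my_scratch *s = &get_cpu_var(reply_scratch); /* preempt off */

        s->in_use = 1;
        /* ... exclusive use of this CPU's instance ... */
        s->in_use = 0;

        put_cpu_var(reply_scratch);                         /* preempt on */
}
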
@@ -1484,39 +1506,40 @@ void ip_send_reply(struct sock *sk, struct sk_buff *skb, __be32 daddr,
        }
 
        flowi4_init_output(&fl4, arg->bound_dev_if, 0,
-                          RT_TOS(ip_hdr(skb)->tos),
-                          RT_SCOPE_UNIVERSE, sk->sk_protocol,
+                          RT_TOS(arg->tos),
+                          RT_SCOPE_UNIVERSE, ip_hdr(skb)->protocol,
                           ip_reply_arg_flowi_flags(arg),
-                          daddr, rt->rt_spec_dst,
+                          daddr, saddr,
                           tcp_hdr(skb)->source, tcp_hdr(skb)->dest);
        security_skb_classify_flow(skb, flowi4_to_flowi(&fl4));
-       rt = ip_route_output_key(sock_net(sk), &fl4);
+       rt = ip_route_output_key(net, &fl4);
        if (IS_ERR(rt))
                return;
 
-       /* And let IP do all the hard work.
+       inet = &get_cpu_var(unicast_sock);
 
-          This chunk is not reenterable, hence spinlock.
-          Note that it uses the fact, that this function is called
-          with locally disabled BH and that sk cannot be already spinlocked.
-        */
-       bh_lock_sock(sk);
-       inet->tos = ip_hdr(skb)->tos;
+       inet->tos = arg->tos;
+       sk = &inet->sk;
        sk->sk_priority = skb->priority;
        sk->sk_protocol = ip_hdr(skb)->protocol;
        sk->sk_bound_dev_if = arg->bound_dev_if;
+       sock_net_set(sk, net);
+       __skb_queue_head_init(&sk->sk_write_queue);
+       sk->sk_sndbuf = sysctl_wmem_default;
        ip_append_data(sk, &fl4, ip_reply_glue_bits, arg->iov->iov_base, len, 0,
                       &ipc, &rt, MSG_DONTWAIT);
-       if ((skb = skb_peek(&sk->sk_write_queue)) != NULL) {
+       nskb = skb_peek(&sk->sk_write_queue);
+       if (nskb) {
                if (arg->csumoffset >= 0)
-                       *((__sum16 *)skb_transport_header(skb) +
-                         arg->csumoffset) = csum_fold(csum_add(skb->csum,
+                       *((__sum16 *)skb_transport_header(nskb) +
+                         arg->csumoffset) = csum_fold(csum_add(nskb->csum,
                                                                arg->csum));
-               skb->ip_summed = CHECKSUM_NONE;
+               nskb->ip_summed = CHECKSUM_NONE;
+               skb_set_queue_mapping(nskb, skb_get_queue_mapping(skb));
                ip_push_pending_frames(sk, &fl4);
        }
 
-       bh_unlock_sock(sk);
+       put_cpu_var(unicast_sock);
 
        ip_rt_put(rt);
 }