*/
#include <asm/uaccess.h>
-#include <asm/system.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
}
EXPORT_SYMBOL_GPL(ip_local_out);
-/* dev_loopback_xmit for use with netfilter. */
-static int ip_dev_loopback_xmit(struct sk_buff *newskb)
-{
- skb_reset_mac_header(newskb);
- __skb_pull(newskb, skb_network_offset(newskb));
- newskb->pkt_type = PACKET_LOOPBACK;
- newskb->ip_summed = CHECKSUM_UNNECESSARY;
- WARN_ON(!skb_dst(newskb));
- netif_rx_ni(newskb);
- return 0;
-}
-
static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst)
{
int ttl = inet->uc_ttl;
struct rtable *rt = (struct rtable *)dst;
struct net_device *dev = dst->dev;
unsigned int hh_len = LL_RESERVED_SPACE(dev);
+ struct neighbour *neigh;
+ u32 nexthop;
if (rt->rt_type == RTN_MULTICAST) {
IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTMCAST, skb->len);
}
if (skb->sk)
skb_set_owner_w(skb2, skb->sk);
- kfree_skb(skb);
+ consume_skb(skb);
skb = skb2;
}
- if (dst->hh)
- return neigh_hh_output(dst->hh, skb);
- else if (dst->neighbour)
- return dst->neighbour->output(skb);
+ rcu_read_lock_bh();
+ nexthop = rt->rt_gateway ? rt->rt_gateway : ip_hdr(skb)->daddr;
+ neigh = __ipv4_neigh_lookup_noref(dev, nexthop);
+ if (unlikely(!neigh))
+ neigh = __neigh_create(&arp_tbl, &nexthop, dev, false);
+ if (neigh) {
+ int res = dst_neigh_output(dst, neigh, skb);
- if (net_ratelimit())
- printk(KERN_DEBUG "ip_finish_output2: No header cache and no neighbour!\n");
+ rcu_read_unlock_bh();
+ return res;
+ }
+ rcu_read_unlock_bh();
+
+ net_dbg_ratelimited("%s: No header cache and no neighbour!\n",
+ __func__);
kfree_skb(skb);
return -EINVAL;
}
if (newskb)
NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING,
newskb, NULL, newskb->dev,
- ip_dev_loopback_xmit);
+ dev_loopback_xmit);
}
/* Multicasts with ttl 0 must not go beyond the host */
struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
if (newskb)
NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING, newskb,
- NULL, newskb->dev, ip_dev_loopback_xmit);
+ NULL, newskb->dev, dev_loopback_xmit);
}
return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, skb, NULL,
!(IPCB(skb)->flags & IPSKB_REROUTED));
}
+/*
+ * copy saddr and daddr, possibly using 64bit load/stores
+ * Equivalent to :
+ * iph->saddr = fl4->saddr;
+ * iph->daddr = fl4->daddr;
+ */
+static void ip_copy_addrs(struct iphdr *iph, const struct flowi4 *fl4)
+{
+ BUILD_BUG_ON(offsetof(typeof(*fl4), daddr) !=
+ offsetof(typeof(*fl4), saddr) + sizeof(fl4->saddr));
+ memcpy(&iph->saddr, &fl4->saddr,
+ sizeof(fl4->saddr) + sizeof(fl4->daddr));
+}
+
int ip_queue_xmit(struct sk_buff *skb, struct flowi *fl)
{
struct sock *sk = skb->sk;
skb_dst_set_noref(skb, &rt->dst);
packet_routed:
- if (inet_opt && inet_opt->opt.is_strictroute && fl4->daddr != rt->rt_gateway)
+ if (inet_opt && inet_opt->opt.is_strictroute && rt->rt_gateway)
goto no_route;
/* OK, we know where to send it, allocate and build IP header. */
iph->frag_off = 0;
iph->ttl = ip_select_ttl(inet, &rt->dst);
iph->protocol = sk->sk_protocol;
- iph->saddr = fl4->saddr;
- iph->daddr = fl4->daddr;
+ ip_copy_addrs(iph, fl4);
+
/* Transport layer set skb->h.foo itself. */
if (inet_opt && inet_opt->opt.optlen) {
IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES);
}
- kfree_skb(skb);
+ consume_skb(skb);
IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS);
return err;
int getfrag(void *from, char *to, int offset, int len,
int odd, struct sk_buff *skb),
void *from, int length, int hh_len, int fragheaderlen,
- int transhdrlen, int mtu, unsigned int flags)
+ int transhdrlen, int maxfraglen, unsigned int flags)
{
struct sk_buff *skb;
int err;
skb->csum = 0;
/* specify the length of each IP datagram fragment */
- skb_shinfo(skb)->gso_size = mtu - fragheaderlen;
+ skb_shinfo(skb)->gso_size = maxfraglen - fragheaderlen;
skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
__skb_queue_tail(queue, skb);
}
skb = skb_peek_tail(queue);
exthdrlen = !skb ? rt->dst.header_len : 0;
- length += exthdrlen;
- transhdrlen += exthdrlen;
mtu = cork->fragsize;
hh_len = LL_RESERVED_SPACE(rt->dst.dev);
cork->length += length;
if (((length > mtu) || (skb && skb_is_gso(skb))) &&
(sk->sk_protocol == IPPROTO_UDP) &&
- (rt->dst.dev->features & NETIF_F_UFO)) {
+ (rt->dst.dev->features & NETIF_F_UFO) && !rt->dst.header_len) {
err = ip_ufo_append_data(sk, queue, getfrag, from, length,
hh_len, fragheaderlen, transhdrlen,
- mtu, flags);
+ maxfraglen, flags);
if (err)
goto error;
return 0;
else
alloclen = fraglen;
+ alloclen += exthdrlen;
+
/* The last fragment gets additional space at tail.
* Note, with MSG_MORE we overallocate on fragments,
* because we have no idea what fragment will be
* the last.
*/
- if (datalen == length + fraggap) {
+ if (datalen == length + fraggap)
alloclen += rt->dst.trailer_len;
- /* make sure mtu is not reached */
- if (datalen > mtu - fragheaderlen - rt->dst.trailer_len)
- datalen -= ALIGN(rt->dst.trailer_len, 8);
- }
+
if (transhdrlen) {
skb = sock_alloc_send_skb(sk,
alloclen + hh_len + 15,
/*
* Find where to start putting bytes.
*/
- data = skb_put(skb, fraglen);
+ data = skb_put(skb, fraglen + exthdrlen);
skb_set_network_header(skb, exthdrlen);
skb->transport_header = (skb->network_header +
fragheaderlen);
- data += fragheaderlen;
+ data += fragheaderlen + exthdrlen;
if (fraggap) {
skb->csum = skb_copy_and_csum_bits(
if (page && (left = PAGE_SIZE - off) > 0) {
if (copy >= left)
copy = left;
- if (page != frag->page) {
+ if (page != skb_frag_page(frag)) {
if (i == MAX_SKB_FRAGS) {
err = -EMSGSIZE;
goto error;
}
- get_page(page);
skb_fill_page_desc(skb, i, page, off, 0);
+ skb_frag_ref(skb, i);
frag = &skb_shinfo(skb)->frags[i];
}
} else if (i < MAX_SKB_FRAGS) {
err = -EMSGSIZE;
goto error;
}
- if (getfrag(from, page_address(frag->page)+frag->page_offset+frag->size, offset, copy, skb->len, skb) < 0) {
+ if (getfrag(from, skb_frag_address(frag)+skb_frag_size(frag),
+ offset, copy, skb->len, skb) < 0) {
err = -EFAULT;
goto error;
}
cork->off += copy;
- frag->size += copy;
+ skb_frag_size_add(frag, copy);
skb->len += copy;
skb->data_len += copy;
skb->truesize += copy;
*/
*rtp = NULL;
cork->fragsize = inet->pmtudisc == IP_PMTUDISC_PROBE ?
- rt->dst.dev->mtu : dst_mtu(rt->dst.path);
+ rt->dst.dev->mtu : dst_mtu(&rt->dst);
cork->dst = &rt->dst;
cork->length = 0;
cork->tx_flags = ipc->tx_flags;
if (len > size)
len = size;
if (skb_can_coalesce(skb, i, page, offset)) {
- skb_shinfo(skb)->frags[i-1].size += len;
+ skb_frag_size_add(&skb_shinfo(skb)->frags[i-1], len);
} else if (i < MAX_SKB_FRAGS) {
get_page(page);
skb_fill_page_desc(skb, i, page, offset, len);
ip_select_ident(iph, &rt->dst, sk);
iph->ttl = ttl;
iph->protocol = sk->sk_protocol;
- iph->saddr = fl4->saddr;
- iph->daddr = fl4->daddr;
+ ip_copy_addrs(iph, fl4);
if (opt) {
iph->ihl += opt->optlen>>2;
/*
* Generic function to send a packet as reply to another packet.
- * Used to send TCP resets so far. ICMP should use this function too.
+ * Used to send some TCP resets/acks so far.
*
- * Should run single threaded per socket because it uses the sock
- * structure to pass arguments.
+ * Use a fake percpu inet socket to avoid false sharing and contention.
*/
-void ip_send_reply(struct sock *sk, struct sk_buff *skb, __be32 daddr,
- struct ip_reply_arg *arg, unsigned int len)
+static DEFINE_PER_CPU(struct inet_sock, unicast_sock) = {
+ .sk = {
+ .__sk_common = {
+ .skc_refcnt = ATOMIC_INIT(1),
+ },
+ .sk_wmem_alloc = ATOMIC_INIT(1),
+ .sk_allocation = GFP_ATOMIC,
+ .sk_flags = (1UL << SOCK_USE_WRITE_QUEUE),
+ },
+ .pmtudisc = IP_PMTUDISC_WANT,
+};
+
+void ip_send_unicast_reply(struct net *net, struct sk_buff *skb, __be32 daddr,
+ __be32 saddr, const struct ip_reply_arg *arg,
+ unsigned int len)
{
- struct inet_sock *inet = inet_sk(sk);
struct ip_options_data replyopts;
struct ipcm_cookie ipc;
struct flowi4 fl4;
struct rtable *rt = skb_rtable(skb);
+ struct sk_buff *nskb;
+ struct sock *sk;
+ struct inet_sock *inet;
if (ip_options_echo(&replyopts.opt.opt, skb))
return;
}
flowi4_init_output(&fl4, arg->bound_dev_if, 0,
- RT_TOS(ip_hdr(skb)->tos),
- RT_SCOPE_UNIVERSE, sk->sk_protocol,
+ RT_TOS(arg->tos),
+ RT_SCOPE_UNIVERSE, ip_hdr(skb)->protocol,
ip_reply_arg_flowi_flags(arg),
- daddr, rt->rt_spec_dst,
+ daddr, saddr,
tcp_hdr(skb)->source, tcp_hdr(skb)->dest);
security_skb_classify_flow(skb, flowi4_to_flowi(&fl4));
- rt = ip_route_output_key(sock_net(sk), &fl4);
+ rt = ip_route_output_key(net, &fl4);
if (IS_ERR(rt))
return;
- /* And let IP do all the hard work.
+ inet = &get_cpu_var(unicast_sock);
- This chunk is not reenterable, hence spinlock.
- Note that it uses the fact, that this function is called
- with locally disabled BH and that sk cannot be already spinlocked.
- */
- bh_lock_sock(sk);
- inet->tos = ip_hdr(skb)->tos;
+ inet->tos = arg->tos;
+ sk = &inet->sk;
sk->sk_priority = skb->priority;
sk->sk_protocol = ip_hdr(skb)->protocol;
sk->sk_bound_dev_if = arg->bound_dev_if;
+ sock_net_set(sk, net);
+ __skb_queue_head_init(&sk->sk_write_queue);
+ sk->sk_sndbuf = sysctl_wmem_default;
ip_append_data(sk, &fl4, ip_reply_glue_bits, arg->iov->iov_base, len, 0,
&ipc, &rt, MSG_DONTWAIT);
- if ((skb = skb_peek(&sk->sk_write_queue)) != NULL) {
+ nskb = skb_peek(&sk->sk_write_queue);
+ if (nskb) {
if (arg->csumoffset >= 0)
- *((__sum16 *)skb_transport_header(skb) +
- arg->csumoffset) = csum_fold(csum_add(skb->csum,
+ *((__sum16 *)skb_transport_header(nskb) +
+ arg->csumoffset) = csum_fold(csum_add(nskb->csum,
arg->csum));
- skb->ip_summed = CHECKSUM_NONE;
+ nskb->ip_summed = CHECKSUM_NONE;
+ skb_set_queue_mapping(nskb, skb_get_queue_mapping(skb));
ip_push_pending_frames(sk, &fl4);
}
- bh_unlock_sock(sk);
+ put_cpu_var(unicast_sock);
ip_rt_put(rt);
}