ipv4: Adjust semantics of rt->rt_gateway.
[linux-3.10.git] / net / ipv4 / ip_gre.c
1 /*
2  *      Linux NET3:     GRE over IP protocol decoder.
3  *
4  *      Authors: Alexey Kuznetsov (kuznet@ms2.inr.ac.ru)
5  *
6  *      This program is free software; you can redistribute it and/or
7  *      modify it under the terms of the GNU General Public License
8  *      as published by the Free Software Foundation; either version
9  *      2 of the License, or (at your option) any later version.
10  *
11  */
12
13 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
14
15 #include <linux/capability.h>
16 #include <linux/module.h>
17 #include <linux/types.h>
18 #include <linux/kernel.h>
19 #include <linux/slab.h>
20 #include <asm/uaccess.h>
21 #include <linux/skbuff.h>
22 #include <linux/netdevice.h>
23 #include <linux/in.h>
24 #include <linux/tcp.h>
25 #include <linux/udp.h>
26 #include <linux/if_arp.h>
27 #include <linux/mroute.h>
28 #include <linux/init.h>
29 #include <linux/in6.h>
30 #include <linux/inetdevice.h>
31 #include <linux/igmp.h>
32 #include <linux/netfilter_ipv4.h>
33 #include <linux/etherdevice.h>
34 #include <linux/if_ether.h>
35
36 #include <net/sock.h>
37 #include <net/ip.h>
38 #include <net/icmp.h>
39 #include <net/protocol.h>
40 #include <net/ipip.h>
41 #include <net/arp.h>
42 #include <net/checksum.h>
43 #include <net/dsfield.h>
44 #include <net/inet_ecn.h>
45 #include <net/xfrm.h>
46 #include <net/net_namespace.h>
47 #include <net/netns/generic.h>
48 #include <net/rtnetlink.h>
49 #include <net/gre.h>
50
51 #if IS_ENABLED(CONFIG_IPV6)
52 #include <net/ipv6.h>
53 #include <net/ip6_fib.h>
54 #include <net/ip6_route.h>
55 #endif
56
57 /*
58    Problems & solutions
59    --------------------
60
61    1. The most important issue is detecting local dead loops.
62    They would cause complete host lockup in transmit, which
63    would be "resolved" by stack overflow or, if queueing is enabled,
64    with infinite looping in net_bh.
65
66    We cannot track such dead loops during route installation,
67    it is infeasible task. The most general solutions would be
68    to keep skb->encapsulation counter (sort of local ttl),
69    and silently drop packet when it expires. It is a good
70    solution, but it supposes maintaining new variable in ALL
71    skb, even if no tunneling is used.
72
73    Current solution: xmit_recursion breaks dead loops. This is a percpu
74    counter, since when we enter the first ndo_xmit(), cpu migration is
75    forbidden. We force an exit if this counter reaches RECURSION_LIMIT
76
77    2. Networking dead loops would not kill routers, but would really
78    kill network. IP hop limit plays role of "t->recursion" in this case,
79    if we copy it from packet being encapsulated to upper header.
80    It is very good solution, but it introduces two problems:
81
82    - Routing protocols, using packets with ttl=1 (OSPF, RIP2),
83      do not work over tunnels.
84    - traceroute does not work. I planned to relay ICMP from tunnel,
85      so that this problem would be solved and traceroute output
86      would even more informative. This idea appeared to be wrong:
87      only Linux complies to rfc1812 now (yes, guys, Linux is the only
88      true router now :-)), all routers (at least, in neighbourhood of mine)
89      return only 8 bytes of payload. It is the end.
90
91    Hence, if we want that OSPF worked or traceroute said something reasonable,
92    we should search for another solution.
93
94    One of them is to parse packet trying to detect inner encapsulation
95    made by our node. It is difficult or even impossible, especially,
   taking into account fragmentation. To be short, ttl is not a solution at all.
97
98    Current solution: The solution was UNEXPECTEDLY SIMPLE.
99    We force DF flag on tunnels with preconfigured hop limit,
100    that is ALL. :-) Well, it does not remove the problem completely,
101    but exponential growth of network traffic is changed to linear
102    (branches, that exceed pmtu are pruned) and tunnel mtu
103    rapidly degrades to value <68, where looping stops.
104    Yes, it is not good if there exists a router in the loop,
105    which does not force DF, even when encapsulating packets have DF set.
106    But it is not our problem! Nobody could accuse us, we made
107    all that we could make. Even if it is your gated who injected
108    fatal route to network, even if it were you who configured
109    fatal static route: you are innocent. :-)
110
111
112
113    3. Really, ipv4/ipip.c, ipv4/ip_gre.c and ipv6/sit.c contain
114    practically identical code. It would be good to glue them
115    together, but it is not very evident, how to make them modular.
116    sit is integral part of IPv6, ipip and gre are naturally modular.
117    We could extract common parts (hash table, ioctl etc)
118    to a separate module (ip_tunnel.c).
119
120    Alexey Kuznetsov.
121  */
122
123 static struct rtnl_link_ops ipgre_link_ops __read_mostly;
124 static int ipgre_tunnel_init(struct net_device *dev);
125 static void ipgre_tunnel_setup(struct net_device *dev);
126 static int ipgre_tunnel_bind_dev(struct net_device *dev);
127
/* Fallback tunnel: no source, no destination, no key, no options */

#define HASH_SIZE  16	/* buckets per hash table; HASH() yields 4 bits */

/* Per-network-namespace GRE state, retrieved via net_generic(net, ipgre_net_id). */
static int ipgre_net_id __read_mostly;
struct ipgre_net {
	/* Four tables indexed by which of (remote, local) are configured;
	 * see the table-layout comment further down in this file. */
	struct ip_tunnel __rcu *tunnels[4][HASH_SIZE];

	struct net_device *fb_tunnel_dev;	/* the fallback tunnel device */
};
138
139 /* Tunnel hash table */
140
141 /*
142    4 hash tables:
143
144    3: (remote,local)
145    2: (remote,*)
146    1: (*,local)
147    0: (*,*)
148
149    We require exact key match i.e. if a key is present in packet
150    it will match only tunnel with the same key; if it is not present,
151    it will match only keyless tunnel.
152
   All keyless packets, if not matching configured keyless tunnels,
   will match the fallback tunnel.
155  */
156
/* Fold a 32-bit address or key down to a HASH_SIZE-bucket index (4 bits). */
#define HASH(addr) (((__force u32)addr^((__force u32)addr>>4))&0xF)

/* Aliases into ipgre_net.tunnels[]: the index encodes which of
 * (remote, local) the tunnels in that table have configured. */
#define tunnels_r_l     tunnels[3]	/* (remote, local) */
#define tunnels_r       tunnels[2]	/* (remote, *)     */
#define tunnels_l       tunnels[1]	/* (*, local)      */
#define tunnels_wc      tunnels[0]	/* (*, *) wildcard */
/*
 * Locking : hash tables are protected by RCU and RTNL
 */

/* Walk one RCU-protected tunnel chain; expects 'struct ip_tunnel *t'
 * to be declared in the enclosing scope. */
#define for_each_ip_tunnel_rcu(start) \
	for (t = rcu_dereference(start); t; t = rcu_dereference(t->next))
169
/* often modified stats are per cpu, other are shared (netdev->stats) */
struct pcpu_tstats {
	u64	rx_packets;	/* packets delivered to the tunnel device */
	u64	rx_bytes;	/* bytes delivered to the tunnel device */
	u64	tx_packets;	/* packets transmitted through the tunnel */
	u64	tx_bytes;	/* bytes transmitted through the tunnel */
	/* Makes the 64-bit counters readable without tearing on 32-bit hosts. */
	struct u64_stats_sync	syncp;
};
178
/*
 * Aggregate the per-cpu tunnel counters into @tot, then merge in the
 * error counters that live in the shared netdev->stats.  Returns @tot.
 */
static struct rtnl_link_stats64 *ipgre_get_stats64(struct net_device *dev,
                                                   struct rtnl_link_stats64 *tot)
{
        int i;

        for_each_possible_cpu(i) {
                const struct pcpu_tstats *tstats = per_cpu_ptr(dev->tstats, i);
                u64 rx_packets, rx_bytes, tx_packets, tx_bytes;
                unsigned int start;

                /* Retry until a consistent snapshot of all four 64-bit
                 * counters is read (writers bump the seqcount in syncp). */
                do {
                        start = u64_stats_fetch_begin_bh(&tstats->syncp);
                        rx_packets = tstats->rx_packets;
                        tx_packets = tstats->tx_packets;
                        rx_bytes = tstats->rx_bytes;
                        tx_bytes = tstats->tx_bytes;
                } while (u64_stats_fetch_retry_bh(&tstats->syncp, start));

                tot->rx_packets += rx_packets;
                tot->tx_packets += tx_packets;
                tot->rx_bytes   += rx_bytes;
                tot->tx_bytes   += tx_bytes;
        }

        /* Error/multicast counters are only updated in slow paths and are
         * kept in the shared (non per-cpu) netdev stats. */
        tot->multicast = dev->stats.multicast;
        tot->rx_crc_errors = dev->stats.rx_crc_errors;
        tot->rx_fifo_errors = dev->stats.rx_fifo_errors;
        tot->rx_length_errors = dev->stats.rx_length_errors;
        tot->rx_errors = dev->stats.rx_errors;
        tot->tx_fifo_errors = dev->stats.tx_fifo_errors;
        tot->tx_carrier_errors = dev->stats.tx_carrier_errors;
        tot->tx_dropped = dev->stats.tx_dropped;
        tot->tx_aborted_errors = dev->stats.tx_aborted_errors;
        tot->tx_errors = dev->stats.tx_errors;

        return tot;
}
216
217 /* Given src, dst and key, find appropriate for input tunnel. */
218
219 static struct ip_tunnel *ipgre_tunnel_lookup(struct net_device *dev,
220                                              __be32 remote, __be32 local,
221                                              __be32 key, __be16 gre_proto)
222 {
223         struct net *net = dev_net(dev);
224         int link = dev->ifindex;
225         unsigned int h0 = HASH(remote);
226         unsigned int h1 = HASH(key);
227         struct ip_tunnel *t, *cand = NULL;
228         struct ipgre_net *ign = net_generic(net, ipgre_net_id);
229         int dev_type = (gre_proto == htons(ETH_P_TEB)) ?
230                        ARPHRD_ETHER : ARPHRD_IPGRE;
231         int score, cand_score = 4;
232
233         for_each_ip_tunnel_rcu(ign->tunnels_r_l[h0 ^ h1]) {
234                 if (local != t->parms.iph.saddr ||
235                     remote != t->parms.iph.daddr ||
236                     key != t->parms.i_key ||
237                     !(t->dev->flags & IFF_UP))
238                         continue;
239
240                 if (t->dev->type != ARPHRD_IPGRE &&
241                     t->dev->type != dev_type)
242                         continue;
243
244                 score = 0;
245                 if (t->parms.link != link)
246                         score |= 1;
247                 if (t->dev->type != dev_type)
248                         score |= 2;
249                 if (score == 0)
250                         return t;
251
252                 if (score < cand_score) {
253                         cand = t;
254                         cand_score = score;
255                 }
256         }
257
258         for_each_ip_tunnel_rcu(ign->tunnels_r[h0 ^ h1]) {
259                 if (remote != t->parms.iph.daddr ||
260                     key != t->parms.i_key ||
261                     !(t->dev->flags & IFF_UP))
262                         continue;
263
264                 if (t->dev->type != ARPHRD_IPGRE &&
265                     t->dev->type != dev_type)
266                         continue;
267
268                 score = 0;
269                 if (t->parms.link != link)
270                         score |= 1;
271                 if (t->dev->type != dev_type)
272                         score |= 2;
273                 if (score == 0)
274                         return t;
275
276                 if (score < cand_score) {
277                         cand = t;
278                         cand_score = score;
279                 }
280         }
281
282         for_each_ip_tunnel_rcu(ign->tunnels_l[h1]) {
283                 if ((local != t->parms.iph.saddr &&
284                      (local != t->parms.iph.daddr ||
285                       !ipv4_is_multicast(local))) ||
286                     key != t->parms.i_key ||
287                     !(t->dev->flags & IFF_UP))
288                         continue;
289
290                 if (t->dev->type != ARPHRD_IPGRE &&
291                     t->dev->type != dev_type)
292                         continue;
293
294                 score = 0;
295                 if (t->parms.link != link)
296                         score |= 1;
297                 if (t->dev->type != dev_type)
298                         score |= 2;
299                 if (score == 0)
300                         return t;
301
302                 if (score < cand_score) {
303                         cand = t;
304                         cand_score = score;
305                 }
306         }
307
308         for_each_ip_tunnel_rcu(ign->tunnels_wc[h1]) {
309                 if (t->parms.i_key != key ||
310                     !(t->dev->flags & IFF_UP))
311                         continue;
312
313                 if (t->dev->type != ARPHRD_IPGRE &&
314                     t->dev->type != dev_type)
315                         continue;
316
317                 score = 0;
318                 if (t->parms.link != link)
319                         score |= 1;
320                 if (t->dev->type != dev_type)
321                         score |= 2;
322                 if (score == 0)
323                         return t;
324
325                 if (score < cand_score) {
326                         cand = t;
327                         cand_score = score;
328                 }
329         }
330
331         if (cand != NULL)
332                 return cand;
333
334         dev = ign->fb_tunnel_dev;
335         if (dev->flags & IFF_UP)
336                 return netdev_priv(dev);
337
338         return NULL;
339 }
340
341 static struct ip_tunnel __rcu **__ipgre_bucket(struct ipgre_net *ign,
342                 struct ip_tunnel_parm *parms)
343 {
344         __be32 remote = parms->iph.daddr;
345         __be32 local = parms->iph.saddr;
346         __be32 key = parms->i_key;
347         unsigned int h = HASH(key);
348         int prio = 0;
349
350         if (local)
351                 prio |= 1;
352         if (remote && !ipv4_is_multicast(remote)) {
353                 prio |= 2;
354                 h ^= HASH(remote);
355         }
356
357         return &ign->tunnels[prio][h];
358 }
359
360 static inline struct ip_tunnel __rcu **ipgre_bucket(struct ipgre_net *ign,
361                 struct ip_tunnel *t)
362 {
363         return __ipgre_bucket(ign, &t->parms);
364 }
365
/* Publish tunnel @t at the head of its hash chain.  Caller holds RTNL.
 * The new node's ->next is set before the bucket head is switched so a
 * concurrent RCU reader always sees a fully linked list. */
static void ipgre_tunnel_link(struct ipgre_net *ign, struct ip_tunnel *t)
{
	struct ip_tunnel __rcu **tp = ipgre_bucket(ign, t);

	rcu_assign_pointer(t->next, rtnl_dereference(*tp));
	rcu_assign_pointer(*tp, t);
}
373
/* Remove tunnel @t from its hash chain.  Caller holds RTNL.  The chain
 * is walked via the link pointers so the removal is a single pointer
 * update, safe against concurrent RCU readers.  NOTE(review): the
 * tunnel memory must not be reused until after an RCU grace period —
 * that is handled by the device teardown path, not visible here. */
static void ipgre_tunnel_unlink(struct ipgre_net *ign, struct ip_tunnel *t)
{
	struct ip_tunnel __rcu **tp;
	struct ip_tunnel *iter;

	for (tp = ipgre_bucket(ign, t);
	     (iter = rtnl_dereference(*tp)) != NULL;
	     tp = &iter->next) {
		if (t == iter) {
			rcu_assign_pointer(*tp, t->next);
			break;
		}
	}
}
388
389 static struct ip_tunnel *ipgre_tunnel_find(struct net *net,
390                                            struct ip_tunnel_parm *parms,
391                                            int type)
392 {
393         __be32 remote = parms->iph.daddr;
394         __be32 local = parms->iph.saddr;
395         __be32 key = parms->i_key;
396         int link = parms->link;
397         struct ip_tunnel *t;
398         struct ip_tunnel __rcu **tp;
399         struct ipgre_net *ign = net_generic(net, ipgre_net_id);
400
401         for (tp = __ipgre_bucket(ign, parms);
402              (t = rtnl_dereference(*tp)) != NULL;
403              tp = &t->next)
404                 if (local == t->parms.iph.saddr &&
405                     remote == t->parms.iph.daddr &&
406                     key == t->parms.i_key &&
407                     link == t->parms.link &&
408                     type == t->dev->type)
409                         break;
410
411         return t;
412 }
413
/*
 * Find an IPGRE tunnel matching @parms, creating it when @create is
 * non-zero.  Caller holds RTNL.  Returns the tunnel, or NULL when not
 * found (and !@create) or when allocation/registration fails.
 */
static struct ip_tunnel *ipgre_tunnel_locate(struct net *net,
		struct ip_tunnel_parm *parms, int create)
{
	struct ip_tunnel *t, *nt;
	struct net_device *dev;
	char name[IFNAMSIZ];
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);

	t = ipgre_tunnel_find(net, parms, ARPHRD_IPGRE);
	if (t || !create)
		return t;

	/* "gre%d" lets register_netdevice() pick the first free unit. */
	if (parms->name[0])
		strlcpy(name, parms->name, IFNAMSIZ);
	else
		strcpy(name, "gre%d");

	dev = alloc_netdev(sizeof(*t), name, ipgre_tunnel_setup);
	if (!dev)
		return NULL;

	dev_net_set(dev, net);

	nt = netdev_priv(dev);
	nt->parms = *parms;
	dev->rtnl_link_ops = &ipgre_link_ops;

	dev->mtu = ipgre_tunnel_bind_dev(dev);

	if (register_netdevice(dev) < 0)
		goto failed_free;

	/* Can use a lockless transmit, unless we generate output sequences */
	if (!(nt->parms.o_flags & GRE_SEQ))
		dev->features |= NETIF_F_LLTX;

	/* The hash table holds a device reference; it is dropped in
	 * ipgre_tunnel_uninit() when the tunnel is unlinked. */
	dev_hold(dev);
	ipgre_tunnel_link(ign, nt);
	return nt;

failed_free:
	free_netdev(dev);
	return NULL;
}
458
459 static void ipgre_tunnel_uninit(struct net_device *dev)
460 {
461         struct net *net = dev_net(dev);
462         struct ipgre_net *ign = net_generic(net, ipgre_net_id);
463
464         ipgre_tunnel_unlink(ign, netdev_priv(dev));
465         dev_put(dev);
466 }
467
468
/*
 * ICMP error handler for GRE.  @skb carries the ICMP message whose
 * payload is the (truncated) IP+GRE header we originally sent.  We
 * parse it, locate the originating tunnel, update PMTU/redirect state
 * where applicable, and otherwise record the error (err_count/err_time)
 * for rate-limited reporting on the transmit path.
 */
static void ipgre_err(struct sk_buff *skb, u32 info)
{

/* All the routers (except for Linux) return only
   8 bytes of packet payload. It means, that precise relaying of
   ICMP in the real Internet is absolutely infeasible.

   Moreover, Cisco "wise men" put GRE key to the third word
   in GRE header. It makes impossible maintaining even soft state for keyed
   GRE tunnels with enabled checksum. Tell them "thank you".

   Well, I wonder, rfc1812 was written by Cisco employee,
   what the hell these idiots break standards established
   by themselves???
 */

	const struct iphdr *iph = (const struct iphdr *)skb->data;
	__be16	     *p = (__be16 *)(skb->data+(iph->ihl<<2));
	int grehlen = (iph->ihl<<2) + 4;	/* IP header + basic GRE header */
	const int type = icmp_hdr(skb)->type;
	const int code = icmp_hdr(skb)->code;
	struct ip_tunnel *t;
	__be16 flags;

	flags = p[0];
	if (flags&(GRE_CSUM|GRE_KEY|GRE_SEQ|GRE_ROUTING|GRE_VERSION)) {
		if (flags&(GRE_VERSION|GRE_ROUTING))
			return;
		/* Account for the optional checksum word that precedes the
		 * key, so the key can be located below. */
		if (flags&GRE_KEY) {
			grehlen += 4;
			if (flags&GRE_CSUM)
				grehlen += 4;
		}
	}

	/* If only 8 bytes returned, keyed message will be dropped here */
	if (skb_headlen(skb) < grehlen)
		return;

	switch (type) {
	default:
	case ICMP_PARAMETERPROB:
		return;

	case ICMP_DEST_UNREACH:
		switch (code) {
		case ICMP_SR_FAILED:
		case ICMP_PORT_UNREACH:
			/* Impossible event. */
			return;
		default:
			/* All others are translated to HOST_UNREACH.
			   rfc2003 contains "deep thoughts" about NET_UNREACH,
			   I believe they are just ether pollution. --ANK
			 */
			break;
		}
		break;
	case ICMP_TIME_EXCEEDED:
		if (code != ICMP_EXC_TTL)
			return;
		break;

	case ICMP_REDIRECT:
		break;
	}

	rcu_read_lock();
	/* The embedded header carries our transmit addresses, so its daddr
	 * is the lookup's "remote" and its saddr the "local". */
	t = ipgre_tunnel_lookup(skb->dev, iph->daddr, iph->saddr,
				flags & GRE_KEY ?
				*(((__be32 *)p) + (grehlen / 4) - 1) : 0,
				p[1]);
	if (t == NULL)
		goto out;

	if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) {
		ipv4_update_pmtu(skb, dev_net(skb->dev), info,
				 t->parms.link, 0, IPPROTO_GRE, 0);
		goto out;
	}
	if (type == ICMP_REDIRECT) {
		ipv4_redirect(skb, dev_net(skb->dev), t->parms.link, 0,
			      IPPROTO_GRE, 0);
		goto out;
	}
	/* NBMA (daddr == 0) and multicast tunnels have no single peer to
	 * attribute the error to. */
	if (t->parms.iph.daddr == 0 ||
	    ipv4_is_multicast(t->parms.iph.daddr))
		goto out;

	/* ttl == 0 means TTL is inherited from the payload, so TTL expiry
	 * is expected (e.g. traceroute through the tunnel). */
	if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED)
		goto out;

	/* Rate-limited error bookkeeping, consumed by the xmit path. */
	if (time_before(jiffies, t->err_time + IPTUNNEL_ERR_TIMEO))
		t->err_count++;
	else
		t->err_count = 1;
	t->err_time = jiffies;
out:
	rcu_read_unlock();
}
569
570 static inline void ipgre_ecn_decapsulate(const struct iphdr *iph, struct sk_buff *skb)
571 {
572         if (INET_ECN_is_ce(iph->tos)) {
573                 if (skb->protocol == htons(ETH_P_IP)) {
574                         IP_ECN_set_ce(ip_hdr(skb));
575                 } else if (skb->protocol == htons(ETH_P_IPV6)) {
576                         IP6_ECN_set_ce(ipv6_hdr(skb));
577                 }
578         }
579 }
580
581 static inline u8
582 ipgre_ecn_encapsulate(u8 tos, const struct iphdr *old_iph, struct sk_buff *skb)
583 {
584         u8 inner = 0;
585         if (skb->protocol == htons(ETH_P_IP))
586                 inner = old_iph->tos;
587         else if (skb->protocol == htons(ETH_P_IPV6))
588                 inner = ipv6_get_dsfield((const struct ipv6hdr *)old_iph);
589         return INET_ECN_encapsulate(tos, inner);
590 }
591
/*
 * GRE receive path.  Validates the GRE header (version/routing bits,
 * checksum, key, sequence number), looks up the owning tunnel and
 * delivers the inner packet to it; sends ICMP port-unreachable when no
 * tunnel claims the packet.  Always returns 0 (packet consumed).
 */
static int ipgre_rcv(struct sk_buff *skb)
{
	const struct iphdr *iph;
	u8     *h;
	__be16    flags;
	__sum16   csum = 0;
	__be32 key = 0;
	u32    seqno = 0;
	struct ip_tunnel *tunnel;
	int    offset = 4;	/* basic GRE header: flags + protocol */
	__be16 gre_proto;

	/* 16 = the largest header parsed here: 4 fixed bytes plus optional
	 * checksum, key and sequence words. */
	if (!pskb_may_pull(skb, 16))
		goto drop_nolock;

	iph = ip_hdr(skb);
	h = skb->data;
	flags = *(__be16 *)h;

	if (flags&(GRE_CSUM|GRE_KEY|GRE_ROUTING|GRE_SEQ|GRE_VERSION)) {
		/* - Version must be 0.
		   - We do not support routing headers.
		 */
		if (flags&(GRE_VERSION|GRE_ROUTING))
			goto drop_nolock;

		if (flags&GRE_CSUM) {
			switch (skb->ip_summed) {
			case CHECKSUM_COMPLETE:
				/* csum == 0 means the hardware-verified sum
				 * already covers the GRE checksum. */
				csum = csum_fold(skb->csum);
				if (!csum)
					break;
				/* fall through */
			case CHECKSUM_NONE:
				skb->csum = 0;
				csum = __skb_checksum_complete(skb);
				skb->ip_summed = CHECKSUM_COMPLETE;
			}
			offset += 4;
		}
		if (flags&GRE_KEY) {
			key = *(__be32 *)(h + offset);
			offset += 4;
		}
		if (flags&GRE_SEQ) {
			seqno = ntohl(*(__be32 *)(h + offset));
			offset += 4;
		}
	}

	gre_proto = *(__be16 *)(h + 2);

	rcu_read_lock();
	if ((tunnel = ipgre_tunnel_lookup(skb->dev,
					  iph->saddr, iph->daddr, key,
					  gre_proto))) {
		struct pcpu_tstats *tstats;

		secpath_reset(skb);

		skb->protocol = gre_proto;
		/* WCCP version 1 and 2 protocol decoding.
		 * - Change protocol to IP
		 * - When dealing with WCCPv2, Skip extra 4 bytes in GRE header
		 */
		if (flags == 0 && gre_proto == htons(ETH_P_WCCP)) {
			skb->protocol = htons(ETH_P_IP);
			/* 0x4x = an IPv4 version nibble; anything else is the
			 * WCCPv2 redirect header, which we skip. */
			if ((*(h + offset) & 0xF0) != 0x40)
				offset += 4;
		}

		skb->mac_header = skb->network_header;
		__pskb_pull(skb, offset);
		skb_postpull_rcsum(skb, skb_transport_header(skb), offset);
		skb->pkt_type = PACKET_HOST;
#ifdef CONFIG_NET_IPGRE_BROADCAST
		if (ipv4_is_multicast(iph->daddr)) {
			/* Looped back packet, drop it! */
			if (rt_is_output_route(skb_rtable(skb)))
				goto drop;
			tunnel->dev->stats.multicast++;
			skb->pkt_type = PACKET_BROADCAST;
		}
#endif

		/* Drop if the checksum failed, or if the tunnel requires a
		 * checksum the sender did not include. */
		if (((flags&GRE_CSUM) && csum) ||
		    (!(flags&GRE_CSUM) && tunnel->parms.i_flags&GRE_CSUM)) {
			tunnel->dev->stats.rx_crc_errors++;
			tunnel->dev->stats.rx_errors++;
			goto drop;
		}
		/* Enforce monotonic sequence numbers when the tunnel is
		 * configured for sequencing (serial-wraparound-safe compare). */
		if (tunnel->parms.i_flags&GRE_SEQ) {
			if (!(flags&GRE_SEQ) ||
			    (tunnel->i_seqno && (s32)(seqno - tunnel->i_seqno) < 0)) {
				tunnel->dev->stats.rx_fifo_errors++;
				tunnel->dev->stats.rx_errors++;
				goto drop;
			}
			tunnel->i_seqno = seqno + 1;
		}

		/* Warning: All skb pointers will be invalidated! */
		if (tunnel->dev->type == ARPHRD_ETHER) {
			if (!pskb_may_pull(skb, ETH_HLEN)) {
				tunnel->dev->stats.rx_length_errors++;
				tunnel->dev->stats.rx_errors++;
				goto drop;
			}

			/* Re-read after the pull may have reallocated. */
			iph = ip_hdr(skb);
			skb->protocol = eth_type_trans(skb, tunnel->dev);
			skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
		}

		tstats = this_cpu_ptr(tunnel->dev->tstats);
		u64_stats_update_begin(&tstats->syncp);
		tstats->rx_packets++;
		tstats->rx_bytes += skb->len;
		u64_stats_update_end(&tstats->syncp);

		__skb_tunnel_rx(skb, tunnel->dev);

		skb_reset_network_header(skb);
		/* Propagate congestion marking from outer to inner header. */
		ipgre_ecn_decapsulate(iph, skb);

		netif_rx(skb);

		rcu_read_unlock();
		return 0;
	}
	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);

drop:
	rcu_read_unlock();
drop_nolock:
	kfree_skb(skb);
	return 0;
}
730
731 static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
732 {
733         struct ip_tunnel *tunnel = netdev_priv(dev);
734         struct pcpu_tstats *tstats;
735         const struct iphdr  *old_iph = ip_hdr(skb);
736         const struct iphdr  *tiph;
737         struct flowi4 fl4;
738         u8     tos;
739         __be16 df;
740         struct rtable *rt;                      /* Route to the other host */
741         struct net_device *tdev;                /* Device to other host */
742         struct iphdr  *iph;                     /* Our new IP header */
743         unsigned int max_headroom;              /* The extra header space needed */
744         int    gre_hlen;
745         __be32 dst;
746         int    mtu;
747
748         if (dev->type == ARPHRD_ETHER)
749                 IPCB(skb)->flags = 0;
750
751         if (dev->header_ops && dev->type == ARPHRD_IPGRE) {
752                 gre_hlen = 0;
753                 tiph = (const struct iphdr *)skb->data;
754         } else {
755                 gre_hlen = tunnel->hlen;
756                 tiph = &tunnel->parms.iph;
757         }
758
759         if ((dst = tiph->daddr) == 0) {
760                 /* NBMA tunnel */
761
762                 if (skb_dst(skb) == NULL) {
763                         dev->stats.tx_fifo_errors++;
764                         goto tx_error;
765                 }
766
767                 if (skb->protocol == htons(ETH_P_IP)) {
768                         rt = skb_rtable(skb);
769                         dst = rt_nexthop(rt, old_iph->daddr);
770                 }
771 #if IS_ENABLED(CONFIG_IPV6)
772                 else if (skb->protocol == htons(ETH_P_IPV6)) {
773                         const struct in6_addr *addr6;
774                         struct neighbour *neigh;
775                         bool do_tx_error_icmp;
776                         int addr_type;
777
778                         neigh = dst_neigh_lookup(skb_dst(skb), &ipv6_hdr(skb)->daddr);
779                         if (neigh == NULL)
780                                 goto tx_error;
781
782                         addr6 = (const struct in6_addr *)&neigh->primary_key;
783                         addr_type = ipv6_addr_type(addr6);
784
785                         if (addr_type == IPV6_ADDR_ANY) {
786                                 addr6 = &ipv6_hdr(skb)->daddr;
787                                 addr_type = ipv6_addr_type(addr6);
788                         }
789
790                         if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
791                                 do_tx_error_icmp = true;
792                         else {
793                                 do_tx_error_icmp = false;
794                                 dst = addr6->s6_addr32[3];
795                         }
796                         neigh_release(neigh);
797                         if (do_tx_error_icmp)
798                                 goto tx_error_icmp;
799                 }
800 #endif
801                 else
802                         goto tx_error;
803         }
804
805         tos = tiph->tos;
806         if (tos == 1) {
807                 tos = 0;
808                 if (skb->protocol == htons(ETH_P_IP))
809                         tos = old_iph->tos;
810                 else if (skb->protocol == htons(ETH_P_IPV6))
811                         tos = ipv6_get_dsfield((const struct ipv6hdr *)old_iph);
812         }
813
814         rt = ip_route_output_gre(dev_net(dev), &fl4, dst, tiph->saddr,
815                                  tunnel->parms.o_key, RT_TOS(tos),
816                                  tunnel->parms.link);
817         if (IS_ERR(rt)) {
818                 dev->stats.tx_carrier_errors++;
819                 goto tx_error;
820         }
821         tdev = rt->dst.dev;
822
823         if (tdev == dev) {
824                 ip_rt_put(rt);
825                 dev->stats.collisions++;
826                 goto tx_error;
827         }
828
829         df = tiph->frag_off;
830         if (df)
831                 mtu = dst_mtu(&rt->dst) - dev->hard_header_len - tunnel->hlen;
832         else
833                 mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;
834
835         if (skb_dst(skb))
836                 skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu);
837
838         if (skb->protocol == htons(ETH_P_IP)) {
839                 df |= (old_iph->frag_off&htons(IP_DF));
840
841                 if ((old_iph->frag_off&htons(IP_DF)) &&
842                     mtu < ntohs(old_iph->tot_len)) {
843                         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
844                         ip_rt_put(rt);
845                         goto tx_error;
846                 }
847         }
848 #if IS_ENABLED(CONFIG_IPV6)
849         else if (skb->protocol == htons(ETH_P_IPV6)) {
850                 struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb);
851
852                 if (rt6 && mtu < dst_mtu(skb_dst(skb)) && mtu >= IPV6_MIN_MTU) {
853                         if ((tunnel->parms.iph.daddr &&
854                              !ipv4_is_multicast(tunnel->parms.iph.daddr)) ||
855                             rt6->rt6i_dst.plen == 128) {
856                                 rt6->rt6i_flags |= RTF_MODIFIED;
857                                 dst_metric_set(skb_dst(skb), RTAX_MTU, mtu);
858                         }
859                 }
860
861                 if (mtu >= IPV6_MIN_MTU && mtu < skb->len - tunnel->hlen + gre_hlen) {
862                         icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
863                         ip_rt_put(rt);
864                         goto tx_error;
865                 }
866         }
867 #endif
868
869         if (tunnel->err_count > 0) {
870                 if (time_before(jiffies,
871                                 tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
872                         tunnel->err_count--;
873
874                         dst_link_failure(skb);
875                 } else
876                         tunnel->err_count = 0;
877         }
878
879         max_headroom = LL_RESERVED_SPACE(tdev) + gre_hlen + rt->dst.header_len;
880
881         if (skb_headroom(skb) < max_headroom || skb_shared(skb)||
882             (skb_cloned(skb) && !skb_clone_writable(skb, 0))) {
883                 struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom);
884                 if (max_headroom > dev->needed_headroom)
885                         dev->needed_headroom = max_headroom;
886                 if (!new_skb) {
887                         ip_rt_put(rt);
888                         dev->stats.tx_dropped++;
889                         dev_kfree_skb(skb);
890                         return NETDEV_TX_OK;
891                 }
892                 if (skb->sk)
893                         skb_set_owner_w(new_skb, skb->sk);
894                 dev_kfree_skb(skb);
895                 skb = new_skb;
896                 old_iph = ip_hdr(skb);
897         }
898
899         skb_reset_transport_header(skb);
900         skb_push(skb, gre_hlen);
901         skb_reset_network_header(skb);
902         memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
903         IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED |
904                               IPSKB_REROUTED);
905         skb_dst_drop(skb);
906         skb_dst_set(skb, &rt->dst);
907
908         /*
909          *      Push down and install the IPIP header.
910          */
911
912         iph                     =       ip_hdr(skb);
913         iph->version            =       4;
914         iph->ihl                =       sizeof(struct iphdr) >> 2;
915         iph->frag_off           =       df;
916         iph->protocol           =       IPPROTO_GRE;
917         iph->tos                =       ipgre_ecn_encapsulate(tos, old_iph, skb);
918         iph->daddr              =       fl4.daddr;
919         iph->saddr              =       fl4.saddr;
920
921         if ((iph->ttl = tiph->ttl) == 0) {
922                 if (skb->protocol == htons(ETH_P_IP))
923                         iph->ttl = old_iph->ttl;
924 #if IS_ENABLED(CONFIG_IPV6)
925                 else if (skb->protocol == htons(ETH_P_IPV6))
926                         iph->ttl = ((const struct ipv6hdr *)old_iph)->hop_limit;
927 #endif
928                 else
929                         iph->ttl = ip4_dst_hoplimit(&rt->dst);
930         }
931
932         ((__be16 *)(iph + 1))[0] = tunnel->parms.o_flags;
933         ((__be16 *)(iph + 1))[1] = (dev->type == ARPHRD_ETHER) ?
934                                    htons(ETH_P_TEB) : skb->protocol;
935
936         if (tunnel->parms.o_flags&(GRE_KEY|GRE_CSUM|GRE_SEQ)) {
937                 __be32 *ptr = (__be32 *)(((u8 *)iph) + tunnel->hlen - 4);
938
939                 if (tunnel->parms.o_flags&GRE_SEQ) {
940                         ++tunnel->o_seqno;
941                         *ptr = htonl(tunnel->o_seqno);
942                         ptr--;
943                 }
944                 if (tunnel->parms.o_flags&GRE_KEY) {
945                         *ptr = tunnel->parms.o_key;
946                         ptr--;
947                 }
948                 if (tunnel->parms.o_flags&GRE_CSUM) {
949                         *ptr = 0;
950                         *(__sum16 *)ptr = ip_compute_csum((void *)(iph+1), skb->len - sizeof(struct iphdr));
951                 }
952         }
953
954         nf_reset(skb);
955         tstats = this_cpu_ptr(dev->tstats);
956         __IPTUNNEL_XMIT(tstats, &dev->stats);
957         return NETDEV_TX_OK;
958
959 #if IS_ENABLED(CONFIG_IPV6)
960 tx_error_icmp:
961         dst_link_failure(skb);
962 #endif
963 tx_error:
964         dev->stats.tx_errors++;
965         dev_kfree_skb(skb);
966         return NETDEV_TX_OK;
967 }
968
/*
 * Guess the underlying output device for the tunnel and derive a usable
 * MTU and needed_headroom from it.  Returns the MTU the tunnel device
 * should advertise (never below 68, the minimum IPv4 datagram size
 * every host must accept).
 */
static int ipgre_tunnel_bind_dev(struct net_device *dev)
{
	struct net_device *tdev = NULL;
	struct ip_tunnel *tunnel;
	const struct iphdr *iph;
	int hlen = LL_MAX_HEADER;
	int mtu = ETH_DATA_LEN;
	int addend = sizeof(struct iphdr) + 4;	/* outer IP + basic GRE header */

	tunnel = netdev_priv(dev);
	iph = &tunnel->parms.iph;

	/* Guess output device to choose reasonable mtu and needed_headroom */

	if (iph->daddr) {
		struct flowi4 fl4;
		struct rtable *rt;

		rt = ip_route_output_gre(dev_net(dev), &fl4,
					 iph->daddr, iph->saddr,
					 tunnel->parms.o_key,
					 RT_TOS(iph->tos),
					 tunnel->parms.link);
		if (!IS_ERR(rt)) {
			tdev = rt->dst.dev;
			ip_rt_put(rt);
		}

		if (dev->type != ARPHRD_ETHER)
			dev->flags |= IFF_POINTOPOINT;
	}

	/* No route found: fall back to the explicitly bound link, if any */
	if (!tdev && tunnel->parms.link)
		tdev = __dev_get_by_index(dev_net(dev), tunnel->parms.link);

	if (tdev) {
		hlen = tdev->hard_header_len + tdev->needed_headroom;
		mtu = tdev->mtu;
	}
	dev->iflink = tunnel->parms.link;

	/* Precalculate GRE options length (each enabled option adds 4 bytes) */
	if (tunnel->parms.o_flags&(GRE_CSUM|GRE_KEY|GRE_SEQ)) {
		if (tunnel->parms.o_flags&GRE_CSUM)
			addend += 4;
		if (tunnel->parms.o_flags&GRE_KEY)
			addend += 4;
		if (tunnel->parms.o_flags&GRE_SEQ)
			addend += 4;
	}
	dev->needed_headroom = addend + hlen;
	mtu -= dev->hard_header_len + addend;

	if (mtu < 68)
		mtu = 68;

	tunnel->hlen = addend;

	return mtu;
}
1029
/*
 * ioctl handler for SIOC{GET,ADD,CHG,DEL}TUNNEL on GRE devices.
 * Tunnel parameters travel to/from user space through
 * ifr->ifr_ifru.ifru_data as a struct ip_tunnel_parm.
 */
static int
ipgre_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd)
{
	int err = 0;
	struct ip_tunnel_parm p;
	struct ip_tunnel *t;
	struct net *net = dev_net(dev);
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);

	switch (cmd) {
	case SIOCGETTUNNEL:
		t = NULL;
		/* On the fallback device, look up the tunnel user space named */
		if (dev == ign->fb_tunnel_dev) {
			if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) {
				err = -EFAULT;
				break;
			}
			t = ipgre_tunnel_locate(net, &p, 0);
		}
		if (t == NULL)
			t = netdev_priv(dev);
		memcpy(&p, &t->parms, sizeof(p));
		if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
			err = -EFAULT;
		break;

	case SIOCADDTUNNEL:
	case SIOCCHGTUNNEL:
		err = -EPERM;
		if (!capable(CAP_NET_ADMIN))
			goto done;

		err = -EFAULT;
		if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
			goto done;

		/* Accept only a plain IPv4 header carrying GRE, no option
		 * space, no fragment bits other than DF, and no GRE
		 * version/routing bits. */
		err = -EINVAL;
		if (p.iph.version != 4 || p.iph.protocol != IPPROTO_GRE ||
		    p.iph.ihl != 5 || (p.iph.frag_off&htons(~IP_DF)) ||
		    ((p.i_flags|p.o_flags)&(GRE_VERSION|GRE_ROUTING)))
			goto done;
		if (p.iph.ttl)
			p.iph.frag_off |= htons(IP_DF);

		/* Keys are meaningful only when the corresponding GRE_KEY
		 * flag is set; normalize so lookups behave consistently. */
		if (!(p.i_flags&GRE_KEY))
			p.i_key = 0;
		if (!(p.o_flags&GRE_KEY))
			p.o_key = 0;

		t = ipgre_tunnel_locate(net, &p, cmd == SIOCADDTUNNEL);

		if (dev != ign->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
			if (t != NULL) {
				/* Parameters match an existing tunnel: only
				 * acceptable if that tunnel is this device. */
				if (t->dev != dev) {
					err = -EEXIST;
					break;
				}
			} else {
				unsigned int nflags = 0;

				t = netdev_priv(dev);

				if (ipv4_is_multicast(p.iph.daddr))
					nflags = IFF_BROADCAST;
				else if (p.iph.daddr)
					nflags = IFF_POINTOPOINT;

				/* The broadcast/point-to-point nature of a
				 * live device must not change. */
				if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
					err = -EINVAL;
					break;
				}
				/* Re-hash the tunnel under its new
				 * addresses/keys. */
				ipgre_tunnel_unlink(ign, t);
				synchronize_net();
				t->parms.iph.saddr = p.iph.saddr;
				t->parms.iph.daddr = p.iph.daddr;
				t->parms.i_key = p.i_key;
				t->parms.o_key = p.o_key;
				memcpy(dev->dev_addr, &p.iph.saddr, 4);
				memcpy(dev->broadcast, &p.iph.daddr, 4);
				ipgre_tunnel_link(ign, t);
				netdev_state_change(dev);
			}
		}

		if (t) {
			err = 0;
			if (cmd == SIOCCHGTUNNEL) {
				t->parms.iph.ttl = p.iph.ttl;
				t->parms.iph.tos = p.iph.tos;
				t->parms.iph.frag_off = p.iph.frag_off;
				/* Rebind if the underlying link changed */
				if (t->parms.link != p.link) {
					t->parms.link = p.link;
					dev->mtu = ipgre_tunnel_bind_dev(dev);
					netdev_state_change(dev);
				}
			}
			if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms, sizeof(p)))
				err = -EFAULT;
		} else
			err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT);
		break;

	case SIOCDELTUNNEL:
		err = -EPERM;
		if (!capable(CAP_NET_ADMIN))
			goto done;

		if (dev == ign->fb_tunnel_dev) {
			/* Deleting through the fallback device: resolve the
			 * real target from the supplied parameters. */
			err = -EFAULT;
			if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
				goto done;
			err = -ENOENT;
			if ((t = ipgre_tunnel_locate(net, &p, 0)) == NULL)
				goto done;
			err = -EPERM;
			/* The fallback tunnel itself may not be deleted */
			if (t == netdev_priv(ign->fb_tunnel_dev))
				goto done;
			dev = t->dev;
		}
		unregister_netdevice(dev);
		err = 0;
		break;

	default:
		err = -EINVAL;
	}

done:
	return err;
}
1160
1161 static int ipgre_tunnel_change_mtu(struct net_device *dev, int new_mtu)
1162 {
1163         struct ip_tunnel *tunnel = netdev_priv(dev);
1164         if (new_mtu < 68 ||
1165             new_mtu > 0xFFF8 - dev->hard_header_len - tunnel->hlen)
1166                 return -EINVAL;
1167         dev->mtu = new_mtu;
1168         return 0;
1169 }
1170
1171 /* Nice toy. Unfortunately, useless in real life :-)
   It allows one to construct a virtual multiprotocol broadcast "LAN"
1173    over the Internet, provided multicast routing is tuned.
1174
1175
   I have no idea whether this bicycle was invented before me,
   so I had to set ARPHRD_IPGRE to a random value.
1178    I have an impression, that Cisco could make something similar,
1179    but this feature is apparently missing in IOS<=11.2(8).
1180
1181    I set up 10.66.66/24 and fec0:6666:6666::0/96 as virtual networks
1182    with broadcast 224.66.66.66. If you have access to mbone, play with me :-)
1183
1184    ping -t 255 224.66.66.66
1185
1186    If nobody answers, mbone does not work.
1187
1188    ip tunnel add Universe mode gre remote 224.66.66.66 local <Your_real_addr> ttl 255
1189    ip addr add 10.66.66.<somewhat>/24 dev Universe
1190    ifconfig Universe up
1191    ifconfig Universe add fe80::<Your_real_addr>/10
1192    ifconfig Universe add fec0:6666:6666::<Your_real_addr>/96
1193    ftp 10.66.66.66
1194    ...
1195    ftp fec0:6666:6666::193.233.7.65
1196    ...
1197
1198  */
1199
/*
 * header_ops->create for GRE devices acting as a broadcast "LAN".
 * Prebuilds the outer IP header plus GRE flags/protocol words in front
 * of the payload.  Returns the header length when the destination is
 * known; a negative length signals that the destination is still
 * unresolved, per the hard_header convention.
 */
static int ipgre_header(struct sk_buff *skb, struct net_device *dev,
			unsigned short type,
			const void *daddr, const void *saddr, unsigned int len)
{
	struct ip_tunnel *t = netdev_priv(dev);
	struct iphdr *iph = (struct iphdr *)skb_push(skb, t->hlen);
	__be16 *p = (__be16 *)(iph+1);	/* GRE flags + protocol follow the IP header */

	/* Start from the template outer header configured for the tunnel */
	memcpy(iph, &t->parms.iph, sizeof(struct iphdr));
	p[0]		= t->parms.o_flags;
	p[1]		= htons(type);

	/*
	 *	Set the source hardware address.
	 */

	if (saddr)
		memcpy(&iph->saddr, saddr, 4);
	if (daddr)
		memcpy(&iph->daddr, daddr, 4);
	if (iph->daddr)
		return t->hlen;

	/* Destination not yet known: negative return tells the caller */
	return -t->hlen;
}
1225
1226 static int ipgre_header_parse(const struct sk_buff *skb, unsigned char *haddr)
1227 {
1228         const struct iphdr *iph = (const struct iphdr *) skb_mac_header(skb);
1229         memcpy(haddr, &iph->saddr, 4);
1230         return 4;
1231 }
1232
/* Link-layer header ops for point-to-multipoint / NBMA GRE devices. */
static const struct header_ops ipgre_header_ops = {
	.create = ipgre_header,
	.parse	= ipgre_header_parse,
};
1237
1238 #ifdef CONFIG_NET_IPGRE_BROADCAST
/*
 * ndo_open for broadcast-capable GRE tunnels: when the tunnel
 * destination is multicast, join that group on the underlying device.
 */
static int ipgre_open(struct net_device *dev)
{
	struct ip_tunnel *t = netdev_priv(dev);

	if (ipv4_is_multicast(t->parms.iph.daddr)) {
		struct flowi4 fl4;
		struct rtable *rt;

		/* Route towards the multicast destination to discover the
		 * physical device the group join must happen on. */
		rt = ip_route_output_gre(dev_net(dev), &fl4,
					 t->parms.iph.daddr,
					 t->parms.iph.saddr,
					 t->parms.o_key,
					 RT_TOS(t->parms.iph.tos),
					 t->parms.link);
		if (IS_ERR(rt))
			return -EADDRNOTAVAIL;
		dev = rt->dst.dev;	/* from here on, dev is the lower device */
		ip_rt_put(rt);
		if (__in_dev_get_rtnl(dev) == NULL)
			return -EADDRNOTAVAIL;
		t->mlink = dev->ifindex;	/* remembered for ipgre_close() */
		ip_mc_inc_group(__in_dev_get_rtnl(dev), t->parms.iph.daddr);
	}
	return 0;
}
1264
1265 static int ipgre_close(struct net_device *dev)
1266 {
1267         struct ip_tunnel *t = netdev_priv(dev);
1268
1269         if (ipv4_is_multicast(t->parms.iph.daddr) && t->mlink) {
1270                 struct in_device *in_dev;
1271                 in_dev = inetdev_by_index(dev_net(dev), t->mlink);
1272                 if (in_dev)
1273                         ip_mc_dec_group(in_dev, t->parms.iph.daddr);
1274         }
1275         return 0;
1276 }
1277
1278 #endif
1279
/* Device operations for plain (layer-3) GRE tunnel devices. */
static const struct net_device_ops ipgre_netdev_ops = {
	.ndo_init		= ipgre_tunnel_init,
	.ndo_uninit		= ipgre_tunnel_uninit,
#ifdef CONFIG_NET_IPGRE_BROADCAST
	.ndo_open		= ipgre_open,
	.ndo_stop		= ipgre_close,
#endif
	.ndo_start_xmit		= ipgre_tunnel_xmit,
	.ndo_do_ioctl		= ipgre_tunnel_ioctl,
	.ndo_change_mtu		= ipgre_tunnel_change_mtu,
	.ndo_get_stats64	= ipgre_get_stats64,
};
1292
/* Device destructor: release per-cpu stats before freeing the netdev. */
static void ipgre_dev_free(struct net_device *dev)
{
	free_percpu(dev->tstats);
	free_netdev(dev);
}
1298
/*
 * rtnl_link setup for layer-3 GRE devices: defaults applied before
 * ndo_init runs.  The "+ 4" accounts for the basic GRE header that
 * follows the outer IP header.
 */
static void ipgre_tunnel_setup(struct net_device *dev)
{
	dev->netdev_ops		= &ipgre_netdev_ops;
	dev->destructor		= ipgre_dev_free;

	dev->type		= ARPHRD_IPGRE;
	dev->needed_headroom	= LL_MAX_HEADER + sizeof(struct iphdr) + 4;
	dev->mtu		= ETH_DATA_LEN - sizeof(struct iphdr) - 4;
	dev->flags		= IFF_NOARP;
	dev->iflink		= 0;
	dev->addr_len		= 4;	/* addresses are IPv4 endpoints */
	dev->features		|= NETIF_F_NETNS_LOCAL;
	dev->priv_flags		&= ~IFF_XMIT_DST_RELEASE;
}
1313
/*
 * ndo_init for layer-3 GRE devices: publish the tunnel endpoints as the
 * device's link-layer addresses, select header_ops for broadcast/NBMA
 * modes, and allocate per-cpu statistics.
 */
static int ipgre_tunnel_init(struct net_device *dev)
{
	struct ip_tunnel *tunnel;
	struct iphdr *iph;

	tunnel = netdev_priv(dev);
	iph = &tunnel->parms.iph;

	tunnel->dev = dev;
	strcpy(tunnel->parms.name, dev->name);

	/* The tunnel endpoints double as the device "hardware" addresses */
	memcpy(dev->dev_addr, &tunnel->parms.iph.saddr, 4);
	memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4);

	if (iph->daddr) {
#ifdef CONFIG_NET_IPGRE_BROADCAST
		if (ipv4_is_multicast(iph->daddr)) {
			/* A multicast destination requires a fixed source */
			if (!iph->saddr)
				return -EINVAL;
			dev->flags = IFF_BROADCAST;
			dev->header_ops = &ipgre_header_ops;
		}
#endif
	} else
		/* NBMA mode: destination supplied per-packet via header_ops */
		dev->header_ops = &ipgre_header_ops;

	dev->tstats = alloc_percpu(struct pcpu_tstats);
	if (!dev->tstats)
		return -ENOMEM;

	return 0;
}
1346
/*
 * Initialize the per-namespace fallback "gre0" device, which catches
 * GRE packets that match no configured tunnel.
 */
static void ipgre_fb_tunnel_init(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct iphdr *iph = &tunnel->parms.iph;

	tunnel->dev = dev;
	strcpy(tunnel->parms.name, dev->name);

	/* Template outer header: IPv4 carrying GRE, no options */
	iph->version		= 4;
	iph->protocol		= IPPROTO_GRE;
	iph->ihl		= 5;
	tunnel->hlen		= sizeof(struct iphdr) + 4;	/* + basic GRE header */

	/* Extra reference: the fallback device lives as long as the netns */
	dev_hold(dev);
}
1362
1363
/* Receive/error handlers registered with the shared GRE demultiplexer. */
static const struct gre_protocol ipgre_protocol = {
	.handler     = ipgre_rcv,
	.err_handler = ipgre_err,
};
1368
/*
 * Queue every tunnel of a namespace for unregistration.  Walks all four
 * hash tables (prio 0-3) under RTNL; actual teardown happens later via
 * unregister_netdevice_many() on @head.
 */
static void ipgre_destroy_tunnels(struct ipgre_net *ign, struct list_head *head)
{
	int prio;

	for (prio = 0; prio < 4; prio++) {
		int h;
		for (h = 0; h < HASH_SIZE; h++) {
			struct ip_tunnel *t;

			t = rtnl_dereference(ign->tunnels[prio][h]);

			while (t != NULL) {
				unregister_netdevice_queue(t->dev, head);
				t = rtnl_dereference(t->next);
			}
		}
	}
}
1387
/*
 * Per-namespace init: create and register the fallback "gre0" device
 * and hash it into the wildcard tunnel chain so the receive path can
 * find it.
 */
static int __net_init ipgre_init_net(struct net *net)
{
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);
	int err;

	ign->fb_tunnel_dev = alloc_netdev(sizeof(struct ip_tunnel), "gre0",
					   ipgre_tunnel_setup);
	if (!ign->fb_tunnel_dev) {
		err = -ENOMEM;
		goto err_alloc_dev;
	}
	dev_net_set(ign->fb_tunnel_dev, net);

	ipgre_fb_tunnel_init(ign->fb_tunnel_dev);
	ign->fb_tunnel_dev->rtnl_link_ops = &ipgre_link_ops;

	if ((err = register_netdev(ign->fb_tunnel_dev)))
		goto err_reg_dev;

	/* Publish the fallback tunnel to the (RCU) receive path */
	rcu_assign_pointer(ign->tunnels_wc[0],
			   netdev_priv(ign->fb_tunnel_dev));
	return 0;

err_reg_dev:
	/* Never registered, so free directly rather than unregistering */
	ipgre_dev_free(ign->fb_tunnel_dev);
err_alloc_dev:
	return err;
}
1416
1417 static void __net_exit ipgre_exit_net(struct net *net)
1418 {
1419         struct ipgre_net *ign;
1420         LIST_HEAD(list);
1421
1422         ign = net_generic(net, ipgre_net_id);
1423         rtnl_lock();
1424         ipgre_destroy_tunnels(ign, &list);
1425         unregister_netdevice_many(&list);
1426         rtnl_unlock();
1427 }
1428
/* Pernet hooks: one struct ipgre_net (hash tables + fallback dev) per netns. */
static struct pernet_operations ipgre_net_ops = {
	.init = ipgre_init_net,
	.exit = ipgre_exit_net,
	.id   = &ipgre_net_id,
	.size = sizeof(struct ipgre_net),
};
1435
1436 static int ipgre_tunnel_validate(struct nlattr *tb[], struct nlattr *data[])
1437 {
1438         __be16 flags;
1439
1440         if (!data)
1441                 return 0;
1442
1443         flags = 0;
1444         if (data[IFLA_GRE_IFLAGS])
1445                 flags |= nla_get_be16(data[IFLA_GRE_IFLAGS]);
1446         if (data[IFLA_GRE_OFLAGS])
1447                 flags |= nla_get_be16(data[IFLA_GRE_OFLAGS]);
1448         if (flags & (GRE_VERSION|GRE_ROUTING))
1449                 return -EINVAL;
1450
1451         return 0;
1452 }
1453
1454 static int ipgre_tap_validate(struct nlattr *tb[], struct nlattr *data[])
1455 {
1456         __be32 daddr;
1457
1458         if (tb[IFLA_ADDRESS]) {
1459                 if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN)
1460                         return -EINVAL;
1461                 if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS])))
1462                         return -EADDRNOTAVAIL;
1463         }
1464
1465         if (!data)
1466                 goto out;
1467
1468         if (data[IFLA_GRE_REMOTE]) {
1469                 memcpy(&daddr, nla_data(data[IFLA_GRE_REMOTE]), 4);
1470                 if (!daddr)
1471                         return -EINVAL;
1472         }
1473
1474 out:
1475         return ipgre_tunnel_validate(tb, data);
1476 }
1477
/*
 * Translate IFLA_GRE_* netlink attributes into a struct ip_tunnel_parm.
 * Attributes that are absent keep their zeroed defaults; path MTU
 * discovery (outer DF bit) is on unless explicitly disabled.
 */
static void ipgre_netlink_parms(struct nlattr *data[],
				struct ip_tunnel_parm *parms)
{
	memset(parms, 0, sizeof(*parms));

	parms->iph.protocol = IPPROTO_GRE;

	if (!data)
		return;

	if (data[IFLA_GRE_LINK])
		parms->link = nla_get_u32(data[IFLA_GRE_LINK]);

	if (data[IFLA_GRE_IFLAGS])
		parms->i_flags = nla_get_be16(data[IFLA_GRE_IFLAGS]);

	if (data[IFLA_GRE_OFLAGS])
		parms->o_flags = nla_get_be16(data[IFLA_GRE_OFLAGS]);

	if (data[IFLA_GRE_IKEY])
		parms->i_key = nla_get_be32(data[IFLA_GRE_IKEY]);

	if (data[IFLA_GRE_OKEY])
		parms->o_key = nla_get_be32(data[IFLA_GRE_OKEY]);

	if (data[IFLA_GRE_LOCAL])
		parms->iph.saddr = nla_get_be32(data[IFLA_GRE_LOCAL]);

	if (data[IFLA_GRE_REMOTE])
		parms->iph.daddr = nla_get_be32(data[IFLA_GRE_REMOTE]);

	if (data[IFLA_GRE_TTL])
		parms->iph.ttl = nla_get_u8(data[IFLA_GRE_TTL]);

	if (data[IFLA_GRE_TOS])
		parms->iph.tos = nla_get_u8(data[IFLA_GRE_TOS]);

	/* Default to path MTU discovery: set DF on the outer header */
	if (!data[IFLA_GRE_PMTUDISC] || nla_get_u8(data[IFLA_GRE_PMTUDISC]))
		parms->iph.frag_off = htons(IP_DF);
}
1518
/*
 * ndo_init for Ethernet-over-GRE (gretap) devices: bind to the
 * underlying link and allocate per-cpu statistics.
 */
static int ipgre_tap_init(struct net_device *dev)
{
	struct ip_tunnel *tunnel;

	tunnel = netdev_priv(dev);

	tunnel->dev = dev;
	strcpy(tunnel->parms.name, dev->name);

	ipgre_tunnel_bind_dev(dev);

	dev->tstats = alloc_percpu(struct pcpu_tstats);
	if (!dev->tstats)
		return -ENOMEM;

	return 0;
}
1536
/* Device operations for Ethernet-over-GRE (gretap) devices. */
static const struct net_device_ops ipgre_tap_netdev_ops = {
	.ndo_init		= ipgre_tap_init,
	.ndo_uninit		= ipgre_tunnel_uninit,
	.ndo_start_xmit		= ipgre_tunnel_xmit,
	.ndo_set_mac_address	= eth_mac_addr,
	.ndo_validate_addr	= eth_validate_addr,
	.ndo_change_mtu		= ipgre_tunnel_change_mtu,
	.ndo_get_stats64	= ipgre_get_stats64,
};
1546
/* rtnl_link setup for gretap: an Ethernet device tunnelled over GRE. */
static void ipgre_tap_setup(struct net_device *dev)
{

	ether_setup(dev);

	dev->netdev_ops		= &ipgre_tap_netdev_ops;
	dev->destructor		= ipgre_dev_free;

	dev->iflink		= 0;
	dev->features		|= NETIF_F_NETNS_LOCAL;
}
1558
/*
 * rtnl newlink: create a GRE/gretap device from netlink attributes,
 * register it and link it into the tunnel hash tables.
 */
static int ipgre_newlink(struct net *src_net, struct net_device *dev, struct nlattr *tb[],
			 struct nlattr *data[])
{
	struct ip_tunnel *nt;
	struct net *net = dev_net(dev);
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);
	int mtu;
	int err;

	nt = netdev_priv(dev);
	ipgre_netlink_parms(data, &nt->parms);

	/* Refuse a duplicate of an existing tunnel's parameters */
	if (ipgre_tunnel_find(net, &nt->parms, dev->type))
		return -EEXIST;

	if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
		eth_hw_addr_random(dev);

	mtu = ipgre_tunnel_bind_dev(dev);
	if (!tb[IFLA_MTU])
		dev->mtu = mtu;	/* no explicit MTU given: use the derived one */

	/* Can use a lockless transmit, unless we generate output sequences */
	if (!(nt->parms.o_flags & GRE_SEQ))
		dev->features |= NETIF_F_LLTX;

	err = register_netdevice(dev);
	if (err)
		goto out;

	dev_hold(dev);
	ipgre_tunnel_link(ign, nt);

out:
	return err;
}
1595
/*
 * rtnl changelink: update an existing GRE device's parameters.
 * Rejects changes on the fallback device and parameter sets that
 * collide with a different existing tunnel.
 */
static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[],
			    struct nlattr *data[])
{
	struct ip_tunnel *t, *nt;
	struct net *net = dev_net(dev);
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);
	struct ip_tunnel_parm p;
	int mtu;

	if (dev == ign->fb_tunnel_dev)
		return -EINVAL;

	nt = netdev_priv(dev);
	ipgre_netlink_parms(data, &p);

	t = ipgre_tunnel_locate(net, &p, 0);

	if (t) {
		/* New parameters already in use: only fine if by us */
		if (t->dev != dev)
			return -EEXIST;
	} else {
		t = nt;

		if (dev->type != ARPHRD_ETHER) {
			unsigned int nflags = 0;

			if (ipv4_is_multicast(p.iph.daddr))
				nflags = IFF_BROADCAST;
			else if (p.iph.daddr)
				nflags = IFF_POINTOPOINT;

			/* The broadcast/point-to-point nature of a live
			 * device must not change. */
			if ((dev->flags ^ nflags) &
			    (IFF_POINTOPOINT | IFF_BROADCAST))
				return -EINVAL;
		}

		/* Re-hash the tunnel under its new addresses/key */
		ipgre_tunnel_unlink(ign, t);
		t->parms.iph.saddr = p.iph.saddr;
		t->parms.iph.daddr = p.iph.daddr;
		t->parms.i_key = p.i_key;
		if (dev->type != ARPHRD_ETHER) {
			memcpy(dev->dev_addr, &p.iph.saddr, 4);
			memcpy(dev->broadcast, &p.iph.daddr, 4);
		}
		ipgre_tunnel_link(ign, t);
		netdev_state_change(dev);
	}

	t->parms.o_key = p.o_key;
	t->parms.iph.ttl = p.iph.ttl;
	t->parms.iph.tos = p.iph.tos;
	t->parms.iph.frag_off = p.iph.frag_off;

	/* Rebind if the underlying link changed */
	if (t->parms.link != p.link) {
		t->parms.link = p.link;
		mtu = ipgre_tunnel_bind_dev(dev);
		if (!tb[IFLA_MTU])
			dev->mtu = mtu;
		netdev_state_change(dev);
	}

	return 0;
}
1659
/* Worst-case size of the netlink attributes emitted by ipgre_fill_info(). */
static size_t ipgre_get_size(const struct net_device *dev)
{
	return
		/* IFLA_GRE_LINK */
		nla_total_size(4) +
		/* IFLA_GRE_IFLAGS */
		nla_total_size(2) +
		/* IFLA_GRE_OFLAGS */
		nla_total_size(2) +
		/* IFLA_GRE_IKEY */
		nla_total_size(4) +
		/* IFLA_GRE_OKEY */
		nla_total_size(4) +
		/* IFLA_GRE_LOCAL */
		nla_total_size(4) +
		/* IFLA_GRE_REMOTE */
		nla_total_size(4) +
		/* IFLA_GRE_TTL */
		nla_total_size(1) +
		/* IFLA_GRE_TOS */
		nla_total_size(1) +
		/* IFLA_GRE_PMTUDISC */
		nla_total_size(1) +
		0;
}
1685
/* Dump the tunnel configuration as IFLA_GRE_* netlink attributes. */
static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev)
{
	struct ip_tunnel *t = netdev_priv(dev);
	struct ip_tunnel_parm *p = &t->parms;

	if (nla_put_u32(skb, IFLA_GRE_LINK, p->link) ||
	    nla_put_be16(skb, IFLA_GRE_IFLAGS, p->i_flags) ||
	    nla_put_be16(skb, IFLA_GRE_OFLAGS, p->o_flags) ||
	    nla_put_be32(skb, IFLA_GRE_IKEY, p->i_key) ||
	    nla_put_be32(skb, IFLA_GRE_OKEY, p->o_key) ||
	    nla_put_be32(skb, IFLA_GRE_LOCAL, p->iph.saddr) ||
	    nla_put_be32(skb, IFLA_GRE_REMOTE, p->iph.daddr) ||
	    nla_put_u8(skb, IFLA_GRE_TTL, p->iph.ttl) ||
	    nla_put_u8(skb, IFLA_GRE_TOS, p->iph.tos) ||
	    nla_put_u8(skb, IFLA_GRE_PMTUDISC,
		       !!(p->iph.frag_off & htons(IP_DF))))
		goto nla_put_failure;
	return 0;

nla_put_failure:
	return -EMSGSIZE;
}
1708
/*
 * Netlink attribute validation policy shared by the "gre" and "gretap"
 * link types.  LOCAL/REMOTE carry raw IPv4 addresses, so they are
 * length-checked against the iphdr fields rather than typed.
 */
static const struct nla_policy ipgre_policy[IFLA_GRE_MAX + 1] = {
	[IFLA_GRE_LINK]		= { .type = NLA_U32 },
	[IFLA_GRE_IFLAGS]	= { .type = NLA_U16 },
	[IFLA_GRE_OFLAGS]	= { .type = NLA_U16 },
	[IFLA_GRE_IKEY]		= { .type = NLA_U32 },
	[IFLA_GRE_OKEY]		= { .type = NLA_U32 },
	[IFLA_GRE_LOCAL]	= { .len = FIELD_SIZEOF(struct iphdr, saddr) },
	[IFLA_GRE_REMOTE]	= { .len = FIELD_SIZEOF(struct iphdr, daddr) },
	[IFLA_GRE_TTL]		= { .type = NLA_U8 },
	[IFLA_GRE_TOS]		= { .type = NLA_U8 },
	[IFLA_GRE_PMTUDISC]	= { .type = NLA_U8 },
};
1721
/* rtnetlink ops for layer-3 "gre" tunnel devices (ARPHRD_IPGRE). */
static struct rtnl_link_ops ipgre_link_ops __read_mostly = {
	.kind		= "gre",
	.maxtype	= IFLA_GRE_MAX,
	.policy		= ipgre_policy,
	.priv_size	= sizeof(struct ip_tunnel),
	.setup		= ipgre_tunnel_setup,
	.validate	= ipgre_tunnel_validate,
	.newlink	= ipgre_newlink,
	.changelink	= ipgre_changelink,
	.get_size	= ipgre_get_size,
	.fill_info	= ipgre_fill_info,
};
1734
/*
 * rtnetlink ops for "gretap" (Ethernet-over-GRE) devices.  Only setup
 * and validate differ from the plain "gre" type; the netlink attribute
 * handling is shared with ipgre_link_ops above.
 */
static struct rtnl_link_ops ipgre_tap_ops __read_mostly = {
	.kind		= "gretap",
	.maxtype	= IFLA_GRE_MAX,
	.policy		= ipgre_policy,
	.priv_size	= sizeof(struct ip_tunnel),
	.setup		= ipgre_tap_setup,
	.validate	= ipgre_tap_validate,
	.newlink	= ipgre_newlink,
	.changelink	= ipgre_changelink,
	.get_size	= ipgre_get_size,
	.fill_info	= ipgre_fill_info,
};
1747
1748 /*
1749  *      And now the modules code and kernel interface.
1750  */
1751
1752 static int __init ipgre_init(void)
1753 {
1754         int err;
1755
1756         pr_info("GRE over IPv4 tunneling driver\n");
1757
1758         err = register_pernet_device(&ipgre_net_ops);
1759         if (err < 0)
1760                 return err;
1761
1762         err = gre_add_protocol(&ipgre_protocol, GREPROTO_CISCO);
1763         if (err < 0) {
1764                 pr_info("%s: can't add protocol\n", __func__);
1765                 goto add_proto_failed;
1766         }
1767
1768         err = rtnl_link_register(&ipgre_link_ops);
1769         if (err < 0)
1770                 goto rtnl_link_failed;
1771
1772         err = rtnl_link_register(&ipgre_tap_ops);
1773         if (err < 0)
1774                 goto tap_ops_failed;
1775
1776 out:
1777         return err;
1778
1779 tap_ops_failed:
1780         rtnl_link_unregister(&ipgre_link_ops);
1781 rtnl_link_failed:
1782         gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO);
1783 add_proto_failed:
1784         unregister_pernet_device(&ipgre_net_ops);
1785         goto out;
1786 }
1787
/*
 * Module exit: tear down in the exact reverse order of ipgre_init()
 * (link types, then the GRE protocol hook, then per-net state).
 */
static void __exit ipgre_fini(void)
{
	rtnl_link_unregister(&ipgre_tap_ops);
	rtnl_link_unregister(&ipgre_link_ops);
	/* Removal can only fail if someone else replaced our handler. */
	if (gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO) < 0)
		pr_info("%s: can't remove protocol\n", __func__);
	unregister_pernet_device(&ipgre_net_ops);
}
1796
module_init(ipgre_init);
module_exit(ipgre_fini);
MODULE_LICENSE("GPL");
/* Auto-load on "ip link add type gre|gretap" and on the gre0 netdev. */
MODULE_ALIAS_RTNL_LINK("gre");
MODULE_ALIAS_RTNL_LINK("gretap");
MODULE_ALIAS_NETDEV("gre0");