714b6a80361df2cb616f2682cae04c20ad463b68
[linux-3.10.git] / net / ipv4 / ip_gre.c
1 /*
2  *      Linux NET3:     GRE over IP protocol decoder.
3  *
4  *      Authors: Alexey Kuznetsov (kuznet@ms2.inr.ac.ru)
5  *
6  *      This program is free software; you can redistribute it and/or
7  *      modify it under the terms of the GNU General Public License
8  *      as published by the Free Software Foundation; either version
9  *      2 of the License, or (at your option) any later version.
10  *
11  */
12
13 #include <linux/capability.h>
14 #include <linux/module.h>
15 #include <linux/types.h>
16 #include <linux/kernel.h>
17 #include <linux/slab.h>
18 #include <asm/uaccess.h>
19 #include <linux/skbuff.h>
20 #include <linux/netdevice.h>
21 #include <linux/in.h>
22 #include <linux/tcp.h>
23 #include <linux/udp.h>
24 #include <linux/if_arp.h>
25 #include <linux/mroute.h>
26 #include <linux/init.h>
27 #include <linux/in6.h>
28 #include <linux/inetdevice.h>
29 #include <linux/igmp.h>
30 #include <linux/netfilter_ipv4.h>
31 #include <linux/etherdevice.h>
32 #include <linux/if_ether.h>
33
34 #include <net/sock.h>
35 #include <net/ip.h>
36 #include <net/icmp.h>
37 #include <net/protocol.h>
38 #include <net/ipip.h>
39 #include <net/arp.h>
40 #include <net/checksum.h>
41 #include <net/dsfield.h>
42 #include <net/inet_ecn.h>
43 #include <net/xfrm.h>
44 #include <net/net_namespace.h>
45 #include <net/netns/generic.h>
46 #include <net/rtnetlink.h>
47 #include <net/gre.h>
48
49 #ifdef CONFIG_IPV6
50 #include <net/ipv6.h>
51 #include <net/ip6_fib.h>
52 #include <net/ip6_route.h>
53 #endif
54
55 /*
56    Problems & solutions
57    --------------------
58
59    1. The most important issue is detecting local dead loops.
60    They would cause complete host lockup in transmit, which
61    would be "resolved" by stack overflow or, if queueing is enabled,
62    with infinite looping in net_bh.
63
64    We cannot track such dead loops during route installation,
65    it is infeasible task. The most general solutions would be
66    to keep skb->encapsulation counter (sort of local ttl),
67    and silently drop packet when it expires. It is the best
68    solution, but it supposes maintaining new variable in ALL
69    skb, even if no tunneling is used.
70
71    Current solution: HARD_TX_LOCK lock breaks dead loops.
72
73
74
75    2. Networking dead loops would not kill routers, but would really
76    kill network. IP hop limit plays role of "t->recursion" in this case,
77    if we copy it from packet being encapsulated to upper header.
78    It is very good solution, but it introduces two problems:
79
80    - Routing protocols, using packets with ttl=1 (OSPF, RIP2),
81      do not work over tunnels.
82    - traceroute does not work. I planned to relay ICMP from tunnel,
83      so that this problem would be solved and traceroute output
84      would even more informative. This idea appeared to be wrong:
85      only Linux complies to rfc1812 now (yes, guys, Linux is the only
86      true router now :-)), all routers (at least, in neighbourhood of mine)
87      return only 8 bytes of payload. It is the end.
88
89    Hence, if we want that OSPF worked or traceroute said something reasonable,
90    we should search for another solution.
91
92    One of them is to parse packet trying to detect inner encapsulation
93    made by our node. It is difficult or even impossible, especially,
94    taking into account fragmentation. To be short, it is not a solution at all.
95
96    Current solution: The solution was UNEXPECTEDLY SIMPLE.
97    We force DF flag on tunnels with preconfigured hop limit,
98    that is ALL. :-) Well, it does not remove the problem completely,
99    but exponential growth of network traffic is changed to linear
100    (branches, that exceed pmtu are pruned) and tunnel mtu
101    quickly degrades to value <68, where looping stops.
102    Yes, it is not good if there exists a router in the loop,
103    which does not force DF, even when encapsulating packets have DF set.
104    But it is not our problem! Nobody could accuse us, we made
105    all that we could make. Even if it is your gated who injected
106    fatal route to network, even if it were you who configured
107    fatal static route: you are innocent. :-)
108
109
110
111    3. Really, ipv4/ipip.c, ipv4/ip_gre.c and ipv6/sit.c contain
112    practically identical code. It would be good to glue them
113    together, but it is not very evident, how to make them modular.
114    sit is integral part of IPv6, ipip and gre are naturally modular.
115    We could extract common parts (hash table, ioctl etc)
116    to a separate module (ip_tunnel.c).
117
118    Alexey Kuznetsov.
119  */
120
121 static struct rtnl_link_ops ipgre_link_ops __read_mostly;
122 static int ipgre_tunnel_init(struct net_device *dev);
123 static void ipgre_tunnel_setup(struct net_device *dev);
124 static int ipgre_tunnel_bind_dev(struct net_device *dev);
125
126 /* Fallback tunnel: no source, no destination, no key, no options */
127
128 #define HASH_SIZE  16
129
130 static int ipgre_net_id __read_mostly; /* id passed to net_generic() to fetch the struct ipgre_net of a namespace */
131 struct ipgre_net {
132         struct ip_tunnel __rcu *tunnels[4][HASH_SIZE]; /* [table][bucket]: table 0..3 as listed in the "4 hash tables" note below, bucket from HASH() */
133
134         struct net_device *fb_tunnel_dev; /* fallback device; ipgre_tunnel_lookup() returns its tunnel when nothing else matched and it is IFF_UP */
135 };
136
137 /* Tunnel hash table */
138
139 /*
140    4 hash tables:
141
142    3: (remote,local)
143    2: (remote,*)
144    1: (*,local)
145    0: (*,*)
146
147    We require exact key match i.e. if a key is present in packet
148    it will match only tunnel with the same key; if it is not present,
149    it will match only keyless tunnel.
150
151    All keyless packets, if not matched by configured keyless tunnels
152    will match fallback tunnel.
153  */
154
/*
 * Fold a 32-bit value (an IPv4 address or a GRE key) into a 4-bit bucket
 * index by XOR-ing it with itself shifted right by four and keeping the
 * low nibble.  The argument is fully parenthesised so that expression
 * arguments (e.g. "a | b") hash the same way a plain variable would.
 * Note that the argument is still evaluated twice.
 */
#define HASH(addr) ((((__force u32)(addr)) ^ (((__force u32)(addr)) >> 4)) & 0xF)
156
157 #define tunnels_r_l     tunnels[3] /* table for tunnels with both remote and local set */
158 #define tunnels_r       tunnels[2] /* table for tunnels with only remote set */
159 #define tunnels_l       tunnels[1] /* table for tunnels with only local set */
160 #define tunnels_wc      tunnels[0] /* wildcard table: neither address set */
161 /*
162  * Locking : hash tables are protected by RCU and RTNL
163  */
164
165 #define for_each_ip_tunnel_rcu(start) \
166         for (t = rcu_dereference(start); t; t = rcu_dereference(t->next)) /* walks a bucket chain using a variable named 't' that the caller must declare */
167
168 /* Given src, dst and key, find appropriate for input tunnel. */
169
170 static struct ip_tunnel * ipgre_tunnel_lookup(struct net_device *dev,
171                                               __be32 remote, __be32 local,
172                                               __be32 key, __be16 gre_proto)
173 {
174         struct net *net = dev_net(dev);
175         int link = dev->ifindex;
176         unsigned int h0 = HASH(remote);
177         unsigned int h1 = HASH(key);
178         struct ip_tunnel *t, *cand = NULL;
179         struct ipgre_net *ign = net_generic(net, ipgre_net_id);
180         int dev_type = (gre_proto == htons(ETH_P_TEB)) ?
181                        ARPHRD_ETHER : ARPHRD_IPGRE;
182         int score, cand_score = 4;
183
184         for_each_ip_tunnel_rcu(ign->tunnels_r_l[h0 ^ h1]) {
185                 if (local != t->parms.iph.saddr ||
186                     remote != t->parms.iph.daddr ||
187                     key != t->parms.i_key ||
188                     !(t->dev->flags & IFF_UP))
189                         continue;
190
191                 if (t->dev->type != ARPHRD_IPGRE &&
192                     t->dev->type != dev_type)
193                         continue;
194
195                 score = 0;
196                 if (t->parms.link != link)
197                         score |= 1;
198                 if (t->dev->type != dev_type)
199                         score |= 2;
200                 if (score == 0)
201                         return t;
202
203                 if (score < cand_score) {
204                         cand = t;
205                         cand_score = score;
206                 }
207         }
208
209         for_each_ip_tunnel_rcu(ign->tunnels_r[h0 ^ h1]) {
210                 if (remote != t->parms.iph.daddr ||
211                     key != t->parms.i_key ||
212                     !(t->dev->flags & IFF_UP))
213                         continue;
214
215                 if (t->dev->type != ARPHRD_IPGRE &&
216                     t->dev->type != dev_type)
217                         continue;
218
219                 score = 0;
220                 if (t->parms.link != link)
221                         score |= 1;
222                 if (t->dev->type != dev_type)
223                         score |= 2;
224                 if (score == 0)
225                         return t;
226
227                 if (score < cand_score) {
228                         cand = t;
229                         cand_score = score;
230                 }
231         }
232
233         for_each_ip_tunnel_rcu(ign->tunnels_l[h1]) {
234                 if ((local != t->parms.iph.saddr &&
235                      (local != t->parms.iph.daddr ||
236                       !ipv4_is_multicast(local))) ||
237                     key != t->parms.i_key ||
238                     !(t->dev->flags & IFF_UP))
239                         continue;
240
241                 if (t->dev->type != ARPHRD_IPGRE &&
242                     t->dev->type != dev_type)
243                         continue;
244
245                 score = 0;
246                 if (t->parms.link != link)
247                         score |= 1;
248                 if (t->dev->type != dev_type)
249                         score |= 2;
250                 if (score == 0)
251                         return t;
252
253                 if (score < cand_score) {
254                         cand = t;
255                         cand_score = score;
256                 }
257         }
258
259         for_each_ip_tunnel_rcu(ign->tunnels_wc[h1]) {
260                 if (t->parms.i_key != key ||
261                     !(t->dev->flags & IFF_UP))
262                         continue;
263
264                 if (t->dev->type != ARPHRD_IPGRE &&
265                     t->dev->type != dev_type)
266                         continue;
267
268                 score = 0;
269                 if (t->parms.link != link)
270                         score |= 1;
271                 if (t->dev->type != dev_type)
272                         score |= 2;
273                 if (score == 0)
274                         return t;
275
276                 if (score < cand_score) {
277                         cand = t;
278                         cand_score = score;
279                 }
280         }
281
282         if (cand != NULL)
283                 return cand;
284
285         dev = ign->fb_tunnel_dev;
286         if (dev->flags & IFF_UP)
287                 return netdev_priv(dev);
288
289         return NULL;
290 }
291
292 static struct ip_tunnel __rcu **__ipgre_bucket(struct ipgre_net *ign,
293                 struct ip_tunnel_parm *parms)
294 {
295         __be32 remote = parms->iph.daddr;
296         __be32 local = parms->iph.saddr;
297         __be32 key = parms->i_key;
298         unsigned int h = HASH(key);
299         int prio = 0;
300
301         if (local)
302                 prio |= 1;
303         if (remote && !ipv4_is_multicast(remote)) {
304                 prio |= 2;
305                 h ^= HASH(remote);
306         }
307
308         return &ign->tunnels[prio][h];
309 }
310
311 static inline struct ip_tunnel __rcu **ipgre_bucket(struct ipgre_net *ign,
312                 struct ip_tunnel *t)
313 {
314         return __ipgre_bucket(ign, &t->parms);
315 }
316
317 static void ipgre_tunnel_link(struct ipgre_net *ign, struct ip_tunnel *t)
318 {
319         struct ip_tunnel __rcu **tp = ipgre_bucket(ign, t);
320
321         rcu_assign_pointer(t->next, rtnl_dereference(*tp));
322         rcu_assign_pointer(*tp, t);
323 }
324
325 static void ipgre_tunnel_unlink(struct ipgre_net *ign, struct ip_tunnel *t)
326 {
327         struct ip_tunnel __rcu **tp;
328         struct ip_tunnel *iter;
329
330         for (tp = ipgre_bucket(ign, t);
331              (iter = rtnl_dereference(*tp)) != NULL;
332              tp = &iter->next) {
333                 if (t == iter) {
334                         rcu_assign_pointer(*tp, t->next);
335                         break;
336                 }
337         }
338 }
339
340 static struct ip_tunnel *ipgre_tunnel_find(struct net *net,
341                                            struct ip_tunnel_parm *parms,
342                                            int type)
343 {
344         __be32 remote = parms->iph.daddr;
345         __be32 local = parms->iph.saddr;
346         __be32 key = parms->i_key;
347         int link = parms->link;
348         struct ip_tunnel *t;
349         struct ip_tunnel __rcu **tp;
350         struct ipgre_net *ign = net_generic(net, ipgre_net_id);
351
352         for (tp = __ipgre_bucket(ign, parms);
353              (t = rtnl_dereference(*tp)) != NULL;
354              tp = &t->next)
355                 if (local == t->parms.iph.saddr &&
356                     remote == t->parms.iph.daddr &&
357                     key == t->parms.i_key &&
358                     link == t->parms.link &&
359                     type == t->dev->type)
360                         break;
361
362         return t;
363 }
364
365 static struct ip_tunnel *ipgre_tunnel_locate(struct net *net,
366                 struct ip_tunnel_parm *parms, int create)
367 {
368         struct ip_tunnel *t, *nt;
369         struct net_device *dev;
370         char name[IFNAMSIZ];
371         struct ipgre_net *ign = net_generic(net, ipgre_net_id);
372
373         t = ipgre_tunnel_find(net, parms, ARPHRD_IPGRE);
374         if (t || !create)
375                 return t;
376
377         if (parms->name[0])
378                 strlcpy(name, parms->name, IFNAMSIZ);
379         else
380                 sprintf(name, "gre%%d");
381
382         dev = alloc_netdev(sizeof(*t), name, ipgre_tunnel_setup);
383         if (!dev)
384           return NULL;
385
386         dev_net_set(dev, net);
387
388         if (strchr(name, '%')) {
389                 if (dev_alloc_name(dev, name) < 0)
390                         goto failed_free;
391         }
392
393         nt = netdev_priv(dev);
394         nt->parms = *parms;
395         dev->rtnl_link_ops = &ipgre_link_ops;
396
397         dev->mtu = ipgre_tunnel_bind_dev(dev);
398
399         if (register_netdevice(dev) < 0)
400                 goto failed_free;
401
402         dev_hold(dev);
403         ipgre_tunnel_link(ign, nt);
404         return nt;
405
406 failed_free:
407         free_netdev(dev);
408         return NULL;
409 }
410
411 static void ipgre_tunnel_uninit(struct net_device *dev)
412 {
413         struct net *net = dev_net(dev);
414         struct ipgre_net *ign = net_generic(net, ipgre_net_id);
415
416         ipgre_tunnel_unlink(ign, netdev_priv(dev));
417         dev_put(dev);
418 }
419
420
421 static void ipgre_err(struct sk_buff *skb, u32 info)
422 {
423
424 /* All the routers (except for Linux) return only
425    8 bytes of packet payload. It means, that precise relaying of
426    ICMP in the real Internet is absolutely infeasible.
427
428    Moreover, Cisco "wise men" put GRE key to the third word
429    in GRE header. It makes impossible maintaining even soft state for keyed
430    GRE tunnels with enabled checksum. Tell them "thank you".
431
432    Well, I wonder, rfc1812 was written by Cisco employee,
433    what the hell these idiots break standrads established
434    by themself???
435  */
436
437         struct iphdr *iph = (struct iphdr *)skb->data;
438         __be16       *p = (__be16*)(skb->data+(iph->ihl<<2));
439         int grehlen = (iph->ihl<<2) + 4;
440         const int type = icmp_hdr(skb)->type;
441         const int code = icmp_hdr(skb)->code;
442         struct ip_tunnel *t;
443         __be16 flags;
444
445         flags = p[0];
446         if (flags&(GRE_CSUM|GRE_KEY|GRE_SEQ|GRE_ROUTING|GRE_VERSION)) {
447                 if (flags&(GRE_VERSION|GRE_ROUTING))
448                         return;
449                 if (flags&GRE_KEY) {
450                         grehlen += 4;
451                         if (flags&GRE_CSUM)
452                                 grehlen += 4;
453                 }
454         }
455
456         /* If only 8 bytes returned, keyed message will be dropped here */
457         if (skb_headlen(skb) < grehlen)
458                 return;
459
460         switch (type) {
461         default:
462         case ICMP_PARAMETERPROB:
463                 return;
464
465         case ICMP_DEST_UNREACH:
466                 switch (code) {
467                 case ICMP_SR_FAILED:
468                 case ICMP_PORT_UNREACH:
469                         /* Impossible event. */
470                         return;
471                 case ICMP_FRAG_NEEDED:
472                         /* Soft state for pmtu is maintained by IP core. */
473                         return;
474                 default:
475                         /* All others are translated to HOST_UNREACH.
476                            rfc2003 contains "deep thoughts" about NET_UNREACH,
477                            I believe they are just ether pollution. --ANK
478                          */
479                         break;
480                 }
481                 break;
482         case ICMP_TIME_EXCEEDED:
483                 if (code != ICMP_EXC_TTL)
484                         return;
485                 break;
486         }
487
488         rcu_read_lock();
489         t = ipgre_tunnel_lookup(skb->dev, iph->daddr, iph->saddr,
490                                 flags & GRE_KEY ?
491                                 *(((__be32 *)p) + (grehlen / 4) - 1) : 0,
492                                 p[1]);
493         if (t == NULL || t->parms.iph.daddr == 0 ||
494             ipv4_is_multicast(t->parms.iph.daddr))
495                 goto out;
496
497         if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED)
498                 goto out;
499
500         if (time_before(jiffies, t->err_time + IPTUNNEL_ERR_TIMEO))
501                 t->err_count++;
502         else
503                 t->err_count = 1;
504         t->err_time = jiffies;
505 out:
506         rcu_read_unlock();
507 }
508
509 static inline void ipgre_ecn_decapsulate(struct iphdr *iph, struct sk_buff *skb)
510 {
511         if (INET_ECN_is_ce(iph->tos)) {
512                 if (skb->protocol == htons(ETH_P_IP)) {
513                         IP_ECN_set_ce(ip_hdr(skb));
514                 } else if (skb->protocol == htons(ETH_P_IPV6)) {
515                         IP6_ECN_set_ce(ipv6_hdr(skb));
516                 }
517         }
518 }
519
520 static inline u8
521 ipgre_ecn_encapsulate(u8 tos, struct iphdr *old_iph, struct sk_buff *skb)
522 {
523         u8 inner = 0;
524         if (skb->protocol == htons(ETH_P_IP))
525                 inner = old_iph->tos;
526         else if (skb->protocol == htons(ETH_P_IPV6))
527                 inner = ipv6_get_dsfield((struct ipv6hdr *)old_iph);
528         return INET_ECN_encapsulate(tos, inner);
529 }
530
531 static int ipgre_rcv(struct sk_buff *skb)
532 {
533         struct iphdr *iph;
534         u8     *h;
535         __be16    flags;
536         __sum16   csum = 0;
537         __be32 key = 0;
538         u32    seqno = 0;
539         struct ip_tunnel *tunnel;
540         int    offset = 4;
541         __be16 gre_proto;
542
543         if (!pskb_may_pull(skb, 16))
544                 goto drop_nolock;
545
546         iph = ip_hdr(skb);
547         h = skb->data;
548         flags = *(__be16*)h;
549
550         if (flags&(GRE_CSUM|GRE_KEY|GRE_ROUTING|GRE_SEQ|GRE_VERSION)) {
551                 /* - Version must be 0.
552                    - We do not support routing headers.
553                  */
554                 if (flags&(GRE_VERSION|GRE_ROUTING))
555                         goto drop_nolock;
556
557                 if (flags&GRE_CSUM) {
558                         switch (skb->ip_summed) {
559                         case CHECKSUM_COMPLETE:
560                                 csum = csum_fold(skb->csum);
561                                 if (!csum)
562                                         break;
563                                 /* fall through */
564                         case CHECKSUM_NONE:
565                                 skb->csum = 0;
566                                 csum = __skb_checksum_complete(skb);
567                                 skb->ip_summed = CHECKSUM_COMPLETE;
568                         }
569                         offset += 4;
570                 }
571                 if (flags&GRE_KEY) {
572                         key = *(__be32*)(h + offset);
573                         offset += 4;
574                 }
575                 if (flags&GRE_SEQ) {
576                         seqno = ntohl(*(__be32*)(h + offset));
577                         offset += 4;
578                 }
579         }
580
581         gre_proto = *(__be16 *)(h + 2);
582
583         rcu_read_lock();
584         if ((tunnel = ipgre_tunnel_lookup(skb->dev,
585                                           iph->saddr, iph->daddr, key,
586                                           gre_proto))) {
587                 struct net_device_stats *stats = &tunnel->dev->stats;
588
589                 secpath_reset(skb);
590
591                 skb->protocol = gre_proto;
592                 /* WCCP version 1 and 2 protocol decoding.
593                  * - Change protocol to IP
594                  * - When dealing with WCCPv2, Skip extra 4 bytes in GRE header
595                  */
596                 if (flags == 0 && gre_proto == htons(ETH_P_WCCP)) {
597                         skb->protocol = htons(ETH_P_IP);
598                         if ((*(h + offset) & 0xF0) != 0x40)
599                                 offset += 4;
600                 }
601
602                 skb->mac_header = skb->network_header;
603                 __pskb_pull(skb, offset);
604                 skb_postpull_rcsum(skb, skb_transport_header(skb), offset);
605                 skb->pkt_type = PACKET_HOST;
606 #ifdef CONFIG_NET_IPGRE_BROADCAST
607                 if (ipv4_is_multicast(iph->daddr)) {
608                         /* Looped back packet, drop it! */
609                         if (skb_rtable(skb)->fl.iif == 0)
610                                 goto drop;
611                         stats->multicast++;
612                         skb->pkt_type = PACKET_BROADCAST;
613                 }
614 #endif
615
616                 if (((flags&GRE_CSUM) && csum) ||
617                     (!(flags&GRE_CSUM) && tunnel->parms.i_flags&GRE_CSUM)) {
618                         stats->rx_crc_errors++;
619                         stats->rx_errors++;
620                         goto drop;
621                 }
622                 if (tunnel->parms.i_flags&GRE_SEQ) {
623                         if (!(flags&GRE_SEQ) ||
624                             (tunnel->i_seqno && (s32)(seqno - tunnel->i_seqno) < 0)) {
625                                 stats->rx_fifo_errors++;
626                                 stats->rx_errors++;
627                                 goto drop;
628                         }
629                         tunnel->i_seqno = seqno + 1;
630                 }
631
632                 /* Warning: All skb pointers will be invalidated! */
633                 if (tunnel->dev->type == ARPHRD_ETHER) {
634                         if (!pskb_may_pull(skb, ETH_HLEN)) {
635                                 stats->rx_length_errors++;
636                                 stats->rx_errors++;
637                                 goto drop;
638                         }
639
640                         iph = ip_hdr(skb);
641                         skb->protocol = eth_type_trans(skb, tunnel->dev);
642                         skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
643                 }
644
645                 skb_tunnel_rx(skb, tunnel->dev);
646
647                 skb_reset_network_header(skb);
648                 ipgre_ecn_decapsulate(iph, skb);
649
650                 if (netif_rx(skb) == NET_RX_DROP)
651                         stats->rx_dropped++;
652
653                 rcu_read_unlock();
654                 return 0;
655         }
656         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
657
658 drop:
659         rcu_read_unlock();
660 drop_nolock:
661         kfree_skb(skb);
662         return(0);
663 }
664
665 static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
666 {
667         struct ip_tunnel *tunnel = netdev_priv(dev);
668         struct net_device_stats *stats = &dev->stats;
669         struct netdev_queue *txq = netdev_get_tx_queue(dev, 0);
670         struct iphdr  *old_iph = ip_hdr(skb);
671         struct iphdr  *tiph;
672         u8     tos;
673         __be16 df;
674         struct rtable *rt;                      /* Route to the other host */
675         struct net_device *tdev;                /* Device to other host */
676         struct iphdr  *iph;                     /* Our new IP header */
677         unsigned int max_headroom;              /* The extra header space needed */
678         int    gre_hlen;
679         __be32 dst;
680         int    mtu;
681
682         if (dev->type == ARPHRD_ETHER)
683                 IPCB(skb)->flags = 0;
684
685         if (dev->header_ops && dev->type == ARPHRD_IPGRE) {
686                 gre_hlen = 0;
687                 tiph = (struct iphdr *)skb->data;
688         } else {
689                 gre_hlen = tunnel->hlen;
690                 tiph = &tunnel->parms.iph;
691         }
692
693         if ((dst = tiph->daddr) == 0) {
694                 /* NBMA tunnel */
695
696                 if (skb_dst(skb) == NULL) {
697                         stats->tx_fifo_errors++;
698                         goto tx_error;
699                 }
700
701                 if (skb->protocol == htons(ETH_P_IP)) {
702                         rt = skb_rtable(skb);
703                         if ((dst = rt->rt_gateway) == 0)
704                                 goto tx_error_icmp;
705                 }
706 #ifdef CONFIG_IPV6
707                 else if (skb->protocol == htons(ETH_P_IPV6)) {
708                         struct in6_addr *addr6;
709                         int addr_type;
710                         struct neighbour *neigh = skb_dst(skb)->neighbour;
711
712                         if (neigh == NULL)
713                                 goto tx_error;
714
715                         addr6 = (struct in6_addr *)&neigh->primary_key;
716                         addr_type = ipv6_addr_type(addr6);
717
718                         if (addr_type == IPV6_ADDR_ANY) {
719                                 addr6 = &ipv6_hdr(skb)->daddr;
720                                 addr_type = ipv6_addr_type(addr6);
721                         }
722
723                         if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
724                                 goto tx_error_icmp;
725
726                         dst = addr6->s6_addr32[3];
727                 }
728 #endif
729                 else
730                         goto tx_error;
731         }
732
733         tos = tiph->tos;
734         if (tos == 1) {
735                 tos = 0;
736                 if (skb->protocol == htons(ETH_P_IP))
737                         tos = old_iph->tos;
738                 else if (skb->protocol == htons(ETH_P_IPV6))
739                         tos = ipv6_get_dsfield((struct ipv6hdr *)old_iph);
740         }
741
742         {
743                 struct flowi fl = { .oif = tunnel->parms.link,
744                                     .nl_u = { .ip4_u =
745                                               { .daddr = dst,
746                                                 .saddr = tiph->saddr,
747                                                 .tos = RT_TOS(tos) } },
748                                     .proto = IPPROTO_GRE };
749                 if (ip_route_output_key(dev_net(dev), &rt, &fl)) {
750                         stats->tx_carrier_errors++;
751                         goto tx_error;
752                 }
753         }
754         tdev = rt->dst.dev;
755
756         if (tdev == dev) {
757                 ip_rt_put(rt);
758                 stats->collisions++;
759                 goto tx_error;
760         }
761
762         df = tiph->frag_off;
763         if (df)
764                 mtu = dst_mtu(&rt->dst) - dev->hard_header_len - tunnel->hlen;
765         else
766                 mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;
767
768         if (skb_dst(skb))
769                 skb_dst(skb)->ops->update_pmtu(skb_dst(skb), mtu);
770
771         if (skb->protocol == htons(ETH_P_IP)) {
772                 df |= (old_iph->frag_off&htons(IP_DF));
773
774                 if ((old_iph->frag_off&htons(IP_DF)) &&
775                     mtu < ntohs(old_iph->tot_len)) {
776                         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
777                         ip_rt_put(rt);
778                         goto tx_error;
779                 }
780         }
781 #ifdef CONFIG_IPV6
782         else if (skb->protocol == htons(ETH_P_IPV6)) {
783                 struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb);
784
785                 if (rt6 && mtu < dst_mtu(skb_dst(skb)) && mtu >= IPV6_MIN_MTU) {
786                         if ((tunnel->parms.iph.daddr &&
787                              !ipv4_is_multicast(tunnel->parms.iph.daddr)) ||
788                             rt6->rt6i_dst.plen == 128) {
789                                 rt6->rt6i_flags |= RTF_MODIFIED;
790                                 skb_dst(skb)->metrics[RTAX_MTU-1] = mtu;
791                         }
792                 }
793
794                 if (mtu >= IPV6_MIN_MTU && mtu < skb->len - tunnel->hlen + gre_hlen) {
795                         icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
796                         ip_rt_put(rt);
797                         goto tx_error;
798                 }
799         }
800 #endif
801
802         if (tunnel->err_count > 0) {
803                 if (time_before(jiffies,
804                                 tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
805                         tunnel->err_count--;
806
807                         dst_link_failure(skb);
808                 } else
809                         tunnel->err_count = 0;
810         }
811
812         max_headroom = LL_RESERVED_SPACE(tdev) + gre_hlen + rt->dst.header_len;
813
814         if (skb_headroom(skb) < max_headroom || skb_shared(skb)||
815             (skb_cloned(skb) && !skb_clone_writable(skb, 0))) {
816                 struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom);
817                 if (max_headroom > dev->needed_headroom)
818                         dev->needed_headroom = max_headroom;
819                 if (!new_skb) {
820                         ip_rt_put(rt);
821                         txq->tx_dropped++;
822                         dev_kfree_skb(skb);
823                         return NETDEV_TX_OK;
824                 }
825                 if (skb->sk)
826                         skb_set_owner_w(new_skb, skb->sk);
827                 dev_kfree_skb(skb);
828                 skb = new_skb;
829                 old_iph = ip_hdr(skb);
830         }
831
832         skb_reset_transport_header(skb);
833         skb_push(skb, gre_hlen);
834         skb_reset_network_header(skb);
835         memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
836         IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED |
837                               IPSKB_REROUTED);
838         skb_dst_drop(skb);
839         skb_dst_set(skb, &rt->dst);
840
841         /*
842          *      Push down and install the IPIP header.
843          */
844
845         iph                     =       ip_hdr(skb);
846         iph->version            =       4;
847         iph->ihl                =       sizeof(struct iphdr) >> 2;
848         iph->frag_off           =       df;
849         iph->protocol           =       IPPROTO_GRE;
850         iph->tos                =       ipgre_ecn_encapsulate(tos, old_iph, skb);
851         iph->daddr              =       rt->rt_dst;
852         iph->saddr              =       rt->rt_src;
853
854         if ((iph->ttl = tiph->ttl) == 0) {
855                 if (skb->protocol == htons(ETH_P_IP))
856                         iph->ttl = old_iph->ttl;
857 #ifdef CONFIG_IPV6
858                 else if (skb->protocol == htons(ETH_P_IPV6))
859                         iph->ttl = ((struct ipv6hdr *)old_iph)->hop_limit;
860 #endif
861                 else
862                         iph->ttl = dst_metric(&rt->dst, RTAX_HOPLIMIT);
863         }
864
865         ((__be16 *)(iph + 1))[0] = tunnel->parms.o_flags;
866         ((__be16 *)(iph + 1))[1] = (dev->type == ARPHRD_ETHER) ?
867                                    htons(ETH_P_TEB) : skb->protocol;
868
869         if (tunnel->parms.o_flags&(GRE_KEY|GRE_CSUM|GRE_SEQ)) {
870                 __be32 *ptr = (__be32*)(((u8*)iph) + tunnel->hlen - 4);
871
872                 if (tunnel->parms.o_flags&GRE_SEQ) {
873                         ++tunnel->o_seqno;
874                         *ptr = htonl(tunnel->o_seqno);
875                         ptr--;
876                 }
877                 if (tunnel->parms.o_flags&GRE_KEY) {
878                         *ptr = tunnel->parms.o_key;
879                         ptr--;
880                 }
881                 if (tunnel->parms.o_flags&GRE_CSUM) {
882                         *ptr = 0;
883                         *(__sum16*)ptr = ip_compute_csum((void*)(iph+1), skb->len - sizeof(struct iphdr));
884                 }
885         }
886
887         nf_reset(skb);
888
889         IPTUNNEL_XMIT();
890         return NETDEV_TX_OK;
891
892 tx_error_icmp:
893         dst_link_failure(skb);
894
895 tx_error:
896         stats->tx_errors++;
897         dev_kfree_skb(skb);
898         return NETDEV_TX_OK;
899 }
900
901 static int ipgre_tunnel_bind_dev(struct net_device *dev)
902 {
903         struct net_device *tdev = NULL;
904         struct ip_tunnel *tunnel;
905         struct iphdr *iph;
906         int hlen = LL_MAX_HEADER;
907         int mtu = ETH_DATA_LEN;
908         int addend = sizeof(struct iphdr) + 4;
909
910         tunnel = netdev_priv(dev);
911         iph = &tunnel->parms.iph;
912
913         /* Guess output device to choose reasonable mtu and needed_headroom */
914
915         if (iph->daddr) {
916                 struct flowi fl = { .oif = tunnel->parms.link,
917                                     .nl_u = { .ip4_u =
918                                               { .daddr = iph->daddr,
919                                                 .saddr = iph->saddr,
920                                                 .tos = RT_TOS(iph->tos) } },
921                                     .proto = IPPROTO_GRE };
922                 struct rtable *rt;
923                 if (!ip_route_output_key(dev_net(dev), &rt, &fl)) {
924                         tdev = rt->dst.dev;
925                         ip_rt_put(rt);
926                 }
927
928                 if (dev->type != ARPHRD_ETHER)
929                         dev->flags |= IFF_POINTOPOINT;
930         }
931
932         if (!tdev && tunnel->parms.link)
933                 tdev = __dev_get_by_index(dev_net(dev), tunnel->parms.link);
934
935         if (tdev) {
936                 hlen = tdev->hard_header_len + tdev->needed_headroom;
937                 mtu = tdev->mtu;
938         }
939         dev->iflink = tunnel->parms.link;
940
941         /* Precalculate GRE options length */
942         if (tunnel->parms.o_flags&(GRE_CSUM|GRE_KEY|GRE_SEQ)) {
943                 if (tunnel->parms.o_flags&GRE_CSUM)
944                         addend += 4;
945                 if (tunnel->parms.o_flags&GRE_KEY)
946                         addend += 4;
947                 if (tunnel->parms.o_flags&GRE_SEQ)
948                         addend += 4;
949         }
950         dev->needed_headroom = addend + hlen;
951         mtu -= dev->hard_header_len + addend;
952
953         if (mtu < 68)
954                 mtu = 68;
955
956         tunnel->hlen = addend;
957
958         return mtu;
959 }
960
961 static int
962 ipgre_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd)
963 {
964         int err = 0;
965         struct ip_tunnel_parm p;
966         struct ip_tunnel *t;
967         struct net *net = dev_net(dev);
968         struct ipgre_net *ign = net_generic(net, ipgre_net_id);
969
970         switch (cmd) {
971         case SIOCGETTUNNEL:
972                 t = NULL;
973                 if (dev == ign->fb_tunnel_dev) {
974                         if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) {
975                                 err = -EFAULT;
976                                 break;
977                         }
978                         t = ipgre_tunnel_locate(net, &p, 0);
979                 }
980                 if (t == NULL)
981                         t = netdev_priv(dev);
982                 memcpy(&p, &t->parms, sizeof(p));
983                 if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
984                         err = -EFAULT;
985                 break;
986
987         case SIOCADDTUNNEL:
988         case SIOCCHGTUNNEL:
989                 err = -EPERM;
990                 if (!capable(CAP_NET_ADMIN))
991                         goto done;
992
993                 err = -EFAULT;
994                 if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
995                         goto done;
996
997                 err = -EINVAL;
998                 if (p.iph.version != 4 || p.iph.protocol != IPPROTO_GRE ||
999                     p.iph.ihl != 5 || (p.iph.frag_off&htons(~IP_DF)) ||
1000                     ((p.i_flags|p.o_flags)&(GRE_VERSION|GRE_ROUTING)))
1001                         goto done;
1002                 if (p.iph.ttl)
1003                         p.iph.frag_off |= htons(IP_DF);
1004
1005                 if (!(p.i_flags&GRE_KEY))
1006                         p.i_key = 0;
1007                 if (!(p.o_flags&GRE_KEY))
1008                         p.o_key = 0;
1009
1010                 t = ipgre_tunnel_locate(net, &p, cmd == SIOCADDTUNNEL);
1011
1012                 if (dev != ign->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
1013                         if (t != NULL) {
1014                                 if (t->dev != dev) {
1015                                         err = -EEXIST;
1016                                         break;
1017                                 }
1018                         } else {
1019                                 unsigned int nflags = 0;
1020
1021                                 t = netdev_priv(dev);
1022
1023                                 if (ipv4_is_multicast(p.iph.daddr))
1024                                         nflags = IFF_BROADCAST;
1025                                 else if (p.iph.daddr)
1026                                         nflags = IFF_POINTOPOINT;
1027
1028                                 if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
1029                                         err = -EINVAL;
1030                                         break;
1031                                 }
1032                                 ipgre_tunnel_unlink(ign, t);
1033                                 t->parms.iph.saddr = p.iph.saddr;
1034                                 t->parms.iph.daddr = p.iph.daddr;
1035                                 t->parms.i_key = p.i_key;
1036                                 t->parms.o_key = p.o_key;
1037                                 memcpy(dev->dev_addr, &p.iph.saddr, 4);
1038                                 memcpy(dev->broadcast, &p.iph.daddr, 4);
1039                                 ipgre_tunnel_link(ign, t);
1040                                 netdev_state_change(dev);
1041                         }
1042                 }
1043
1044                 if (t) {
1045                         err = 0;
1046                         if (cmd == SIOCCHGTUNNEL) {
1047                                 t->parms.iph.ttl = p.iph.ttl;
1048                                 t->parms.iph.tos = p.iph.tos;
1049                                 t->parms.iph.frag_off = p.iph.frag_off;
1050                                 if (t->parms.link != p.link) {
1051                                         t->parms.link = p.link;
1052                                         dev->mtu = ipgre_tunnel_bind_dev(dev);
1053                                         netdev_state_change(dev);
1054                                 }
1055                         }
1056                         if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms, sizeof(p)))
1057                                 err = -EFAULT;
1058                 } else
1059                         err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT);
1060                 break;
1061
1062         case SIOCDELTUNNEL:
1063                 err = -EPERM;
1064                 if (!capable(CAP_NET_ADMIN))
1065                         goto done;
1066
1067                 if (dev == ign->fb_tunnel_dev) {
1068                         err = -EFAULT;
1069                         if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
1070                                 goto done;
1071                         err = -ENOENT;
1072                         if ((t = ipgre_tunnel_locate(net, &p, 0)) == NULL)
1073                                 goto done;
1074                         err = -EPERM;
1075                         if (t == netdev_priv(ign->fb_tunnel_dev))
1076                                 goto done;
1077                         dev = t->dev;
1078                 }
1079                 unregister_netdevice(dev);
1080                 err = 0;
1081                 break;
1082
1083         default:
1084                 err = -EINVAL;
1085         }
1086
1087 done:
1088         return err;
1089 }
1090
1091 static int ipgre_tunnel_change_mtu(struct net_device *dev, int new_mtu)
1092 {
1093         struct ip_tunnel *tunnel = netdev_priv(dev);
1094         if (new_mtu < 68 ||
1095             new_mtu > 0xFFF8 - dev->hard_header_len - tunnel->hlen)
1096                 return -EINVAL;
1097         dev->mtu = new_mtu;
1098         return 0;
1099 }
1100
/* Nice toy. Unfortunately, useless in real life :-)
   It allows one to construct a virtual multiprotocol broadcast "LAN"
   over the Internet, provided multicast routing is tuned.


   I have no idea whether this bicycle was invented before me,
   so I had to set ARPHRD_IPGRE to a random value.
   I have the impression that Cisco could make something similar,
   but this feature is apparently missing in IOS<=11.2(8).
1110
1111    I set up 10.66.66/24 and fec0:6666:6666::0/96 as virtual networks
1112    with broadcast 224.66.66.66. If you have access to mbone, play with me :-)
1113
1114    ping -t 255 224.66.66.66
1115
1116    If nobody answers, mbone does not work.
1117
1118    ip tunnel add Universe mode gre remote 224.66.66.66 local <Your_real_addr> ttl 255
1119    ip addr add 10.66.66.<somewhat>/24 dev Universe
1120    ifconfig Universe up
1121    ifconfig Universe add fe80::<Your_real_addr>/10
1122    ifconfig Universe add fec0:6666:6666::<Your_real_addr>/96
1123    ftp 10.66.66.66
1124    ...
1125    ftp fec0:6666:6666::193.233.7.65
1126    ...
1127
1128  */
1129
1130 static int ipgre_header(struct sk_buff *skb, struct net_device *dev,
1131                         unsigned short type,
1132                         const void *daddr, const void *saddr, unsigned int len)
1133 {
1134         struct ip_tunnel *t = netdev_priv(dev);
1135         struct iphdr *iph = (struct iphdr *)skb_push(skb, t->hlen);
1136         __be16 *p = (__be16*)(iph+1);
1137
1138         memcpy(iph, &t->parms.iph, sizeof(struct iphdr));
1139         p[0]            = t->parms.o_flags;
1140         p[1]            = htons(type);
1141
1142         /*
1143          *      Set the source hardware address.
1144          */
1145
1146         if (saddr)
1147                 memcpy(&iph->saddr, saddr, 4);
1148         if (daddr)
1149                 memcpy(&iph->daddr, daddr, 4);
1150         if (iph->daddr)
1151                 return t->hlen;
1152
1153         return -t->hlen;
1154 }
1155
1156 static int ipgre_header_parse(const struct sk_buff *skb, unsigned char *haddr)
1157 {
1158         struct iphdr *iph = (struct iphdr *) skb_mac_header(skb);
1159         memcpy(haddr, &iph->saddr, 4);
1160         return 4;
1161 }
1162
1163 static const struct header_ops ipgre_header_ops = {
1164         .create = ipgre_header,
1165         .parse  = ipgre_header_parse,
1166 };
1167
1168 #ifdef CONFIG_NET_IPGRE_BROADCAST
1169 static int ipgre_open(struct net_device *dev)
1170 {
1171         struct ip_tunnel *t = netdev_priv(dev);
1172
1173         if (ipv4_is_multicast(t->parms.iph.daddr)) {
1174                 struct flowi fl = { .oif = t->parms.link,
1175                                     .nl_u = { .ip4_u =
1176                                               { .daddr = t->parms.iph.daddr,
1177                                                 .saddr = t->parms.iph.saddr,
1178                                                 .tos = RT_TOS(t->parms.iph.tos) } },
1179                                     .proto = IPPROTO_GRE };
1180                 struct rtable *rt;
1181                 if (ip_route_output_key(dev_net(dev), &rt, &fl))
1182                         return -EADDRNOTAVAIL;
1183                 dev = rt->dst.dev;
1184                 ip_rt_put(rt);
1185                 if (__in_dev_get_rtnl(dev) == NULL)
1186                         return -EADDRNOTAVAIL;
1187                 t->mlink = dev->ifindex;
1188                 ip_mc_inc_group(__in_dev_get_rtnl(dev), t->parms.iph.daddr);
1189         }
1190         return 0;
1191 }
1192
1193 static int ipgre_close(struct net_device *dev)
1194 {
1195         struct ip_tunnel *t = netdev_priv(dev);
1196
1197         if (ipv4_is_multicast(t->parms.iph.daddr) && t->mlink) {
1198                 struct in_device *in_dev;
1199                 in_dev = inetdev_by_index(dev_net(dev), t->mlink);
1200                 if (in_dev) {
1201                         ip_mc_dec_group(in_dev, t->parms.iph.daddr);
1202                         in_dev_put(in_dev);
1203                 }
1204         }
1205         return 0;
1206 }
1207
1208 #endif
1209
1210 static const struct net_device_ops ipgre_netdev_ops = {
1211         .ndo_init               = ipgre_tunnel_init,
1212         .ndo_uninit             = ipgre_tunnel_uninit,
1213 #ifdef CONFIG_NET_IPGRE_BROADCAST
1214         .ndo_open               = ipgre_open,
1215         .ndo_stop               = ipgre_close,
1216 #endif
1217         .ndo_start_xmit         = ipgre_tunnel_xmit,
1218         .ndo_do_ioctl           = ipgre_tunnel_ioctl,
1219         .ndo_change_mtu         = ipgre_tunnel_change_mtu,
1220 };
1221
1222 static void ipgre_tunnel_setup(struct net_device *dev)
1223 {
1224         dev->netdev_ops         = &ipgre_netdev_ops;
1225         dev->destructor         = free_netdev;
1226
1227         dev->type               = ARPHRD_IPGRE;
1228         dev->needed_headroom    = LL_MAX_HEADER + sizeof(struct iphdr) + 4;
1229         dev->mtu                = ETH_DATA_LEN - sizeof(struct iphdr) - 4;
1230         dev->flags              = IFF_NOARP;
1231         dev->iflink             = 0;
1232         dev->addr_len           = 4;
1233         dev->features           |= NETIF_F_NETNS_LOCAL;
1234         dev->priv_flags         &= ~IFF_XMIT_DST_RELEASE;
1235 }
1236
1237 static int ipgre_tunnel_init(struct net_device *dev)
1238 {
1239         struct ip_tunnel *tunnel;
1240         struct iphdr *iph;
1241
1242         tunnel = netdev_priv(dev);
1243         iph = &tunnel->parms.iph;
1244
1245         tunnel->dev = dev;
1246         strcpy(tunnel->parms.name, dev->name);
1247
1248         memcpy(dev->dev_addr, &tunnel->parms.iph.saddr, 4);
1249         memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4);
1250
1251         if (iph->daddr) {
1252 #ifdef CONFIG_NET_IPGRE_BROADCAST
1253                 if (ipv4_is_multicast(iph->daddr)) {
1254                         if (!iph->saddr)
1255                                 return -EINVAL;
1256                         dev->flags = IFF_BROADCAST;
1257                         dev->header_ops = &ipgre_header_ops;
1258                 }
1259 #endif
1260         } else
1261                 dev->header_ops = &ipgre_header_ops;
1262
1263         return 0;
1264 }
1265
1266 static void ipgre_fb_tunnel_init(struct net_device *dev)
1267 {
1268         struct ip_tunnel *tunnel = netdev_priv(dev);
1269         struct iphdr *iph = &tunnel->parms.iph;
1270         struct ipgre_net *ign = net_generic(dev_net(dev), ipgre_net_id);
1271
1272         tunnel->dev = dev;
1273         strcpy(tunnel->parms.name, dev->name);
1274
1275         iph->version            = 4;
1276         iph->protocol           = IPPROTO_GRE;
1277         iph->ihl                = 5;
1278         tunnel->hlen            = sizeof(struct iphdr) + 4;
1279
1280         dev_hold(dev);
1281         rcu_assign_pointer(ign->tunnels_wc[0], tunnel);
1282 }
1283
1284
1285 static const struct gre_protocol ipgre_protocol = {
1286         .handler     = ipgre_rcv,
1287         .err_handler = ipgre_err,
1288 };
1289
1290 static void ipgre_destroy_tunnels(struct ipgre_net *ign, struct list_head *head)
1291 {
1292         int prio;
1293
1294         for (prio = 0; prio < 4; prio++) {
1295                 int h;
1296                 for (h = 0; h < HASH_SIZE; h++) {
1297                         struct ip_tunnel *t;
1298
1299                         t = rtnl_dereference(ign->tunnels[prio][h]);
1300
1301                         while (t != NULL) {
1302                                 unregister_netdevice_queue(t->dev, head);
1303                                 t = rtnl_dereference(t->next);
1304                         }
1305                 }
1306         }
1307 }
1308
1309 static int __net_init ipgre_init_net(struct net *net)
1310 {
1311         struct ipgre_net *ign = net_generic(net, ipgre_net_id);
1312         int err;
1313
1314         ign->fb_tunnel_dev = alloc_netdev(sizeof(struct ip_tunnel), "gre0",
1315                                            ipgre_tunnel_setup);
1316         if (!ign->fb_tunnel_dev) {
1317                 err = -ENOMEM;
1318                 goto err_alloc_dev;
1319         }
1320         dev_net_set(ign->fb_tunnel_dev, net);
1321
1322         ipgre_fb_tunnel_init(ign->fb_tunnel_dev);
1323         ign->fb_tunnel_dev->rtnl_link_ops = &ipgre_link_ops;
1324
1325         if ((err = register_netdev(ign->fb_tunnel_dev)))
1326                 goto err_reg_dev;
1327
1328         return 0;
1329
1330 err_reg_dev:
1331         free_netdev(ign->fb_tunnel_dev);
1332 err_alloc_dev:
1333         return err;
1334 }
1335
1336 static void __net_exit ipgre_exit_net(struct net *net)
1337 {
1338         struct ipgre_net *ign;
1339         LIST_HEAD(list);
1340
1341         ign = net_generic(net, ipgre_net_id);
1342         rtnl_lock();
1343         ipgre_destroy_tunnels(ign, &list);
1344         unregister_netdevice_many(&list);
1345         rtnl_unlock();
1346 }
1347
1348 static struct pernet_operations ipgre_net_ops = {
1349         .init = ipgre_init_net,
1350         .exit = ipgre_exit_net,
1351         .id   = &ipgre_net_id,
1352         .size = sizeof(struct ipgre_net),
1353 };
1354
1355 static int ipgre_tunnel_validate(struct nlattr *tb[], struct nlattr *data[])
1356 {
1357         __be16 flags;
1358
1359         if (!data)
1360                 return 0;
1361
1362         flags = 0;
1363         if (data[IFLA_GRE_IFLAGS])
1364                 flags |= nla_get_be16(data[IFLA_GRE_IFLAGS]);
1365         if (data[IFLA_GRE_OFLAGS])
1366                 flags |= nla_get_be16(data[IFLA_GRE_OFLAGS]);
1367         if (flags & (GRE_VERSION|GRE_ROUTING))
1368                 return -EINVAL;
1369
1370         return 0;
1371 }
1372
1373 static int ipgre_tap_validate(struct nlattr *tb[], struct nlattr *data[])
1374 {
1375         __be32 daddr;
1376
1377         if (tb[IFLA_ADDRESS]) {
1378                 if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN)
1379                         return -EINVAL;
1380                 if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS])))
1381                         return -EADDRNOTAVAIL;
1382         }
1383
1384         if (!data)
1385                 goto out;
1386
1387         if (data[IFLA_GRE_REMOTE]) {
1388                 memcpy(&daddr, nla_data(data[IFLA_GRE_REMOTE]), 4);
1389                 if (!daddr)
1390                         return -EINVAL;
1391         }
1392
1393 out:
1394         return ipgre_tunnel_validate(tb, data);
1395 }
1396
1397 static void ipgre_netlink_parms(struct nlattr *data[],
1398                                 struct ip_tunnel_parm *parms)
1399 {
1400         memset(parms, 0, sizeof(*parms));
1401
1402         parms->iph.protocol = IPPROTO_GRE;
1403
1404         if (!data)
1405                 return;
1406
1407         if (data[IFLA_GRE_LINK])
1408                 parms->link = nla_get_u32(data[IFLA_GRE_LINK]);
1409
1410         if (data[IFLA_GRE_IFLAGS])
1411                 parms->i_flags = nla_get_be16(data[IFLA_GRE_IFLAGS]);
1412
1413         if (data[IFLA_GRE_OFLAGS])
1414                 parms->o_flags = nla_get_be16(data[IFLA_GRE_OFLAGS]);
1415
1416         if (data[IFLA_GRE_IKEY])
1417                 parms->i_key = nla_get_be32(data[IFLA_GRE_IKEY]);
1418
1419         if (data[IFLA_GRE_OKEY])
1420                 parms->o_key = nla_get_be32(data[IFLA_GRE_OKEY]);
1421
1422         if (data[IFLA_GRE_LOCAL])
1423                 parms->iph.saddr = nla_get_be32(data[IFLA_GRE_LOCAL]);
1424
1425         if (data[IFLA_GRE_REMOTE])
1426                 parms->iph.daddr = nla_get_be32(data[IFLA_GRE_REMOTE]);
1427
1428         if (data[IFLA_GRE_TTL])
1429                 parms->iph.ttl = nla_get_u8(data[IFLA_GRE_TTL]);
1430
1431         if (data[IFLA_GRE_TOS])
1432                 parms->iph.tos = nla_get_u8(data[IFLA_GRE_TOS]);
1433
1434         if (!data[IFLA_GRE_PMTUDISC] || nla_get_u8(data[IFLA_GRE_PMTUDISC]))
1435                 parms->iph.frag_off = htons(IP_DF);
1436 }
1437
1438 static int ipgre_tap_init(struct net_device *dev)
1439 {
1440         struct ip_tunnel *tunnel;
1441
1442         tunnel = netdev_priv(dev);
1443
1444         tunnel->dev = dev;
1445         strcpy(tunnel->parms.name, dev->name);
1446
1447         ipgre_tunnel_bind_dev(dev);
1448
1449         return 0;
1450 }
1451
1452 static const struct net_device_ops ipgre_tap_netdev_ops = {
1453         .ndo_init               = ipgre_tap_init,
1454         .ndo_uninit             = ipgre_tunnel_uninit,
1455         .ndo_start_xmit         = ipgre_tunnel_xmit,
1456         .ndo_set_mac_address    = eth_mac_addr,
1457         .ndo_validate_addr      = eth_validate_addr,
1458         .ndo_change_mtu         = ipgre_tunnel_change_mtu,
1459 };
1460
1461 static void ipgre_tap_setup(struct net_device *dev)
1462 {
1463
1464         ether_setup(dev);
1465
1466         dev->netdev_ops         = &ipgre_tap_netdev_ops;
1467         dev->destructor         = free_netdev;
1468
1469         dev->iflink             = 0;
1470         dev->features           |= NETIF_F_NETNS_LOCAL;
1471 }
1472
1473 static int ipgre_newlink(struct net *src_net, struct net_device *dev, struct nlattr *tb[],
1474                          struct nlattr *data[])
1475 {
1476         struct ip_tunnel *nt;
1477         struct net *net = dev_net(dev);
1478         struct ipgre_net *ign = net_generic(net, ipgre_net_id);
1479         int mtu;
1480         int err;
1481
1482         nt = netdev_priv(dev);
1483         ipgre_netlink_parms(data, &nt->parms);
1484
1485         if (ipgre_tunnel_find(net, &nt->parms, dev->type))
1486                 return -EEXIST;
1487
1488         if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
1489                 random_ether_addr(dev->dev_addr);
1490
1491         mtu = ipgre_tunnel_bind_dev(dev);
1492         if (!tb[IFLA_MTU])
1493                 dev->mtu = mtu;
1494
1495         err = register_netdevice(dev);
1496         if (err)
1497                 goto out;
1498
1499         dev_hold(dev);
1500         ipgre_tunnel_link(ign, nt);
1501
1502 out:
1503         return err;
1504 }
1505
1506 static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[],
1507                             struct nlattr *data[])
1508 {
1509         struct ip_tunnel *t, *nt;
1510         struct net *net = dev_net(dev);
1511         struct ipgre_net *ign = net_generic(net, ipgre_net_id);
1512         struct ip_tunnel_parm p;
1513         int mtu;
1514
1515         if (dev == ign->fb_tunnel_dev)
1516                 return -EINVAL;
1517
1518         nt = netdev_priv(dev);
1519         ipgre_netlink_parms(data, &p);
1520
1521         t = ipgre_tunnel_locate(net, &p, 0);
1522
1523         if (t) {
1524                 if (t->dev != dev)
1525                         return -EEXIST;
1526         } else {
1527                 t = nt;
1528
1529                 if (dev->type != ARPHRD_ETHER) {
1530                         unsigned int nflags = 0;
1531
1532                         if (ipv4_is_multicast(p.iph.daddr))
1533                                 nflags = IFF_BROADCAST;
1534                         else if (p.iph.daddr)
1535                                 nflags = IFF_POINTOPOINT;
1536
1537                         if ((dev->flags ^ nflags) &
1538                             (IFF_POINTOPOINT | IFF_BROADCAST))
1539                                 return -EINVAL;
1540                 }
1541
1542                 ipgre_tunnel_unlink(ign, t);
1543                 t->parms.iph.saddr = p.iph.saddr;
1544                 t->parms.iph.daddr = p.iph.daddr;
1545                 t->parms.i_key = p.i_key;
1546                 if (dev->type != ARPHRD_ETHER) {
1547                         memcpy(dev->dev_addr, &p.iph.saddr, 4);
1548                         memcpy(dev->broadcast, &p.iph.daddr, 4);
1549                 }
1550                 ipgre_tunnel_link(ign, t);
1551                 netdev_state_change(dev);
1552         }
1553
1554         t->parms.o_key = p.o_key;
1555         t->parms.iph.ttl = p.iph.ttl;
1556         t->parms.iph.tos = p.iph.tos;
1557         t->parms.iph.frag_off = p.iph.frag_off;
1558
1559         if (t->parms.link != p.link) {
1560                 t->parms.link = p.link;
1561                 mtu = ipgre_tunnel_bind_dev(dev);
1562                 if (!tb[IFLA_MTU])
1563                         dev->mtu = mtu;
1564                 netdev_state_change(dev);
1565         }
1566
1567         return 0;
1568 }
1569
1570 static size_t ipgre_get_size(const struct net_device *dev)
1571 {
1572         return
1573                 /* IFLA_GRE_LINK */
1574                 nla_total_size(4) +
1575                 /* IFLA_GRE_IFLAGS */
1576                 nla_total_size(2) +
1577                 /* IFLA_GRE_OFLAGS */
1578                 nla_total_size(2) +
1579                 /* IFLA_GRE_IKEY */
1580                 nla_total_size(4) +
1581                 /* IFLA_GRE_OKEY */
1582                 nla_total_size(4) +
1583                 /* IFLA_GRE_LOCAL */
1584                 nla_total_size(4) +
1585                 /* IFLA_GRE_REMOTE */
1586                 nla_total_size(4) +
1587                 /* IFLA_GRE_TTL */
1588                 nla_total_size(1) +
1589                 /* IFLA_GRE_TOS */
1590                 nla_total_size(1) +
1591                 /* IFLA_GRE_PMTUDISC */
1592                 nla_total_size(1) +
1593                 0;
1594 }
1595
1596 static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev)
1597 {
1598         struct ip_tunnel *t = netdev_priv(dev);
1599         struct ip_tunnel_parm *p = &t->parms;
1600
1601         NLA_PUT_U32(skb, IFLA_GRE_LINK, p->link);
1602         NLA_PUT_BE16(skb, IFLA_GRE_IFLAGS, p->i_flags);
1603         NLA_PUT_BE16(skb, IFLA_GRE_OFLAGS, p->o_flags);
1604         NLA_PUT_BE32(skb, IFLA_GRE_IKEY, p->i_key);
1605         NLA_PUT_BE32(skb, IFLA_GRE_OKEY, p->o_key);
1606         NLA_PUT_BE32(skb, IFLA_GRE_LOCAL, p->iph.saddr);
1607         NLA_PUT_BE32(skb, IFLA_GRE_REMOTE, p->iph.daddr);
1608         NLA_PUT_U8(skb, IFLA_GRE_TTL, p->iph.ttl);
1609         NLA_PUT_U8(skb, IFLA_GRE_TOS, p->iph.tos);
1610         NLA_PUT_U8(skb, IFLA_GRE_PMTUDISC, !!(p->iph.frag_off & htons(IP_DF)));
1611
1612         return 0;
1613
1614 nla_put_failure:
1615         return -EMSGSIZE;
1616 }
1617
1618 static const struct nla_policy ipgre_policy[IFLA_GRE_MAX + 1] = {
1619         [IFLA_GRE_LINK]         = { .type = NLA_U32 },
1620         [IFLA_GRE_IFLAGS]       = { .type = NLA_U16 },
1621         [IFLA_GRE_OFLAGS]       = { .type = NLA_U16 },
1622         [IFLA_GRE_IKEY]         = { .type = NLA_U32 },
1623         [IFLA_GRE_OKEY]         = { .type = NLA_U32 },
1624         [IFLA_GRE_LOCAL]        = { .len = FIELD_SIZEOF(struct iphdr, saddr) },
1625         [IFLA_GRE_REMOTE]       = { .len = FIELD_SIZEOF(struct iphdr, daddr) },
1626         [IFLA_GRE_TTL]          = { .type = NLA_U8 },
1627         [IFLA_GRE_TOS]          = { .type = NLA_U8 },
1628         [IFLA_GRE_PMTUDISC]     = { .type = NLA_U8 },
1629 };
1630
1631 static struct rtnl_link_ops ipgre_link_ops __read_mostly = {
1632         .kind           = "gre",
1633         .maxtype        = IFLA_GRE_MAX,
1634         .policy         = ipgre_policy,
1635         .priv_size      = sizeof(struct ip_tunnel),
1636         .setup          = ipgre_tunnel_setup,
1637         .validate       = ipgre_tunnel_validate,
1638         .newlink        = ipgre_newlink,
1639         .changelink     = ipgre_changelink,
1640         .get_size       = ipgre_get_size,
1641         .fill_info      = ipgre_fill_info,
1642 };
1643
1644 static struct rtnl_link_ops ipgre_tap_ops __read_mostly = {
1645         .kind           = "gretap",
1646         .maxtype        = IFLA_GRE_MAX,
1647         .policy         = ipgre_policy,
1648         .priv_size      = sizeof(struct ip_tunnel),
1649         .setup          = ipgre_tap_setup,
1650         .validate       = ipgre_tap_validate,
1651         .newlink        = ipgre_newlink,
1652         .changelink     = ipgre_changelink,
1653         .get_size       = ipgre_get_size,
1654         .fill_info      = ipgre_fill_info,
1655 };
1656
1657 /*
1658  *      And now the modules code and kernel interface.
1659  */
1660
1661 static int __init ipgre_init(void)
1662 {
1663         int err;
1664
1665         printk(KERN_INFO "GRE over IPv4 tunneling driver\n");
1666
1667         err = register_pernet_device(&ipgre_net_ops);
1668         if (err < 0)
1669                 return err;
1670
1671         err = gre_add_protocol(&ipgre_protocol, GREPROTO_CISCO);
1672         if (err < 0) {
1673                 printk(KERN_INFO "ipgre init: can't add protocol\n");
1674                 goto add_proto_failed;
1675         }
1676
1677         err = rtnl_link_register(&ipgre_link_ops);
1678         if (err < 0)
1679                 goto rtnl_link_failed;
1680
1681         err = rtnl_link_register(&ipgre_tap_ops);
1682         if (err < 0)
1683                 goto tap_ops_failed;
1684
1685 out:
1686         return err;
1687
1688 tap_ops_failed:
1689         rtnl_link_unregister(&ipgre_link_ops);
1690 rtnl_link_failed:
1691         gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO);
1692 add_proto_failed:
1693         unregister_pernet_device(&ipgre_net_ops);
1694         goto out;
1695 }
1696
1697 static void __exit ipgre_fini(void)
1698 {
1699         rtnl_link_unregister(&ipgre_tap_ops);
1700         rtnl_link_unregister(&ipgre_link_ops);
1701         if (gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO) < 0)
1702                 printk(KERN_INFO "ipgre close: can't remove protocol\n");
1703         unregister_pernet_device(&ipgre_net_ops);
1704 }
1705
1706 module_init(ipgre_init);
1707 module_exit(ipgre_fini);
1708 MODULE_LICENSE("GPL");
1709 MODULE_ALIAS_RTNL_LINK("gre");
1710 MODULE_ALIAS_RTNL_LINK("gretap");