]> nv-tegra.nvidia Code Review - linux-2.6.git/blob - net/ipv4/ip_gre.c
tunnels: Optimize tx path
[linux-2.6.git] / net / ipv4 / ip_gre.c
1 /*
2  *      Linux NET3:     GRE over IP protocol decoder.
3  *
4  *      Authors: Alexey Kuznetsov (kuznet@ms2.inr.ac.ru)
5  *
6  *      This program is free software; you can redistribute it and/or
7  *      modify it under the terms of the GNU General Public License
8  *      as published by the Free Software Foundation; either version
9  *      2 of the License, or (at your option) any later version.
10  *
11  */
12
13 #include <linux/capability.h>
14 #include <linux/module.h>
15 #include <linux/types.h>
16 #include <linux/kernel.h>
17 #include <asm/uaccess.h>
18 #include <linux/skbuff.h>
19 #include <linux/netdevice.h>
20 #include <linux/in.h>
21 #include <linux/tcp.h>
22 #include <linux/udp.h>
23 #include <linux/if_arp.h>
24 #include <linux/mroute.h>
25 #include <linux/init.h>
26 #include <linux/in6.h>
27 #include <linux/inetdevice.h>
28 #include <linux/igmp.h>
29 #include <linux/netfilter_ipv4.h>
30 #include <linux/etherdevice.h>
31 #include <linux/if_ether.h>
32
33 #include <net/sock.h>
34 #include <net/ip.h>
35 #include <net/icmp.h>
36 #include <net/protocol.h>
37 #include <net/ipip.h>
38 #include <net/arp.h>
39 #include <net/checksum.h>
40 #include <net/dsfield.h>
41 #include <net/inet_ecn.h>
42 #include <net/xfrm.h>
43 #include <net/net_namespace.h>
44 #include <net/netns/generic.h>
45 #include <net/rtnetlink.h>
46
47 #ifdef CONFIG_IPV6
48 #include <net/ipv6.h>
49 #include <net/ip6_fib.h>
50 #include <net/ip6_route.h>
51 #endif
52
53 /*
54    Problems & solutions
55    --------------------
56
57    1. The most important issue is detecting local dead loops.
58    They would cause complete host lockup in transmit, which
59    would be "resolved" by stack overflow or, if queueing is enabled,
60    with infinite looping in net_bh.
61
62    We cannot track such dead loops during route installation,
63    it is infeasible task. The most general solutions would be
64    to keep skb->encapsulation counter (sort of local ttl),
65    and silently drop packet when it expires. It is the best
66    solution, but it supposes maintaining a new variable in ALL
67    skb, even if no tunneling is used.
68
69    Current solution: HARD_TX_LOCK lock breaks dead loops.
70
71
72
73    2. Networking dead loops would not kill routers, but would really
74    kill network. IP hop limit plays role of "t->recursion" in this case,
75    if we copy it from packet being encapsulated to upper header.
76    It is very good solution, but it introduces two problems:
77
78    - Routing protocols, using packets with ttl=1 (OSPF, RIP2),
79      do not work over tunnels.
80    - traceroute does not work. I planned to relay ICMP from tunnel,
81      so that this problem would be solved and traceroute output
82      would even more informative. This idea appeared to be wrong:
83      only Linux complies to rfc1812 now (yes, guys, Linux is the only
84      true router now :-)), all routers (at least, in neighbourhood of mine)
85      return only 8 bytes of payload. It is the end.
86
87    Hence, if we want that OSPF worked or traceroute said something reasonable,
88    we should search for another solution.
89
90    One of them is to parse packet trying to detect inner encapsulation
91    made by our node. It is difficult or even impossible, especially,
92    taking into account fragmentation. To be short, it is not a solution at all.
93
94    Current solution: The solution was UNEXPECTEDLY SIMPLE.
95    We force DF flag on tunnels with preconfigured hop limit,
96    that is ALL. :-) Well, it does not remove the problem completely,
97    but exponential growth of network traffic is changed to linear
98    (branches, that exceed pmtu are pruned) and tunnel mtu
99    fastly degrades to value <68, where looping stops.
100    Yes, it is not good if there exists a router in the loop,
101    which does not force DF, even when encapsulating packets have DF set.
102    But it is not our problem! Nobody could accuse us, we made
103    all that we could make. Even if it is your gated who injected
104    fatal route to network, even if it were you who configured
105    fatal static route: you are innocent. :-)
106
107
108
109    3. Really, ipv4/ipip.c, ipv4/ip_gre.c and ipv6/sit.c contain
110    practically identical code. It would be good to glue them
111    together, but it is not very evident, how to make them modular.
112    sit is integral part of IPv6, ipip and gre are naturally modular.
113    We could extract common parts (hash table, ioctl etc)
114    to a separate module (ip_tunnel.c).
115
116    Alexey Kuznetsov.
117  */
118
119 static struct rtnl_link_ops ipgre_link_ops __read_mostly;
120 static int ipgre_tunnel_init(struct net_device *dev);
121 static void ipgre_tunnel_setup(struct net_device *dev);
122 static int ipgre_tunnel_bind_dev(struct net_device *dev);
123
124 /* Fallback tunnel: no source, no destination, no key, no options */
125
126 #define HASH_SIZE  16
127
128 static int ipgre_net_id;
129 struct ipgre_net {
130         struct ip_tunnel *tunnels[4][HASH_SIZE];
131
132         struct net_device *fb_tunnel_dev;
133 };
134
135 /* Tunnel hash table */
136
137 /*
138    4 hash tables:
139
140    3: (remote,local)
141    2: (remote,*)
142    1: (*,local)
143    0: (*,*)
144
145    We require exact key match i.e. if a key is present in packet
146    it will match only tunnel with the same key; if it is not present,
147    it will match only keyless tunnel.
148
149    All keyless packets, if not matched against configured keyless tunnels,
150    will match the fallback tunnel.
151  */
152
153 #define HASH(addr) (((__force u32)addr^((__force u32)addr>>4))&0xF)
154
155 #define tunnels_r_l     tunnels[3]
156 #define tunnels_r       tunnels[2]
157 #define tunnels_l       tunnels[1]
158 #define tunnels_wc      tunnels[0]
159
160 static DEFINE_RWLOCK(ipgre_lock);
161
162 /* Given src, dst and key, find appropriate for input tunnel. */
163
164 static struct ip_tunnel * ipgre_tunnel_lookup(struct net_device *dev,
165                                               __be32 remote, __be32 local,
166                                               __be32 key, __be16 gre_proto)
167 {
168         struct net *net = dev_net(dev);
169         int link = dev->ifindex;
170         unsigned h0 = HASH(remote);
171         unsigned h1 = HASH(key);
172         struct ip_tunnel *t, *cand = NULL;
173         struct ipgre_net *ign = net_generic(net, ipgre_net_id);
174         int dev_type = (gre_proto == htons(ETH_P_TEB)) ?
175                        ARPHRD_ETHER : ARPHRD_IPGRE;
176         int score, cand_score = 4;
177
178         for (t = ign->tunnels_r_l[h0^h1]; t; t = t->next) {
179                 if (local != t->parms.iph.saddr ||
180                     remote != t->parms.iph.daddr ||
181                     key != t->parms.i_key ||
182                     !(t->dev->flags & IFF_UP))
183                         continue;
184
185                 if (t->dev->type != ARPHRD_IPGRE &&
186                     t->dev->type != dev_type)
187                         continue;
188
189                 score = 0;
190                 if (t->parms.link != link)
191                         score |= 1;
192                 if (t->dev->type != dev_type)
193                         score |= 2;
194                 if (score == 0)
195                         return t;
196
197                 if (score < cand_score) {
198                         cand = t;
199                         cand_score = score;
200                 }
201         }
202
203         for (t = ign->tunnels_r[h0^h1]; t; t = t->next) {
204                 if (remote != t->parms.iph.daddr ||
205                     key != t->parms.i_key ||
206                     !(t->dev->flags & IFF_UP))
207                         continue;
208
209                 if (t->dev->type != ARPHRD_IPGRE &&
210                     t->dev->type != dev_type)
211                         continue;
212
213                 score = 0;
214                 if (t->parms.link != link)
215                         score |= 1;
216                 if (t->dev->type != dev_type)
217                         score |= 2;
218                 if (score == 0)
219                         return t;
220
221                 if (score < cand_score) {
222                         cand = t;
223                         cand_score = score;
224                 }
225         }
226
227         for (t = ign->tunnels_l[h1]; t; t = t->next) {
228                 if ((local != t->parms.iph.saddr &&
229                      (local != t->parms.iph.daddr ||
230                       !ipv4_is_multicast(local))) ||
231                     key != t->parms.i_key ||
232                     !(t->dev->flags & IFF_UP))
233                         continue;
234
235                 if (t->dev->type != ARPHRD_IPGRE &&
236                     t->dev->type != dev_type)
237                         continue;
238
239                 score = 0;
240                 if (t->parms.link != link)
241                         score |= 1;
242                 if (t->dev->type != dev_type)
243                         score |= 2;
244                 if (score == 0)
245                         return t;
246
247                 if (score < cand_score) {
248                         cand = t;
249                         cand_score = score;
250                 }
251         }
252
253         for (t = ign->tunnels_wc[h1]; t; t = t->next) {
254                 if (t->parms.i_key != key ||
255                     !(t->dev->flags & IFF_UP))
256                         continue;
257
258                 if (t->dev->type != ARPHRD_IPGRE &&
259                     t->dev->type != dev_type)
260                         continue;
261
262                 score = 0;
263                 if (t->parms.link != link)
264                         score |= 1;
265                 if (t->dev->type != dev_type)
266                         score |= 2;
267                 if (score == 0)
268                         return t;
269
270                 if (score < cand_score) {
271                         cand = t;
272                         cand_score = score;
273                 }
274         }
275
276         if (cand != NULL)
277                 return cand;
278
279         if (ign->fb_tunnel_dev->flags & IFF_UP)
280                 return netdev_priv(ign->fb_tunnel_dev);
281
282         return NULL;
283 }
284
285 static struct ip_tunnel **__ipgre_bucket(struct ipgre_net *ign,
286                 struct ip_tunnel_parm *parms)
287 {
288         __be32 remote = parms->iph.daddr;
289         __be32 local = parms->iph.saddr;
290         __be32 key = parms->i_key;
291         unsigned h = HASH(key);
292         int prio = 0;
293
294         if (local)
295                 prio |= 1;
296         if (remote && !ipv4_is_multicast(remote)) {
297                 prio |= 2;
298                 h ^= HASH(remote);
299         }
300
301         return &ign->tunnels[prio][h];
302 }
303
304 static inline struct ip_tunnel **ipgre_bucket(struct ipgre_net *ign,
305                 struct ip_tunnel *t)
306 {
307         return __ipgre_bucket(ign, &t->parms);
308 }
309
310 static void ipgre_tunnel_link(struct ipgre_net *ign, struct ip_tunnel *t)
311 {
312         struct ip_tunnel **tp = ipgre_bucket(ign, t);
313
314         t->next = *tp;
315         write_lock_bh(&ipgre_lock);
316         *tp = t;
317         write_unlock_bh(&ipgre_lock);
318 }
319
320 static void ipgre_tunnel_unlink(struct ipgre_net *ign, struct ip_tunnel *t)
321 {
322         struct ip_tunnel **tp;
323
324         for (tp = ipgre_bucket(ign, t); *tp; tp = &(*tp)->next) {
325                 if (t == *tp) {
326                         write_lock_bh(&ipgre_lock);
327                         *tp = t->next;
328                         write_unlock_bh(&ipgre_lock);
329                         break;
330                 }
331         }
332 }
333
334 static struct ip_tunnel *ipgre_tunnel_find(struct net *net,
335                                            struct ip_tunnel_parm *parms,
336                                            int type)
337 {
338         __be32 remote = parms->iph.daddr;
339         __be32 local = parms->iph.saddr;
340         __be32 key = parms->i_key;
341         int link = parms->link;
342         struct ip_tunnel *t, **tp;
343         struct ipgre_net *ign = net_generic(net, ipgre_net_id);
344
345         for (tp = __ipgre_bucket(ign, parms); (t = *tp) != NULL; tp = &t->next)
346                 if (local == t->parms.iph.saddr &&
347                     remote == t->parms.iph.daddr &&
348                     key == t->parms.i_key &&
349                     link == t->parms.link &&
350                     type == t->dev->type)
351                         break;
352
353         return t;
354 }
355
356 static struct ip_tunnel * ipgre_tunnel_locate(struct net *net,
357                 struct ip_tunnel_parm *parms, int create)
358 {
359         struct ip_tunnel *t, *nt;
360         struct net_device *dev;
361         char name[IFNAMSIZ];
362         struct ipgre_net *ign = net_generic(net, ipgre_net_id);
363
364         t = ipgre_tunnel_find(net, parms, ARPHRD_IPGRE);
365         if (t || !create)
366                 return t;
367
368         if (parms->name[0])
369                 strlcpy(name, parms->name, IFNAMSIZ);
370         else
371                 sprintf(name, "gre%%d");
372
373         dev = alloc_netdev(sizeof(*t), name, ipgre_tunnel_setup);
374         if (!dev)
375           return NULL;
376
377         dev_net_set(dev, net);
378
379         if (strchr(name, '%')) {
380                 if (dev_alloc_name(dev, name) < 0)
381                         goto failed_free;
382         }
383
384         nt = netdev_priv(dev);
385         nt->parms = *parms;
386         dev->rtnl_link_ops = &ipgre_link_ops;
387
388         dev->mtu = ipgre_tunnel_bind_dev(dev);
389
390         if (register_netdevice(dev) < 0)
391                 goto failed_free;
392
393         dev_hold(dev);
394         ipgre_tunnel_link(ign, nt);
395         return nt;
396
397 failed_free:
398         free_netdev(dev);
399         return NULL;
400 }
401
402 static void ipgre_tunnel_uninit(struct net_device *dev)
403 {
404         struct net *net = dev_net(dev);
405         struct ipgre_net *ign = net_generic(net, ipgre_net_id);
406
407         ipgre_tunnel_unlink(ign, netdev_priv(dev));
408         dev_put(dev);
409 }
410
411
412 static void ipgre_err(struct sk_buff *skb, u32 info)
413 {
414
415 /* All the routers (except for Linux) return only
416    8 bytes of packet payload. It means, that precise relaying of
417    ICMP in the real Internet is absolutely infeasible.
418
419    Moreover, Cisco "wise men" put GRE key to the third word
420    in GRE header. It makes impossible maintaining even soft state for keyed
421    GRE tunnels with enabled checksum. Tell them "thank you".
422
423    Well, I wonder, rfc1812 was written by Cisco employee,
424    what the hell these idiots break standrads established
425    by themself???
426  */
427
428         struct iphdr *iph = (struct iphdr *)skb->data;
429         __be16       *p = (__be16*)(skb->data+(iph->ihl<<2));
430         int grehlen = (iph->ihl<<2) + 4;
431         const int type = icmp_hdr(skb)->type;
432         const int code = icmp_hdr(skb)->code;
433         struct ip_tunnel *t;
434         __be16 flags;
435
436         flags = p[0];
437         if (flags&(GRE_CSUM|GRE_KEY|GRE_SEQ|GRE_ROUTING|GRE_VERSION)) {
438                 if (flags&(GRE_VERSION|GRE_ROUTING))
439                         return;
440                 if (flags&GRE_KEY) {
441                         grehlen += 4;
442                         if (flags&GRE_CSUM)
443                                 grehlen += 4;
444                 }
445         }
446
447         /* If only 8 bytes returned, keyed message will be dropped here */
448         if (skb_headlen(skb) < grehlen)
449                 return;
450
451         switch (type) {
452         default:
453         case ICMP_PARAMETERPROB:
454                 return;
455
456         case ICMP_DEST_UNREACH:
457                 switch (code) {
458                 case ICMP_SR_FAILED:
459                 case ICMP_PORT_UNREACH:
460                         /* Impossible event. */
461                         return;
462                 case ICMP_FRAG_NEEDED:
463                         /* Soft state for pmtu is maintained by IP core. */
464                         return;
465                 default:
466                         /* All others are translated to HOST_UNREACH.
467                            rfc2003 contains "deep thoughts" about NET_UNREACH,
468                            I believe they are just ether pollution. --ANK
469                          */
470                         break;
471                 }
472                 break;
473         case ICMP_TIME_EXCEEDED:
474                 if (code != ICMP_EXC_TTL)
475                         return;
476                 break;
477         }
478
479         read_lock(&ipgre_lock);
480         t = ipgre_tunnel_lookup(skb->dev, iph->daddr, iph->saddr,
481                                 flags & GRE_KEY ?
482                                 *(((__be32 *)p) + (grehlen / 4) - 1) : 0,
483                                 p[1]);
484         if (t == NULL || t->parms.iph.daddr == 0 ||
485             ipv4_is_multicast(t->parms.iph.daddr))
486                 goto out;
487
488         if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED)
489                 goto out;
490
491         if (time_before(jiffies, t->err_time + IPTUNNEL_ERR_TIMEO))
492                 t->err_count++;
493         else
494                 t->err_count = 1;
495         t->err_time = jiffies;
496 out:
497         read_unlock(&ipgre_lock);
498         return;
499 }
500
501 static inline void ipgre_ecn_decapsulate(struct iphdr *iph, struct sk_buff *skb)
502 {
503         if (INET_ECN_is_ce(iph->tos)) {
504                 if (skb->protocol == htons(ETH_P_IP)) {
505                         IP_ECN_set_ce(ip_hdr(skb));
506                 } else if (skb->protocol == htons(ETH_P_IPV6)) {
507                         IP6_ECN_set_ce(ipv6_hdr(skb));
508                 }
509         }
510 }
511
512 static inline u8
513 ipgre_ecn_encapsulate(u8 tos, struct iphdr *old_iph, struct sk_buff *skb)
514 {
515         u8 inner = 0;
516         if (skb->protocol == htons(ETH_P_IP))
517                 inner = old_iph->tos;
518         else if (skb->protocol == htons(ETH_P_IPV6))
519                 inner = ipv6_get_dsfield((struct ipv6hdr *)old_iph);
520         return INET_ECN_encapsulate(tos, inner);
521 }
522
523 static int ipgre_rcv(struct sk_buff *skb)
524 {
525         struct iphdr *iph;
526         u8     *h;
527         __be16    flags;
528         __sum16   csum = 0;
529         __be32 key = 0;
530         u32    seqno = 0;
531         struct ip_tunnel *tunnel;
532         int    offset = 4;
533         __be16 gre_proto;
534         unsigned int len;
535
536         if (!pskb_may_pull(skb, 16))
537                 goto drop_nolock;
538
539         iph = ip_hdr(skb);
540         h = skb->data;
541         flags = *(__be16*)h;
542
543         if (flags&(GRE_CSUM|GRE_KEY|GRE_ROUTING|GRE_SEQ|GRE_VERSION)) {
544                 /* - Version must be 0.
545                    - We do not support routing headers.
546                  */
547                 if (flags&(GRE_VERSION|GRE_ROUTING))
548                         goto drop_nolock;
549
550                 if (flags&GRE_CSUM) {
551                         switch (skb->ip_summed) {
552                         case CHECKSUM_COMPLETE:
553                                 csum = csum_fold(skb->csum);
554                                 if (!csum)
555                                         break;
556                                 /* fall through */
557                         case CHECKSUM_NONE:
558                                 skb->csum = 0;
559                                 csum = __skb_checksum_complete(skb);
560                                 skb->ip_summed = CHECKSUM_COMPLETE;
561                         }
562                         offset += 4;
563                 }
564                 if (flags&GRE_KEY) {
565                         key = *(__be32*)(h + offset);
566                         offset += 4;
567                 }
568                 if (flags&GRE_SEQ) {
569                         seqno = ntohl(*(__be32*)(h + offset));
570                         offset += 4;
571                 }
572         }
573
574         gre_proto = *(__be16 *)(h + 2);
575
576         read_lock(&ipgre_lock);
577         if ((tunnel = ipgre_tunnel_lookup(skb->dev,
578                                           iph->saddr, iph->daddr, key,
579                                           gre_proto))) {
580                 struct net_device_stats *stats = &tunnel->dev->stats;
581
582                 secpath_reset(skb);
583
584                 skb->protocol = gre_proto;
585                 /* WCCP version 1 and 2 protocol decoding.
586                  * - Change protocol to IP
587                  * - When dealing with WCCPv2, Skip extra 4 bytes in GRE header
588                  */
589                 if (flags == 0 && gre_proto == htons(ETH_P_WCCP)) {
590                         skb->protocol = htons(ETH_P_IP);
591                         if ((*(h + offset) & 0xF0) != 0x40)
592                                 offset += 4;
593                 }
594
595                 skb->mac_header = skb->network_header;
596                 __pskb_pull(skb, offset);
597                 skb_postpull_rcsum(skb, skb_transport_header(skb), offset);
598                 skb->pkt_type = PACKET_HOST;
599 #ifdef CONFIG_NET_IPGRE_BROADCAST
600                 if (ipv4_is_multicast(iph->daddr)) {
601                         /* Looped back packet, drop it! */
602                         if (skb_rtable(skb)->fl.iif == 0)
603                                 goto drop;
604                         stats->multicast++;
605                         skb->pkt_type = PACKET_BROADCAST;
606                 }
607 #endif
608
609                 if (((flags&GRE_CSUM) && csum) ||
610                     (!(flags&GRE_CSUM) && tunnel->parms.i_flags&GRE_CSUM)) {
611                         stats->rx_crc_errors++;
612                         stats->rx_errors++;
613                         goto drop;
614                 }
615                 if (tunnel->parms.i_flags&GRE_SEQ) {
616                         if (!(flags&GRE_SEQ) ||
617                             (tunnel->i_seqno && (s32)(seqno - tunnel->i_seqno) < 0)) {
618                                 stats->rx_fifo_errors++;
619                                 stats->rx_errors++;
620                                 goto drop;
621                         }
622                         tunnel->i_seqno = seqno + 1;
623                 }
624
625                 len = skb->len;
626
627                 /* Warning: All skb pointers will be invalidated! */
628                 if (tunnel->dev->type == ARPHRD_ETHER) {
629                         if (!pskb_may_pull(skb, ETH_HLEN)) {
630                                 stats->rx_length_errors++;
631                                 stats->rx_errors++;
632                                 goto drop;
633                         }
634
635                         iph = ip_hdr(skb);
636                         skb->protocol = eth_type_trans(skb, tunnel->dev);
637                         skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
638                 }
639
640                 stats->rx_packets++;
641                 stats->rx_bytes += len;
642                 skb->dev = tunnel->dev;
643                 skb_dst_drop(skb);
644                 nf_reset(skb);
645
646                 skb_reset_network_header(skb);
647                 ipgre_ecn_decapsulate(iph, skb);
648
649                 netif_rx(skb);
650                 read_unlock(&ipgre_lock);
651                 return(0);
652         }
653         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
654
655 drop:
656         read_unlock(&ipgre_lock);
657 drop_nolock:
658         kfree_skb(skb);
659         return(0);
660 }
661
662 static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
663 {
664         struct ip_tunnel *tunnel = netdev_priv(dev);
665         struct net_device_stats *stats = &dev->stats;
666         struct netdev_queue *txq = netdev_get_tx_queue(dev, 0);
667         struct iphdr  *old_iph = ip_hdr(skb);
668         struct iphdr  *tiph;
669         u8     tos;
670         __be16 df;
671         struct rtable *rt;                      /* Route to the other host */
672         struct net_device *tdev;                        /* Device to other host */
673         struct iphdr  *iph;                     /* Our new IP header */
674         unsigned int max_headroom;              /* The extra header space needed */
675         int    gre_hlen;
676         __be32 dst;
677         int    mtu;
678
679         if (dev->type == ARPHRD_ETHER)
680                 IPCB(skb)->flags = 0;
681
682         if (dev->header_ops && dev->type == ARPHRD_IPGRE) {
683                 gre_hlen = 0;
684                 tiph = (struct iphdr *)skb->data;
685         } else {
686                 gre_hlen = tunnel->hlen;
687                 tiph = &tunnel->parms.iph;
688         }
689
690         if ((dst = tiph->daddr) == 0) {
691                 /* NBMA tunnel */
692
693                 if (skb_dst(skb) == NULL) {
694                         stats->tx_fifo_errors++;
695                         goto tx_error;
696                 }
697
698                 if (skb->protocol == htons(ETH_P_IP)) {
699                         rt = skb_rtable(skb);
700                         if ((dst = rt->rt_gateway) == 0)
701                                 goto tx_error_icmp;
702                 }
703 #ifdef CONFIG_IPV6
704                 else if (skb->protocol == htons(ETH_P_IPV6)) {
705                         struct in6_addr *addr6;
706                         int addr_type;
707                         struct neighbour *neigh = skb_dst(skb)->neighbour;
708
709                         if (neigh == NULL)
710                                 goto tx_error;
711
712                         addr6 = (struct in6_addr *)&neigh->primary_key;
713                         addr_type = ipv6_addr_type(addr6);
714
715                         if (addr_type == IPV6_ADDR_ANY) {
716                                 addr6 = &ipv6_hdr(skb)->daddr;
717                                 addr_type = ipv6_addr_type(addr6);
718                         }
719
720                         if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
721                                 goto tx_error_icmp;
722
723                         dst = addr6->s6_addr32[3];
724                 }
725 #endif
726                 else
727                         goto tx_error;
728         }
729
730         tos = tiph->tos;
731         if (tos == 1) {
732                 tos = 0;
733                 if (skb->protocol == htons(ETH_P_IP))
734                         tos = old_iph->tos;
735         }
736
737         {
738                 struct flowi fl = { .oif = tunnel->parms.link,
739                                     .nl_u = { .ip4_u =
740                                               { .daddr = dst,
741                                                 .saddr = tiph->saddr,
742                                                 .tos = RT_TOS(tos) } },
743                                     .proto = IPPROTO_GRE };
744                 if (ip_route_output_key(dev_net(dev), &rt, &fl)) {
745                         stats->tx_carrier_errors++;
746                         goto tx_error;
747                 }
748         }
749         tdev = rt->u.dst.dev;
750
751         if (tdev == dev) {
752                 ip_rt_put(rt);
753                 stats->collisions++;
754                 goto tx_error;
755         }
756
757         df = tiph->frag_off;
758         if (df)
759                 mtu = dst_mtu(&rt->u.dst) - dev->hard_header_len - tunnel->hlen;
760         else
761                 mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;
762
763         if (skb_dst(skb))
764                 skb_dst(skb)->ops->update_pmtu(skb_dst(skb), mtu);
765
766         if (skb->protocol == htons(ETH_P_IP)) {
767                 df |= (old_iph->frag_off&htons(IP_DF));
768
769                 if ((old_iph->frag_off&htons(IP_DF)) &&
770                     mtu < ntohs(old_iph->tot_len)) {
771                         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
772                         ip_rt_put(rt);
773                         goto tx_error;
774                 }
775         }
776 #ifdef CONFIG_IPV6
777         else if (skb->protocol == htons(ETH_P_IPV6)) {
778                 struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb);
779
780                 if (rt6 && mtu < dst_mtu(skb_dst(skb)) && mtu >= IPV6_MIN_MTU) {
781                         if ((tunnel->parms.iph.daddr &&
782                              !ipv4_is_multicast(tunnel->parms.iph.daddr)) ||
783                             rt6->rt6i_dst.plen == 128) {
784                                 rt6->rt6i_flags |= RTF_MODIFIED;
785                                 skb_dst(skb)->metrics[RTAX_MTU-1] = mtu;
786                         }
787                 }
788
789                 if (mtu >= IPV6_MIN_MTU && mtu < skb->len - tunnel->hlen + gre_hlen) {
790                         icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, dev);
791                         ip_rt_put(rt);
792                         goto tx_error;
793                 }
794         }
795 #endif
796
797         if (tunnel->err_count > 0) {
798                 if (time_before(jiffies,
799                                 tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
800                         tunnel->err_count--;
801
802                         dst_link_failure(skb);
803                 } else
804                         tunnel->err_count = 0;
805         }
806
807         max_headroom = LL_RESERVED_SPACE(tdev) + gre_hlen;
808
809         if (skb_headroom(skb) < max_headroom || skb_shared(skb)||
810             (skb_cloned(skb) && !skb_clone_writable(skb, 0))) {
811                 struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom);
812                 if (!new_skb) {
813                         ip_rt_put(rt);
814                         txq->tx_dropped++;
815                         dev_kfree_skb(skb);
816                         return NETDEV_TX_OK;
817                 }
818                 if (skb->sk)
819                         skb_set_owner_w(new_skb, skb->sk);
820                 dev_kfree_skb(skb);
821                 skb = new_skb;
822                 old_iph = ip_hdr(skb);
823         }
824
825         skb_reset_transport_header(skb);
826         skb_push(skb, gre_hlen);
827         skb_reset_network_header(skb);
828         memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
829         IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED |
830                               IPSKB_REROUTED);
831         skb_dst_drop(skb);
832         skb_dst_set(skb, &rt->u.dst);
833
834         /*
835          *      Push down and install the IPIP header.
836          */
837
838         iph                     =       ip_hdr(skb);
839         iph->version            =       4;
840         iph->ihl                =       sizeof(struct iphdr) >> 2;
841         iph->frag_off           =       df;
842         iph->protocol           =       IPPROTO_GRE;
843         iph->tos                =       ipgre_ecn_encapsulate(tos, old_iph, skb);
844         iph->daddr              =       rt->rt_dst;
845         iph->saddr              =       rt->rt_src;
846
847         if ((iph->ttl = tiph->ttl) == 0) {
848                 if (skb->protocol == htons(ETH_P_IP))
849                         iph->ttl = old_iph->ttl;
850 #ifdef CONFIG_IPV6
851                 else if (skb->protocol == htons(ETH_P_IPV6))
852                         iph->ttl = ((struct ipv6hdr *)old_iph)->hop_limit;
853 #endif
854                 else
855                         iph->ttl = dst_metric(&rt->u.dst, RTAX_HOPLIMIT);
856         }
857
858         ((__be16 *)(iph + 1))[0] = tunnel->parms.o_flags;
859         ((__be16 *)(iph + 1))[1] = (dev->type == ARPHRD_ETHER) ?
860                                    htons(ETH_P_TEB) : skb->protocol;
861
862         if (tunnel->parms.o_flags&(GRE_KEY|GRE_CSUM|GRE_SEQ)) {
863                 __be32 *ptr = (__be32*)(((u8*)iph) + tunnel->hlen - 4);
864
865                 if (tunnel->parms.o_flags&GRE_SEQ) {
866                         ++tunnel->o_seqno;
867                         *ptr = htonl(tunnel->o_seqno);
868                         ptr--;
869                 }
870                 if (tunnel->parms.o_flags&GRE_KEY) {
871                         *ptr = tunnel->parms.o_key;
872                         ptr--;
873                 }
874                 if (tunnel->parms.o_flags&GRE_CSUM) {
875                         *ptr = 0;
876                         *(__sum16*)ptr = ip_compute_csum((void*)(iph+1), skb->len - sizeof(struct iphdr));
877                 }
878         }
879
880         nf_reset(skb);
881
882         IPTUNNEL_XMIT();
883         return NETDEV_TX_OK;
884
885 tx_error_icmp:
886         dst_link_failure(skb);
887
888 tx_error:
889         stats->tx_errors++;
890         dev_kfree_skb(skb);
891         return NETDEV_TX_OK;
892 }
893
894 static int ipgre_tunnel_bind_dev(struct net_device *dev)
895 {
896         struct net_device *tdev = NULL;
897         struct ip_tunnel *tunnel;
898         struct iphdr *iph;
899         int hlen = LL_MAX_HEADER;
900         int mtu = ETH_DATA_LEN;
901         int addend = sizeof(struct iphdr) + 4;
902
903         tunnel = netdev_priv(dev);
904         iph = &tunnel->parms.iph;
905
906         /* Guess output device to choose reasonable mtu and needed_headroom */
907
908         if (iph->daddr) {
909                 struct flowi fl = { .oif = tunnel->parms.link,
910                                     .nl_u = { .ip4_u =
911                                               { .daddr = iph->daddr,
912                                                 .saddr = iph->saddr,
913                                                 .tos = RT_TOS(iph->tos) } },
914                                     .proto = IPPROTO_GRE };
915                 struct rtable *rt;
916                 if (!ip_route_output_key(dev_net(dev), &rt, &fl)) {
917                         tdev = rt->u.dst.dev;
918                         ip_rt_put(rt);
919                 }
920
921                 if (dev->type != ARPHRD_ETHER)
922                         dev->flags |= IFF_POINTOPOINT;
923         }
924
925         if (!tdev && tunnel->parms.link)
926                 tdev = __dev_get_by_index(dev_net(dev), tunnel->parms.link);
927
928         if (tdev) {
929                 hlen = tdev->hard_header_len + tdev->needed_headroom;
930                 mtu = tdev->mtu;
931         }
932         dev->iflink = tunnel->parms.link;
933
934         /* Precalculate GRE options length */
935         if (tunnel->parms.o_flags&(GRE_CSUM|GRE_KEY|GRE_SEQ)) {
936                 if (tunnel->parms.o_flags&GRE_CSUM)
937                         addend += 4;
938                 if (tunnel->parms.o_flags&GRE_KEY)
939                         addend += 4;
940                 if (tunnel->parms.o_flags&GRE_SEQ)
941                         addend += 4;
942         }
943         dev->needed_headroom = addend + hlen;
944         mtu -= dev->hard_header_len + addend;
945
946         if (mtu < 68)
947                 mtu = 68;
948
949         tunnel->hlen = addend;
950
951         return mtu;
952 }
953
/*
 * Tunnel configuration ioctls (SIOC{GET,ADD,CHG,DEL}TUNNEL).
 *
 * On the fallback device "gre0" the GET/DEL requests carry the target
 * tunnel's parameters through ifr->ifr_ifru.ifru_data; on any other
 * device they act on the device itself.  Returns 0 or negative errno.
 */
static int
ipgre_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd)
{
	int err = 0;
	struct ip_tunnel_parm p;
	struct ip_tunnel *t;
	struct net *net = dev_net(dev);
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);

	switch (cmd) {
	case SIOCGETTUNNEL:
		t = NULL;
		if (dev == ign->fb_tunnel_dev) {
			/* On gre0, look up the tunnel named in the request. */
			if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) {
				err = -EFAULT;
				break;
			}
			t = ipgre_tunnel_locate(net, &p, 0);
		}
		if (t == NULL)
			t = netdev_priv(dev);
		memcpy(&p, &t->parms, sizeof(p));
		if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
			err = -EFAULT;
		break;

	case SIOCADDTUNNEL:
	case SIOCCHGTUNNEL:
		err = -EPERM;
		if (!capable(CAP_NET_ADMIN))
			goto done;

		err = -EFAULT;
		if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
			goto done;

		/* Only plain IPv4 (no options) over GRE without the
		 * version/routing bits is supported. */
		err = -EINVAL;
		if (p.iph.version != 4 || p.iph.protocol != IPPROTO_GRE ||
		    p.iph.ihl != 5 || (p.iph.frag_off&htons(~IP_DF)) ||
		    ((p.i_flags|p.o_flags)&(GRE_VERSION|GRE_ROUTING)))
			goto done;
		if (p.iph.ttl)
			p.iph.frag_off |= htons(IP_DF);

		/* Keys are meaningful only when the KEY flag is present. */
		if (!(p.i_flags&GRE_KEY))
			p.i_key = 0;
		if (!(p.o_flags&GRE_KEY))
			p.o_key = 0;

		t = ipgre_tunnel_locate(net, &p, cmd == SIOCADDTUNNEL);

		if (dev != ign->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
			if (t != NULL) {
				/* The requested parameters already belong
				 * to a different device. */
				if (t->dev != dev) {
					err = -EEXIST;
					break;
				}
			} else {
				unsigned nflags = 0;

				t = netdev_priv(dev);

				if (ipv4_is_multicast(p.iph.daddr))
					nflags = IFF_BROADCAST;
				else if (p.iph.daddr)
					nflags = IFF_POINTOPOINT;

				/* Changing a live device between p2p and
				 * broadcast mode is not allowed. */
				if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
					err = -EINVAL;
					break;
				}
				/* Re-hash the tunnel under its new
				 * addresses and keys. */
				ipgre_tunnel_unlink(ign, t);
				t->parms.iph.saddr = p.iph.saddr;
				t->parms.iph.daddr = p.iph.daddr;
				t->parms.i_key = p.i_key;
				t->parms.o_key = p.o_key;
				memcpy(dev->dev_addr, &p.iph.saddr, 4);
				memcpy(dev->broadcast, &p.iph.daddr, 4);
				ipgre_tunnel_link(ign, t);
				netdev_state_change(dev);
			}
		}

		if (t) {
			err = 0;
			if (cmd == SIOCCHGTUNNEL) {
				t->parms.iph.ttl = p.iph.ttl;
				t->parms.iph.tos = p.iph.tos;
				t->parms.iph.frag_off = p.iph.frag_off;
				if (t->parms.link != p.link) {
					t->parms.link = p.link;
					dev->mtu = ipgre_tunnel_bind_dev(dev);
					netdev_state_change(dev);
				}
			}
			/* Report the effective parameters back to userspace. */
			if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms, sizeof(p)))
				err = -EFAULT;
		} else
			err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT);
		break;

	case SIOCDELTUNNEL:
		err = -EPERM;
		if (!capable(CAP_NET_ADMIN))
			goto done;

		if (dev == ign->fb_tunnel_dev) {
			err = -EFAULT;
			if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
				goto done;
			err = -ENOENT;
			if ((t = ipgre_tunnel_locate(net, &p, 0)) == NULL)
				goto done;
			err = -EPERM;
			/* The fallback device itself may not be deleted. */
			if (t == netdev_priv(ign->fb_tunnel_dev))
				goto done;
			dev = t->dev;
		}
		unregister_netdevice(dev);
		err = 0;
		break;

	default:
		err = -EINVAL;
	}

done:
	return err;
}
1083
1084 static int ipgre_tunnel_change_mtu(struct net_device *dev, int new_mtu)
1085 {
1086         struct ip_tunnel *tunnel = netdev_priv(dev);
1087         if (new_mtu < 68 ||
1088             new_mtu > 0xFFF8 - dev->hard_header_len - tunnel->hlen)
1089                 return -EINVAL;
1090         dev->mtu = new_mtu;
1091         return 0;
1092 }
1093
1094 /* Nice toy. Unfortunately, useless in real life :-)
   It allows one to construct a virtual multiprotocol broadcast "LAN"
1096    over the Internet, provided multicast routing is tuned.
1097
1098
   I have no idea whether this bicycle was invented before me,
   so I had to set ARPHRD_IPGRE to a random value.
   I have the impression that Cisco could make something similar,
   but this feature is apparently missing in IOS<=11.2(8).
1103
1104    I set up 10.66.66/24 and fec0:6666:6666::0/96 as virtual networks
1105    with broadcast 224.66.66.66. If you have access to mbone, play with me :-)
1106
1107    ping -t 255 224.66.66.66
1108
1109    If nobody answers, mbone does not work.
1110
1111    ip tunnel add Universe mode gre remote 224.66.66.66 local <Your_real_addr> ttl 255
1112    ip addr add 10.66.66.<somewhat>/24 dev Universe
1113    ifconfig Universe up
1114    ifconfig Universe add fe80::<Your_real_addr>/10
1115    ifconfig Universe add fec0:6666:6666::<Your_real_addr>/96
1116    ftp 10.66.66.66
1117    ...
1118    ftp fec0:6666:6666::193.233.7.65
1119    ...
1120
1121  */
1122
1123 static int ipgre_header(struct sk_buff *skb, struct net_device *dev,
1124                         unsigned short type,
1125                         const void *daddr, const void *saddr, unsigned len)
1126 {
1127         struct ip_tunnel *t = netdev_priv(dev);
1128         struct iphdr *iph = (struct iphdr *)skb_push(skb, t->hlen);
1129         __be16 *p = (__be16*)(iph+1);
1130
1131         memcpy(iph, &t->parms.iph, sizeof(struct iphdr));
1132         p[0]            = t->parms.o_flags;
1133         p[1]            = htons(type);
1134
1135         /*
1136          *      Set the source hardware address.
1137          */
1138
1139         if (saddr)
1140                 memcpy(&iph->saddr, saddr, 4);
1141
1142         if (daddr) {
1143                 memcpy(&iph->daddr, daddr, 4);
1144                 return t->hlen;
1145         }
1146         if (iph->daddr && !ipv4_is_multicast(iph->daddr))
1147                 return t->hlen;
1148
1149         return -t->hlen;
1150 }
1151
1152 static int ipgre_header_parse(const struct sk_buff *skb, unsigned char *haddr)
1153 {
1154         struct iphdr *iph = (struct iphdr *) skb_mac_header(skb);
1155         memcpy(haddr, &iph->saddr, 4);
1156         return 4;
1157 }
1158
/* Link-layer header ops used when a GRE device needs address
 * resolution (multicast or unconfigured destination); see
 * ipgre_tunnel_init(). */
static const struct header_ops ipgre_header_ops = {
	.create = ipgre_header,
	.parse  = ipgre_header_parse,
};
1163
1164 #ifdef CONFIG_NET_IPGRE_BROADCAST
/* ndo_open for broadcast-capable GRE devices: when the tunnel
 * destination is multicast, join that group on the device the route
 * resolves to and remember its ifindex for ipgre_close().
 */
static int ipgre_open(struct net_device *dev)
{
	struct ip_tunnel *t = netdev_priv(dev);

	if (ipv4_is_multicast(t->parms.iph.daddr)) {
		struct flowi fl = { .oif = t->parms.link,
				    .nl_u = { .ip4_u =
					      { .daddr = t->parms.iph.daddr,
						.saddr = t->parms.iph.saddr,
						.tos = RT_TOS(t->parms.iph.tos) } },
				    .proto = IPPROTO_GRE };
		struct rtable *rt;
		if (ip_route_output_key(dev_net(dev), &rt, &fl))
			return -EADDRNOTAVAIL;
		/* Note: 'dev' is rebound to the underlying device here. */
		dev = rt->u.dst.dev;
		ip_rt_put(rt);
		if (__in_dev_get_rtnl(dev) == NULL)
			return -EADDRNOTAVAIL;
		t->mlink = dev->ifindex;
		ip_mc_inc_group(__in_dev_get_rtnl(dev), t->parms.iph.daddr);
	}
	return 0;
}
1188
1189 static int ipgre_close(struct net_device *dev)
1190 {
1191         struct ip_tunnel *t = netdev_priv(dev);
1192
1193         if (ipv4_is_multicast(t->parms.iph.daddr) && t->mlink) {
1194                 struct in_device *in_dev;
1195                 in_dev = inetdev_by_index(dev_net(dev), t->mlink);
1196                 if (in_dev) {
1197                         ip_mc_dec_group(in_dev, t->parms.iph.daddr);
1198                         in_dev_put(in_dev);
1199                 }
1200         }
1201         return 0;
1202 }
1203
1204 #endif
1205
/* Device operations for plain (ARPHRD_IPGRE) tunnel devices. */
static const struct net_device_ops ipgre_netdev_ops = {
	.ndo_init		= ipgre_tunnel_init,
	.ndo_uninit		= ipgre_tunnel_uninit,
#ifdef CONFIG_NET_IPGRE_BROADCAST
	.ndo_open		= ipgre_open,
	.ndo_stop		= ipgre_close,
#endif
	.ndo_start_xmit		= ipgre_tunnel_xmit,
	.ndo_do_ioctl		= ipgre_tunnel_ioctl,
	.ndo_change_mtu		= ipgre_tunnel_change_mtu,
};
1217
/* Base setup for "gre" devices; headroom and MTU here are defaults
 * that ipgre_tunnel_bind_dev() later refines from the real underlay.
 */
static void ipgre_tunnel_setup(struct net_device *dev)
{
	dev->netdev_ops		= &ipgre_netdev_ops;
	dev->destructor		= free_netdev;

	dev->type		= ARPHRD_IPGRE;
	dev->needed_headroom	= LL_MAX_HEADER + sizeof(struct iphdr) + 4;
	dev->mtu		= ETH_DATA_LEN - sizeof(struct iphdr) - 4;
	dev->flags		= IFF_NOARP;
	dev->iflink		= 0;
	dev->addr_len		= 4;	/* IPv4 endpoint as "hardware" address */
	dev->features		|= NETIF_F_NETNS_LOCAL;
	/* Keep skb dst across queueing (IFF_XMIT_DST_RELEASE cleared). */
	dev->priv_flags		&= ~IFF_XMIT_DST_RELEASE;
}
1232
/* ndo_init for "gre" devices: record identity, derive the 4-byte
 * link-layer addresses from the tunnel endpoints, and select
 * header_ops when address resolution is needed.
 */
static int ipgre_tunnel_init(struct net_device *dev)
{
	struct ip_tunnel *tunnel;
	struct iphdr *iph;

	tunnel = netdev_priv(dev);
	iph = &tunnel->parms.iph;

	tunnel->dev = dev;
	strcpy(tunnel->parms.name, dev->name);

	/* dev_addr/broadcast carry the IPv4 source/destination. */
	memcpy(dev->dev_addr, &tunnel->parms.iph.saddr, 4);
	memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4);

	if (iph->daddr) {
#ifdef CONFIG_NET_IPGRE_BROADCAST
		if (ipv4_is_multicast(iph->daddr)) {
			/* A multicast destination requires a fixed local
			 * address for the group join in ipgre_open(). */
			if (!iph->saddr)
				return -EINVAL;
			dev->flags = IFF_BROADCAST;
			dev->header_ops = &ipgre_header_ops;
		}
#endif
	} else
		dev->header_ops = &ipgre_header_ops;

	return 0;
}
1261
/* Set up the per-namespace fallback device "gre0" and register it in
 * the wildcard tunnel slot (tunnels_wc).
 */
static void ipgre_fb_tunnel_init(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct iphdr *iph = &tunnel->parms.iph;
	struct ipgre_net *ign = net_generic(dev_net(dev), ipgre_net_id);

	tunnel->dev = dev;
	strcpy(tunnel->parms.name, dev->name);

	iph->version		= 4;
	iph->protocol		= IPPROTO_GRE;
	iph->ihl		= 5;
	tunnel->hlen		= sizeof(struct iphdr) + 4;	/* no optional GRE fields */

	/* Extra reference keeps gre0 alive for the netns lifetime. */
	dev_hold(dev);
	ign->tunnels_wc[0]	= tunnel;
}
1279
1280
/* IPPROTO_GRE receive and ICMP-error handlers. */
static const struct net_protocol ipgre_protocol = {
	.handler	=	ipgre_rcv,
	.err_handler	=	ipgre_err,
	.netns_ok	=	1,
};
1286
1287 static void ipgre_destroy_tunnels(struct ipgre_net *ign)
1288 {
1289         int prio;
1290
1291         for (prio = 0; prio < 4; prio++) {
1292                 int h;
1293                 for (h = 0; h < HASH_SIZE; h++) {
1294                         struct ip_tunnel *t;
1295                         while ((t = ign->tunnels[prio][h]) != NULL)
1296                                 unregister_netdevice(t->dev);
1297                 }
1298         }
1299 }
1300
/* Per-namespace init: allocate the ipgre_net state and create the
 * fallback "gre0" device.  Unwinds partial state via gotos on error.
 */
static int ipgre_init_net(struct net *net)
{
	int err;
	struct ipgre_net *ign;

	err = -ENOMEM;
	ign = kzalloc(sizeof(struct ipgre_net), GFP_KERNEL);
	if (ign == NULL)
		goto err_alloc;

	err = net_assign_generic(net, ipgre_net_id, ign);
	if (err < 0)
		goto err_assign;

	ign->fb_tunnel_dev = alloc_netdev(sizeof(struct ip_tunnel), "gre0",
					   ipgre_tunnel_setup);
	if (!ign->fb_tunnel_dev) {
		err = -ENOMEM;
		goto err_alloc_dev;
	}
	dev_net_set(ign->fb_tunnel_dev, net);

	ipgre_fb_tunnel_init(ign->fb_tunnel_dev);
	ign->fb_tunnel_dev->rtnl_link_ops = &ipgre_link_ops;

	if ((err = register_netdev(ign->fb_tunnel_dev)))
		goto err_reg_dev;

	return 0;

err_reg_dev:
	free_netdev(ign->fb_tunnel_dev);
err_alloc_dev:
	/* nothing */
err_assign:
	kfree(ign);
err_alloc:
	return err;
}
1340
1341 static void ipgre_exit_net(struct net *net)
1342 {
1343         struct ipgre_net *ign;
1344
1345         ign = net_generic(net, ipgre_net_id);
1346         rtnl_lock();
1347         ipgre_destroy_tunnels(ign);
1348         rtnl_unlock();
1349         kfree(ign);
1350 }
1351
/* Per-network-namespace init/exit hooks. */
static struct pernet_operations ipgre_net_ops = {
	.init = ipgre_init_net,
	.exit = ipgre_exit_net,
};
1356
1357 static int ipgre_tunnel_validate(struct nlattr *tb[], struct nlattr *data[])
1358 {
1359         __be16 flags;
1360
1361         if (!data)
1362                 return 0;
1363
1364         flags = 0;
1365         if (data[IFLA_GRE_IFLAGS])
1366                 flags |= nla_get_be16(data[IFLA_GRE_IFLAGS]);
1367         if (data[IFLA_GRE_OFLAGS])
1368                 flags |= nla_get_be16(data[IFLA_GRE_OFLAGS]);
1369         if (flags & (GRE_VERSION|GRE_ROUTING))
1370                 return -EINVAL;
1371
1372         return 0;
1373 }
1374
1375 static int ipgre_tap_validate(struct nlattr *tb[], struct nlattr *data[])
1376 {
1377         __be32 daddr;
1378
1379         if (tb[IFLA_ADDRESS]) {
1380                 if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN)
1381                         return -EINVAL;
1382                 if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS])))
1383                         return -EADDRNOTAVAIL;
1384         }
1385
1386         if (!data)
1387                 goto out;
1388
1389         if (data[IFLA_GRE_REMOTE]) {
1390                 memcpy(&daddr, nla_data(data[IFLA_GRE_REMOTE]), 4);
1391                 if (!daddr)
1392                         return -EINVAL;
1393         }
1394
1395 out:
1396         return ipgre_tunnel_validate(tb, data);
1397 }
1398
/* Translate IFLA_GRE_* netlink attributes into an ip_tunnel_parm.
 * Absent attributes leave the zeroed defaults; path-MTU discovery
 * (DF bit) defaults to on unless explicitly disabled.
 */
static void ipgre_netlink_parms(struct nlattr *data[],
				struct ip_tunnel_parm *parms)
{
	memset(parms, 0, sizeof(*parms));

	parms->iph.protocol = IPPROTO_GRE;

	if (!data)
		return;

	if (data[IFLA_GRE_LINK])
		parms->link = nla_get_u32(data[IFLA_GRE_LINK]);

	if (data[IFLA_GRE_IFLAGS])
		parms->i_flags = nla_get_be16(data[IFLA_GRE_IFLAGS]);

	if (data[IFLA_GRE_OFLAGS])
		parms->o_flags = nla_get_be16(data[IFLA_GRE_OFLAGS]);

	if (data[IFLA_GRE_IKEY])
		parms->i_key = nla_get_be32(data[IFLA_GRE_IKEY]);

	if (data[IFLA_GRE_OKEY])
		parms->o_key = nla_get_be32(data[IFLA_GRE_OKEY]);

	if (data[IFLA_GRE_LOCAL])
		parms->iph.saddr = nla_get_be32(data[IFLA_GRE_LOCAL]);

	if (data[IFLA_GRE_REMOTE])
		parms->iph.daddr = nla_get_be32(data[IFLA_GRE_REMOTE]);

	if (data[IFLA_GRE_TTL])
		parms->iph.ttl = nla_get_u8(data[IFLA_GRE_TTL]);

	if (data[IFLA_GRE_TOS])
		parms->iph.tos = nla_get_u8(data[IFLA_GRE_TOS]);

	if (!data[IFLA_GRE_PMTUDISC] || nla_get_u8(data[IFLA_GRE_PMTUDISC]))
		parms->iph.frag_off = htons(IP_DF);
}
1439
1440 static int ipgre_tap_init(struct net_device *dev)
1441 {
1442         struct ip_tunnel *tunnel;
1443
1444         tunnel = netdev_priv(dev);
1445
1446         tunnel->dev = dev;
1447         strcpy(tunnel->parms.name, dev->name);
1448
1449         ipgre_tunnel_bind_dev(dev);
1450
1451         return 0;
1452 }
1453
/* Device operations for Ethernet-over-GRE ("gretap") devices. */
static const struct net_device_ops ipgre_tap_netdev_ops = {
	.ndo_init		= ipgre_tap_init,
	.ndo_uninit		= ipgre_tunnel_uninit,
	.ndo_start_xmit		= ipgre_tunnel_xmit,
	.ndo_set_mac_address	= eth_mac_addr,
	.ndo_validate_addr	= eth_validate_addr,
	.ndo_change_mtu		= ipgre_tunnel_change_mtu,
};
1462
1463 static void ipgre_tap_setup(struct net_device *dev)
1464 {
1465
1466         ether_setup(dev);
1467
1468         dev->netdev_ops         = &ipgre_netdev_ops;
1469         dev->destructor         = free_netdev;
1470
1471         dev->iflink             = 0;
1472         dev->features           |= NETIF_F_NETNS_LOCAL;
1473 }
1474
/* rtnl newlink handler shared by "gre" and "gretap". */
static int ipgre_newlink(struct net_device *dev, struct nlattr *tb[],
			 struct nlattr *data[])
{
	struct ip_tunnel *nt;
	struct net *net = dev_net(dev);
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);
	int mtu;
	int err;

	nt = netdev_priv(dev);
	ipgre_netlink_parms(data, &nt->parms);

	/* Refuse a second tunnel with identical parameters. */
	if (ipgre_tunnel_find(net, &nt->parms, dev->type))
		return -EEXIST;

	if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
		random_ether_addr(dev->dev_addr);

	mtu = ipgre_tunnel_bind_dev(dev);
	/* An explicit IFLA_MTU from userspace wins over the computed one. */
	if (!tb[IFLA_MTU])
		dev->mtu = mtu;

	err = register_netdevice(dev);
	if (err)
		goto out;

	/* Reference held while the tunnel sits in the hash table. */
	dev_hold(dev);
	ipgre_tunnel_link(ign, nt);

out:
	return err;
}
1507
/* rtnl changelink handler: update an existing tunnel's parameters,
 * re-hashing it when the addresses or input key change.  The fallback
 * device cannot be reconfigured this way.
 */
static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[],
			    struct nlattr *data[])
{
	struct ip_tunnel *t, *nt;
	struct net *net = dev_net(dev);
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);
	struct ip_tunnel_parm p;
	int mtu;

	if (dev == ign->fb_tunnel_dev)
		return -EINVAL;

	nt = netdev_priv(dev);
	ipgre_netlink_parms(data, &p);

	t = ipgre_tunnel_locate(net, &p, 0);

	if (t) {
		/* Parameters already in use by a different device. */
		if (t->dev != dev)
			return -EEXIST;
	} else {
		unsigned nflags = 0;

		t = nt;

		if (ipv4_is_multicast(p.iph.daddr))
			nflags = IFF_BROADCAST;
		else if (p.iph.daddr)
			nflags = IFF_POINTOPOINT;

		/* Switching a live device between p2p and broadcast
		 * mode is not allowed. */
		if ((dev->flags ^ nflags) &
		    (IFF_POINTOPOINT | IFF_BROADCAST))
			return -EINVAL;

		/* Re-hash under the new addresses/input key. */
		ipgre_tunnel_unlink(ign, t);
		t->parms.iph.saddr = p.iph.saddr;
		t->parms.iph.daddr = p.iph.daddr;
		t->parms.i_key = p.i_key;
		memcpy(dev->dev_addr, &p.iph.saddr, 4);
		memcpy(dev->broadcast, &p.iph.daddr, 4);
		ipgre_tunnel_link(ign, t);
		netdev_state_change(dev);
	}

	/* These fields do not affect the hash chain placement. */
	t->parms.o_key = p.o_key;
	t->parms.iph.ttl = p.iph.ttl;
	t->parms.iph.tos = p.iph.tos;
	t->parms.iph.frag_off = p.iph.frag_off;

	if (t->parms.link != p.link) {
		t->parms.link = p.link;
		mtu = ipgre_tunnel_bind_dev(dev);
		if (!tb[IFLA_MTU])
			dev->mtu = mtu;
		netdev_state_change(dev);
	}

	return 0;
}
1567
1568 static size_t ipgre_get_size(const struct net_device *dev)
1569 {
1570         return
1571                 /* IFLA_GRE_LINK */
1572                 nla_total_size(4) +
1573                 /* IFLA_GRE_IFLAGS */
1574                 nla_total_size(2) +
1575                 /* IFLA_GRE_OFLAGS */
1576                 nla_total_size(2) +
1577                 /* IFLA_GRE_IKEY */
1578                 nla_total_size(4) +
1579                 /* IFLA_GRE_OKEY */
1580                 nla_total_size(4) +
1581                 /* IFLA_GRE_LOCAL */
1582                 nla_total_size(4) +
1583                 /* IFLA_GRE_REMOTE */
1584                 nla_total_size(4) +
1585                 /* IFLA_GRE_TTL */
1586                 nla_total_size(1) +
1587                 /* IFLA_GRE_TOS */
1588                 nla_total_size(1) +
1589                 /* IFLA_GRE_PMTUDISC */
1590                 nla_total_size(1) +
1591                 0;
1592 }
1593
/* Dump the tunnel parameters as IFLA_GRE_* attributes.  The NLA_PUT
 * macros jump to nla_put_failure when the skb runs out of space.
 */
static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev)
{
	struct ip_tunnel *t = netdev_priv(dev);
	struct ip_tunnel_parm *p = &t->parms;

	NLA_PUT_U32(skb, IFLA_GRE_LINK, p->link);
	NLA_PUT_BE16(skb, IFLA_GRE_IFLAGS, p->i_flags);
	NLA_PUT_BE16(skb, IFLA_GRE_OFLAGS, p->o_flags);
	NLA_PUT_BE32(skb, IFLA_GRE_IKEY, p->i_key);
	NLA_PUT_BE32(skb, IFLA_GRE_OKEY, p->o_key);
	NLA_PUT_BE32(skb, IFLA_GRE_LOCAL, p->iph.saddr);
	NLA_PUT_BE32(skb, IFLA_GRE_REMOTE, p->iph.daddr);
	NLA_PUT_U8(skb, IFLA_GRE_TTL, p->iph.ttl);
	NLA_PUT_U8(skb, IFLA_GRE_TOS, p->iph.tos);
	/* PMTUDISC is reported as a boolean derived from the DF bit. */
	NLA_PUT_U8(skb, IFLA_GRE_PMTUDISC, !!(p->iph.frag_off & htons(IP_DF)));

	return 0;

nla_put_failure:
	return -EMSGSIZE;
}
1615
/* Validation policy for the IFLA_GRE_* attributes. */
static const struct nla_policy ipgre_policy[IFLA_GRE_MAX + 1] = {
	[IFLA_GRE_LINK]		= { .type = NLA_U32 },
	[IFLA_GRE_IFLAGS]	= { .type = NLA_U16 },
	[IFLA_GRE_OFLAGS]	= { .type = NLA_U16 },
	[IFLA_GRE_IKEY]		= { .type = NLA_U32 },
	[IFLA_GRE_OKEY]		= { .type = NLA_U32 },
	[IFLA_GRE_LOCAL]	= { .len = FIELD_SIZEOF(struct iphdr, saddr) },
	[IFLA_GRE_REMOTE]	= { .len = FIELD_SIZEOF(struct iphdr, daddr) },
	[IFLA_GRE_TTL]		= { .type = NLA_U8 },
	[IFLA_GRE_TOS]		= { .type = NLA_U8 },
	[IFLA_GRE_PMTUDISC]	= { .type = NLA_U8 },
};
1628
/* rtnl_link registration for plain "gre" devices. */
static struct rtnl_link_ops ipgre_link_ops __read_mostly = {
	.kind		= "gre",
	.maxtype	= IFLA_GRE_MAX,
	.policy		= ipgre_policy,
	.priv_size	= sizeof(struct ip_tunnel),
	.setup		= ipgre_tunnel_setup,
	.validate	= ipgre_tunnel_validate,
	.newlink	= ipgre_newlink,
	.changelink	= ipgre_changelink,
	.get_size	= ipgre_get_size,
	.fill_info	= ipgre_fill_info,
};
1641
/* rtnl_link registration for Ethernet-over-GRE "gretap" devices. */
static struct rtnl_link_ops ipgre_tap_ops __read_mostly = {
	.kind		= "gretap",
	.maxtype	= IFLA_GRE_MAX,
	.policy		= ipgre_policy,
	.priv_size	= sizeof(struct ip_tunnel),
	.setup		= ipgre_tap_setup,
	.validate	= ipgre_tap_validate,
	.newlink	= ipgre_newlink,
	.changelink	= ipgre_changelink,
	.get_size	= ipgre_get_size,
	.fill_info	= ipgre_fill_info,
};
1654
1655 /*
1656  *      And now the modules code and kernel interface.
1657  */
1658
1659 static int __init ipgre_init(void)
1660 {
1661         int err;
1662
1663         printk(KERN_INFO "GRE over IPv4 tunneling driver\n");
1664
1665         if (inet_add_protocol(&ipgre_protocol, IPPROTO_GRE) < 0) {
1666                 printk(KERN_INFO "ipgre init: can't add protocol\n");
1667                 return -EAGAIN;
1668         }
1669
1670         err = register_pernet_gen_device(&ipgre_net_id, &ipgre_net_ops);
1671         if (err < 0)
1672                 goto gen_device_failed;
1673
1674         err = rtnl_link_register(&ipgre_link_ops);
1675         if (err < 0)
1676                 goto rtnl_link_failed;
1677
1678         err = rtnl_link_register(&ipgre_tap_ops);
1679         if (err < 0)
1680                 goto tap_ops_failed;
1681
1682 out:
1683         return err;
1684
1685 tap_ops_failed:
1686         rtnl_link_unregister(&ipgre_link_ops);
1687 rtnl_link_failed:
1688         unregister_pernet_gen_device(ipgre_net_id, &ipgre_net_ops);
1689 gen_device_failed:
1690         inet_del_protocol(&ipgre_protocol, IPPROTO_GRE);
1691         goto out;
1692 }
1693
1694 static void __exit ipgre_fini(void)
1695 {
1696         rtnl_link_unregister(&ipgre_tap_ops);
1697         rtnl_link_unregister(&ipgre_link_ops);
1698         unregister_pernet_gen_device(ipgre_net_id, &ipgre_net_ops);
1699         if (inet_del_protocol(&ipgre_protocol, IPPROTO_GRE) < 0)
1700                 printk(KERN_INFO "ipgre close: can't remove protocol\n");
1701 }
1702
module_init(ipgre_init);
module_exit(ipgre_fini);
MODULE_LICENSE("GPL");
/* Let "ip link add ... type gre|gretap" autoload this module. */
MODULE_ALIAS_RTNL_LINK("gre");
MODULE_ALIAS_RTNL_LINK("gretap");