net/core/dev.c

   1 /*
   2  *      NET3    Protocol independent device support routines.
   3  *
   4  *              This program is free software; you can redistribute it and/or
   5  *              modify it under the terms of the GNU General Public License
   6  *              as published by the Free Software Foundation; either version
   7  *              2 of the License, or (at your option) any later version.
   8  *
   9  *      Derived from the non IP parts of dev.c 1.0.19
  10  *              Authors:        Ross Biro
  11  *                              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12  *                              Mark Evans, <evansmp@uhura.aston.ac.uk>
  13  *
  14  *      Additional Authors:
  15  *              Florian la Roche <rzsfl@rz.uni-sb.de>
  16  *              Alan Cox <gw4pts@gw4pts.ampr.org>
  17  *              David Hinds <dahinds@users.sourceforge.net>
  18  *              Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
  19  *              Adam Sulmicki <adam@cfar.umd.edu>
  20  *              Pekka Riikonen <priikone@poesidon.pspt.fi>
  21  *
  22  *      Changes:
  23  *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
  24  *                                      to 2 if register_netdev gets called
  25  *                                      before net_dev_init & also removed a
  26  *                                      few lines of code in the process.
  27  *              Alan Cox        :       device private ioctl copies fields back.
  28  *              Alan Cox        :       Transmit queue code does relevant
  29  *                                      stunts to keep the queue safe.
  30  *              Alan Cox        :       Fixed double lock.
  31  *              Alan Cox        :       Fixed promisc NULL pointer trap
  32  *              ????????        :       Support the full private ioctl range
  33  *              Alan Cox        :       Moved ioctl permission check into
  34  *                                      drivers
  35  *              Tim Kordas      :       SIOCADDMULTI/SIOCDELMULTI
  36  *              Alan Cox        :       100 backlog just doesn't cut it when
  37  *                                      you start doing multicast video 8)
  38  *              Alan Cox        :       Rewrote net_bh and list manager.
  39  *              Alan Cox        :       Fix ETH_P_ALL echoback lengths.
  40  *              Alan Cox        :       Took out transmit every packet pass
  41  *                                      Saved a few bytes in the ioctl handler
  42  *              Alan Cox        :       Network driver sets packet type before
  43  *                                      calling netif_rx. Saves a function
  44  *                                      call a packet.
  45  *              Alan Cox        :       Hashed net_bh()
  46  *              Richard Kooijman:       Timestamp fixes.
  47  *              Alan Cox        :       Wrong field in SIOCGIFDSTADDR
  48  *              Alan Cox        :       Device lock protection.
  49  *              Alan Cox        :       Fixed nasty side effect of device close
  50  *                                      changes.
  51  *              Rudi Cilibrasi  :       Pass the right thing to
  52  *                                      set_mac_address()
  53  *              Dave Miller     :       32bit quantity for the device lock to
  54  *                                      make it work out on a Sparc.
  55  *              Bjorn Ekwall    :       Added KERNELD hack.
  56  *              Alan Cox        :       Cleaned up the backlog initialise.
  57  *              Craig Metz      :       SIOCGIFCONF fix if space for under
  58  *                                      1 device.
  59  *          Thomas Bogendoerfer :       Return ENODEV for dev_open, if there
  60  *                                      is no device open function.
  61  *              Andi Kleen      :       Fix error reporting for SIOCGIFCONF
  62  *          Michael Chastain    :       Fix signed/unsigned for SIOCGIFCONF
  63  *              Cyrus Durgin    :       Cleaned for KMOD
  64  *              Adam Sulmicki   :       Bug Fix : Network Device Unload
  65  *                                      A network device unload needs to purge
  66  *                                      the backlog queue.
  67  *      Paul Rusty Russell      :       SIOCSIFNAME
  68  *              Pekka Riikonen  :       Netdev boot-time settings code
  69  *              Andrew Morton   :       Make unregister_netdevice wait
  70  *                                      indefinitely on dev->refcnt
  71  *              J Hadi Salim    :       - Backlog queue sampling
  72  *                                      - netif_rx() feedback
  73  */
  74
  75 #include <asm/uaccess.h>
  76 #include <asm/system.h>
  77 #include <linux/bitops.h>
  78 #include <linux/capability.h>
  79 #include <linux/cpu.h>
  80 #include <linux/types.h>
  81 #include <linux/kernel.h>
  82 #include <linux/hash.h>
  83 #include <linux/slab.h>
  84 #include <linux/sched.h>
  85 #include <linux/mutex.h>
  86 #include <linux/string.h>
  87 #include <linux/mm.h>
  88 #include <linux/socket.h>
  89 #include <linux/sockios.h>
  90 #include <linux/errno.h>
  91 #include <linux/interrupt.h>
  92 #include <linux/if_ether.h>
  93 #include <linux/netdevice.h>
  94 #include <linux/etherdevice.h>
  95 #include <linux/ethtool.h>
  96 #include <linux/notifier.h>
  97 #include <linux/skbuff.h>
  98 #include <net/net_namespace.h>
  99 #include <net/sock.h>
 100 #include <linux/rtnetlink.h>
 101 #include <linux/proc_fs.h>
 102 #include <linux/seq_file.h>
 103 #include <linux/stat.h>
 104 #include <net/dst.h>
 105 #include <net/pkt_sched.h>
 106 #include <net/checksum.h>
 107 #include <net/xfrm.h>
 108 #include <linux/highmem.h>
 109 #include <linux/init.h>
 110 #include <linux/kmod.h>
 111 #include <linux/module.h>
 112 #include <linux/netpoll.h>
 113 #include <linux/rcupdate.h>
 114 #include <linux/delay.h>
 115 #include <net/wext.h>
 116 #include <net/iw_handler.h>
 117 #include <asm/current.h>
 118 #include <linux/audit.h>
 119 #include <linux/dmaengine.h>
 120 #include <linux/err.h>
 121 #include <linux/ctype.h>
 122 #include <linux/if_arp.h>
 123 #include <linux/if_vlan.h>
 124 #include <linux/ip.h>
 125 #include <net/ip.h>
 126 #include <linux/ipv6.h>
 127 #include <linux/in.h>
 128 #include <linux/jhash.h>
 129 #include <linux/random.h>
 130 #include <trace/events/napi.h>
 131 #include <trace/events/net.h>
 132 #include <trace/events/skb.h>
 133 #include <linux/pci.h>
 134 #include <linux/inetdevice.h>
 135 #include <linux/cpu_rmap.h>
 136 #include <linux/if_tunnel.h>
 137 #include <linux/if_pppox.h>
 138 #include <linux/ppp_defs.h>
 139 #include <linux/net_tstamp.h>
 140 #include <linux/jump_label.h>
 141
 142 #include "net-sysfs.h"
 143
 144 /* Instead of increasing this, you should create a hash table. */
 145 #define MAX_GRO_SKBS 8
 146
 147 /* This should be increased if a protocol with a bigger head is added. */
 148 #define GRO_MAX_HEAD (MAX_HEADER + 128)
 149
 150 /*
 151  *      The list of packet types we will receive (as opposed to discard)
 152  *      and the routines to invoke.
 153  *
 154  *      Why 16. Because with 16 the only overlap we get on a hash of the
 155  *      low nibble of the protocol value is RARP/SNAP/X.25.
 156  *
 157  *      NOTE:  That is no longer true with the addition of VLAN tags.  Not
 158  *             sure which should go first, but I bet it won't make much
 159  *             difference if we are running VLANs.  The good news is that
 160  *             this protocol won't be in the list unless compiled in, so
 161  *             the average user (w/out VLANs) will not be adversely affected.
 162  *             --BLG
 163  *
 164  *              0800    IP
 165  *              8100    802.1Q VLAN
 166  *              0001    802.3
 167  *              0002    AX.25
 168  *              0004    802.2
 169  *              8035    RARP
 170  *              0005    SNAP
 171  *              0805    X.25
 172  *              0806    ARP
 173  *              8137    IPX
 174  *              0009    Localtalk
 175  *              86DD    IPv6
 176  */
 177
 178 #define PTYPE_HASH_SIZE (16)
 179 #define PTYPE_HASH_MASK (PTYPE_HASH_SIZE - 1)
 180
 181 static DEFINE_SPINLOCK(ptype_lock);
 182 static struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
 183 static struct list_head ptype_all __read_mostly;        /* Taps */
 184
 185 /*
 186  * The @dev_base_head list is protected by @dev_base_lock and the rtnl
 187  * semaphore.
 188  *
 189  * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
 190  *
 191  * Writers must hold the rtnl semaphore while they loop through the
 192  * dev_base_head list, and hold dev_base_lock for writing when they do the
 193  * actual updates.  This allows pure readers to access the list even
 194  * while a writer is preparing to update it.
 195  *
 196  * To put it another way, dev_base_lock is held for writing only to
 197  * protect against pure readers; the rtnl semaphore provides the
 198  * protection against other writers.
 199  *
 200  * See, for example usages, register_netdevice() and
 201  * unregister_netdevice(), which must be called with the rtnl
 202  * semaphore held.
 203  */
 204 DEFINE_RWLOCK(dev_base_lock);
 205 EXPORT_SYMBOL(dev_base_lock);
 206
 207 static inline void dev_base_seq_inc(struct net *net)
 208 {
 209         while (++net->dev_base_seq == 0);
 210 }
 211
 212 static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
 213 {
 214         unsigned hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
 215         return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
 216 }
 217
 218 static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
 219 {
 220         return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
 221 }
 222
 223 static inline void rps_lock(struct softnet_data *sd)
 224 {
 225 #ifdef CONFIG_RPS
 226         spin_lock(&sd->input_pkt_queue.lock);
 227 #endif
 228 }
 229
 230 static inline void rps_unlock(struct softnet_data *sd)
 231 {
 232 #ifdef CONFIG_RPS
 233         spin_unlock(&sd->input_pkt_queue.lock);
 234 #endif
 235 }
 236
 237 /* Device list insertion */
 238 static int list_netdevice(struct net_device *dev)
 239 {
 240         struct net *net = dev_net(dev);
 241
 242         ASSERT_RTNL();
 243
 244         write_lock_bh(&dev_base_lock);
 245         list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
 246         hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
 247         hlist_add_head_rcu(&dev->index_hlist,
 248                            dev_index_hash(net, dev->ifindex));
 249         write_unlock_bh(&dev_base_lock);
 250
 251         dev_base_seq_inc(net);
 252
 253         return 0;
 254 }
 255
 256 /* Device list removal
 257  * caller must respect a RCU grace period before freeing/reusing dev
 258  */
 259 static void unlist_netdevice(struct net_device *dev)
 260 {
 261         ASSERT_RTNL();
 262
 263         /* Unlink dev from the device chain */
 264         write_lock_bh(&dev_base_lock);
 265         list_del_rcu(&dev->dev_list);
 266         hlist_del_rcu(&dev->name_hlist);
 267         hlist_del_rcu(&dev->index_hlist);
 268         write_unlock_bh(&dev_base_lock);
 269
 270         dev_base_seq_inc(dev_net(dev));
 271 }
 272
 273 /*
 274  *      Our notifier list
 275  */
 276
 277 static RAW_NOTIFIER_HEAD(netdev_chain);
 278
 279 /*
 280  *      Device drivers call our routines to queue packets here. We empty the
 281  *      queue in the local softnet handler.
 282  */
 283
 284 DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
 285 EXPORT_PER_CPU_SYMBOL(softnet_data);
 286
 287 #ifdef CONFIG_LOCKDEP
 288 /*
 289  * register_netdevice() inits txq->_xmit_lock and sets lockdep class
 290  * according to dev->type
 291  */
 292 static const unsigned short netdev_lock_type[] =
 293         {ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
 294          ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
 295          ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
 296          ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
 297          ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
 298          ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
 299          ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
 300          ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
 301          ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
 302          ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
 303          ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
 304          ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
 305          ARPHRD_FCFABRIC, ARPHRD_IEEE802_TR, ARPHRD_IEEE80211,
 306          ARPHRD_IEEE80211_PRISM, ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET,
 307          ARPHRD_PHONET_PIPE, ARPHRD_IEEE802154,
 308          ARPHRD_VOID, ARPHRD_NONE};
 309
 310 static const char *const netdev_lock_name[] =
 311         {"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
 312          "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
 313          "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
 314          "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
 315          "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
 316          "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
 317          "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
 318          "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
 319          "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
 320          "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
 321          "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
 322          "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
 323          "_xmit_FCFABRIC", "_xmit_IEEE802_TR", "_xmit_IEEE80211",
 324          "_xmit_IEEE80211_PRISM", "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET",
 325          "_xmit_PHONET_PIPE", "_xmit_IEEE802154",
 326          "_xmit_VOID", "_xmit_NONE"};
 327
 328 static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
 329 static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
 330
 331 static inline unsigned short netdev_lock_pos(unsigned short dev_type)
 332 {
 333         int i;
 334
 335         for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
 336                 if (netdev_lock_type[i] == dev_type)
 337                         return i;
 338         /* the last key is used by default */
 339         return ARRAY_SIZE(netdev_lock_type) - 1;
 340 }
 341
 342 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 343                                                  unsigned short dev_type)
 344 {
 345         int i;
 346
 347         i = netdev_lock_pos(dev_type);
 348         lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
 349                                    netdev_lock_name[i]);
 350 }
 351
 352 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 353 {
 354         int i;
 355
 356         i = netdev_lock_pos(dev->type);
 357         lockdep_set_class_and_name(&dev->addr_list_lock,
 358                                    &netdev_addr_lock_key[i],
 359                                    netdev_lock_name[i]);
 360 }
 361 #else
 362 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 363                                                  unsigned short dev_type)
 364 {
 365 }
 366 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 367 {
 368 }
 369 #endif
 370
 371 /*******************************************************************************
 372
 373                 Protocol management and registration routines
 374
 375 *******************************************************************************/
 376
 377 /*
 378  *      Add a protocol ID to the list. Now that the input handler is
 379  *      smarter we can dispense with all the messy stuff that used to be
 380  *      here.
 381  *
 382  *      BEWARE!!! Protocol handlers, mangling input packets,
 383  *      MUST BE last in hash buckets and checking protocol handlers
 384  *      MUST start from promiscuous ptype_all chain in net_bh.
 385  *      It is true now, do not change it.
 386  *      Explanation follows: if protocol handler, mangling packet, will
 387  *      be the first on list, it is not able to sense, that packet
 388  *      is cloned and should be copied-on-write, so that it will
 389  *      change it and subsequent readers will get broken packet.
 390  *                                                      --ANK (980803)
 391  */
 392
 393 static inline struct list_head *ptype_head(const struct packet_type *pt)
 394 {
 395         if (pt->type == htons(ETH_P_ALL))
 396                 return &ptype_all;
 397         else
 398                 return &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
 399 }
 400
 401 /**
 402  *      dev_add_pack - add packet handler
 403  *      @pt: packet type declaration
 404  *
 405  *      Add a protocol handler to the networking stack. The passed &packet_type
 406  *      is linked into kernel lists and may not be freed until it has been
 407  *      removed from the kernel lists.
 408  *
 409  *      This call does not sleep therefore it can not
 410  *      guarantee all CPU's that are in middle of receiving packets
 411  *      will see the new packet type (until the next received packet).
 412  */
 413
 414 void dev_add_pack(struct packet_type *pt)
 415 {
 416         struct list_head *head = ptype_head(pt);
 417
 418         spin_lock(&ptype_lock);
 419         list_add_rcu(&pt->list, head);
 420         spin_unlock(&ptype_lock);
 421 }
 422 EXPORT_SYMBOL(dev_add_pack);
 423
 424 /**
 425  *      __dev_remove_pack        - remove packet handler
 426  *      @pt: packet type declaration
 427  *
 428  *      Remove a protocol handler that was previously added to the kernel
 429  *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
 430  *      from the kernel lists and can be freed or reused once this function
 431  *      returns.
 432  *
 433  *      The packet type might still be in use by receivers
 434  *      and must not be freed until after all the CPU's have gone
 435  *      through a quiescent state.
 436  */
 437 void __dev_remove_pack(struct packet_type *pt)
 438 {
 439         struct list_head *head = ptype_head(pt);
 440         struct packet_type *pt1;
 441
 442         spin_lock(&ptype_lock);
 443
 444         list_for_each_entry(pt1, head, list) {
 445                 if (pt == pt1) {
 446                         list_del_rcu(&pt->list);
 447                         goto out;
 448                 }
 449         }
 450
 451         printk(KERN_WARNING "dev_remove_pack: %p not found.\n", pt);
 452 out:
 453         spin_unlock(&ptype_lock);
 454 }
 455 EXPORT_SYMBOL(__dev_remove_pack);
 456
 457 /**
 458  *      dev_remove_pack  - remove packet handler
 459  *      @pt: packet type declaration
 460  *
 461  *      Remove a protocol handler that was previously added to the kernel
 462  *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
 463  *      from the kernel lists and can be freed or reused once this function
 464  *      returns.
 465  *
 466  *      This call sleeps to guarantee that no CPU is looking at the packet
 467  *      type after return.
 468  */
 469 void dev_remove_pack(struct packet_type *pt)
 470 {
 471         __dev_remove_pack(pt);
 472
 473         synchronize_net();
 474 }
 475 EXPORT_SYMBOL(dev_remove_pack);
 476
 477 /******************************************************************************
 478
 479                       Device Boot-time Settings Routines
 480
 481 *******************************************************************************/
 482
 483 /* Boot time configuration table */
 484 static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
 485
 486 /**
 487  *      netdev_boot_setup_add   - add new setup entry
 488  *      @name: name of the device
 489  *      @map: configured settings for the device
 490  *
 491  *      Adds new setup entry to the dev_boot_setup list.  The function
 492  *      returns 0 on error and 1 on success.  This is a generic routine to
 493  *      all netdevices.
 494  */
 495 static int netdev_boot_setup_add(char *name, struct ifmap *map)
 496 {
 497         struct netdev_boot_setup *s;
 498         int i;
 499
 500         s = dev_boot_setup;
 501         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 502                 if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
 503                         memset(s[i].name, 0, sizeof(s[i].name));
 504                         strlcpy(s[i].name, name, IFNAMSIZ);
 505                         memcpy(&s[i].map, map, sizeof(s[i].map));
 506                         break;
 507                 }
 508         }
 509
 510         return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
 511 }
 512
 513 /**
 514  *      netdev_boot_setup_check - check boot time settings
 515  *      @dev: the netdevice
 516  *
 517  *      Check boot time settings for the device.
 518  *      The found settings are set for the device to be used
 519  *      later in the device probing.
 520  *      Returns 0 if no settings found, 1 if they are.
 521  */
 522 int netdev_boot_setup_check(struct net_device *dev)
 523 {
 524         struct netdev_boot_setup *s = dev_boot_setup;
 525         int i;
 526
 527         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 528                 if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
 529                     !strcmp(dev->name, s[i].name)) {
 530                         dev->irq        = s[i].map.irq;
 531                         dev->base_addr  = s[i].map.base_addr;
 532                         dev->mem_start  = s[i].map.mem_start;
 533                         dev->mem_end    = s[i].map.mem_end;
 534                         return 1;
 535                 }
 536         }
 537         return 0;
 538 }
 539 EXPORT_SYMBOL(netdev_boot_setup_check);
 540
 541
 542 /**
 543  *      netdev_boot_base        - get address from boot time settings
 544  *      @prefix: prefix for network device
 545  *      @unit: id for network device
 546  *
 547  *      Check boot time settings for the base address of device.
 548  *      The found settings are set for the device to be used
 549  *      later in the device probing.
 550  *      Returns 0 if no settings found.
 551  */
 552 unsigned long netdev_boot_base(const char *prefix, int unit)
 553 {
 554         const struct netdev_boot_setup *s = dev_boot_setup;
 555         char name[IFNAMSIZ];
 556         int i;
 557
 558         sprintf(name, "%s%d", prefix, unit);
 559
 560         /*
 561          * If device already registered then return base of 1
 562          * to indicate not to probe for this interface
 563          */
 564         if (__dev_get_by_name(&init_net, name))
 565                 return 1;
 566
 567         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
 568                 if (!strcmp(name, s[i].name))
 569                         return s[i].map.base_addr;
 570         return 0;
 571 }
 572
 573 /*
 574  * Saves at boot time configured settings for any netdevice.
 575  */
 576 int __init netdev_boot_setup(char *str)
 577 {
 578         int ints[5];
 579         struct ifmap map;
 580
 581         str = get_options(str, ARRAY_SIZE(ints), ints);
 582         if (!str || !*str)
 583                 return 0;
 584
 585         /* Save settings */
 586         memset(&map, 0, sizeof(map));
 587         if (ints[0] > 0)
 588                 map.irq = ints[1];
 589         if (ints[0] > 1)
 590                 map.base_addr = ints[2];
 591         if (ints[0] > 2)
 592                 map.mem_start = ints[3];
 593         if (ints[0] > 3)
 594                 map.mem_end = ints[4];
 595
 596         /* Add new entry to the list */
 597         return netdev_boot_setup_add(str, &map);
 598 }
 599
 600 __setup("netdev=", netdev_boot_setup);
 601
 602 /*******************************************************************************
 603
 604                             Device Interface Subroutines
 605
 606 *******************************************************************************/
 607
 608 /**
 609  *      __dev_get_by_name       - find a device by its name
 610  *      @net: the applicable net namespace
 611  *      @name: name to find
 612  *
 613  *      Find an interface by name. Must be called under RTNL semaphore
 614  *      or @dev_base_lock. If the name is found a pointer to the device
 615  *      is returned. If the name is not found then %NULL is returned. The
 616  *      reference counters are not incremented so the caller must be
 617  *      careful with locks.
 618  */
 619
 620 struct net_device *__dev_get_by_name(struct net *net, const char *name)
 621 {
 622         struct hlist_node *p;
 623         struct net_device *dev;
 624         struct hlist_head *head = dev_name_hash(net, name);
 625
 626         hlist_for_each_entry(dev, p, head, name_hlist)
 627                 if (!strncmp(dev->name, name, IFNAMSIZ))
 628                         return dev;
 629
 630         return NULL;
 631 }
 632 EXPORT_SYMBOL(__dev_get_by_name);
 633
 634 /**
 635  *      dev_get_by_name_rcu     - find a device by its name
 636  *      @net: the applicable net namespace
 637  *      @name: name to find
 638  *
 639  *      Find an interface by name.
 640  *      If the name is found a pointer to the device is returned.
 641  *      If the name is not found then %NULL is returned.
 642  *      The reference counters are not incremented so the caller must be
 643  *      careful with locks. The caller must hold RCU lock.
 644  */
 645
 646 struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
 647 {
 648         struct hlist_node *p;
 649         struct net_device *dev;
 650         struct hlist_head *head = dev_name_hash(net, name);
 651
 652         hlist_for_each_entry_rcu(dev, p, head, name_hlist)
 653                 if (!strncmp(dev->name, name, IFNAMSIZ))
 654                         return dev;
 655
 656         return NULL;
 657 }
 658 EXPORT_SYMBOL(dev_get_by_name_rcu);
 659
 660 /**
 661  *      dev_get_by_name         - find a device by its name
 662  *      @net: the applicable net namespace
 663  *      @name: name to find
 664  *
 665  *      Find an interface by name. This can be called from any
 666  *      context and does its own locking. The returned handle has
 667  *      the usage count incremented and the caller must use dev_put() to
 668  *      release it when it is no longer needed. %NULL is returned if no
 669  *      matching device is found.
 670  */
 671
 672 struct net_device *dev_get_by_name(struct net *net, const char *name)
 673 {
 674         struct net_device *dev;
 675
 676         rcu_read_lock();
 677         dev = dev_get_by_name_rcu(net, name);
 678         if (dev)
 679                 dev_hold(dev);
 680         rcu_read_unlock();
 681         return dev;
 682 }
 683 EXPORT_SYMBOL(dev_get_by_name);
 684
 685 /**
 686  *      __dev_get_by_index - find a device by its ifindex
 687  *      @net: the applicable net namespace
 688  *      @ifindex: index of device
 689  *
 690  *      Search for an interface by index. Returns %NULL if the device
 691  *      is not found or a pointer to the device. The device has not
 692  *      had its reference counter increased so the caller must be careful
 693  *      about locking. The caller must hold either the RTNL semaphore
 694  *      or @dev_base_lock.
 695  */
 696
 697 struct net_device *__dev_get_by_index(struct net *net, int ifindex)
 698 {
 699         struct hlist_node *p;
 700         struct net_device *dev;
 701         struct hlist_head *head = dev_index_hash(net, ifindex);
 702
 703         hlist_for_each_entry(dev, p, head, index_hlist)
 704                 if (dev->ifindex == ifindex)
 705                         return dev;
 706
 707         return NULL;
 708 }
 709 EXPORT_SYMBOL(__dev_get_by_index);
 710
 711 /**
 712  *      dev_get_by_index_rcu - find a device by its ifindex
 713  *      @net: the applicable net namespace
 714  *      @ifindex: index of device
 715  *
 716  *      Search for an interface by index. Returns %NULL if the device
 717  *      is not found or a pointer to the device. The device has not
 718  *      had its reference counter increased so the caller must be careful
 719  *      about locking. The caller must hold RCU lock.
 720  */
 721
 722 struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
 723 {
 724         struct hlist_node *p;
 725         struct net_device *dev;
 726         struct hlist_head *head = dev_index_hash(net, ifindex);
 727
 728         hlist_for_each_entry_rcu(dev, p, head, index_hlist)
 729                 if (dev->ifindex == ifindex)
 730                         return dev;
 731
 732         return NULL;
 733 }
 734 EXPORT_SYMBOL(dev_get_by_index_rcu);
 735
 736
 737 /**
 738  *      dev_get_by_index - find a device by its ifindex
 739  *      @net: the applicable net namespace
 740  *      @ifindex: index of device
 741  *
 742  *      Search for an interface by index. Returns NULL if the device
 743  *      is not found or a pointer to the device. The device returned has
 744  *      had a reference added and the pointer is safe until the user calls
 745  *      dev_put to indicate they have finished with it.
 746  */
 747
 748 struct net_device *dev_get_by_index(struct net *net, int ifindex)
 749 {
 750         struct net_device *dev;
 751
 752         rcu_read_lock();
 753         dev = dev_get_by_index_rcu(net, ifindex);
 754         if (dev)
 755                 dev_hold(dev);
 756         rcu_read_unlock();
 757         return dev;
 758 }
 759 EXPORT_SYMBOL(dev_get_by_index);
 760
 761 /**
 762  *      dev_getbyhwaddr_rcu - find a device by its hardware address
 763  *      @net: the applicable net namespace
 764  *      @type: media type of device
 765  *      @ha: hardware address
 766  *
 767  *      Search for an interface by MAC address. Returns NULL if the device
 768  *      is not found or a pointer to the device.
 769  *      The caller must hold RCU or RTNL.
 770  *      The returned device has not had its ref count increased
 771  *      and the caller must therefore be careful about locking
 772  *
 773  */
 774
 775 struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
 776                                        const char *ha)
 777 {
 778         struct net_device *dev;
 779
 780         for_each_netdev_rcu(net, dev)
 781                 if (dev->type == type &&
 782                     !memcmp(dev->dev_addr, ha, dev->addr_len))
 783                         return dev;
 784
 785         return NULL;
 786 }
 787 EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
 788
 789 struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
 790 {
 791         struct net_device *dev;
 792
 793         ASSERT_RTNL();
 794         for_each_netdev(net, dev)
 795                 if (dev->type == type)
 796                         return dev;
 797
 798         return NULL;
 799 }
 800 EXPORT_SYMBOL(__dev_getfirstbyhwtype);
 801
 802 struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
 803 {
 804         struct net_device *dev, *ret = NULL;
 805
 806         rcu_read_lock();
 807         for_each_netdev_rcu(net, dev)
 808                 if (dev->type == type) {
 809                         dev_hold(dev);
 810                         ret = dev;
 811                         break;
 812                 }
 813         rcu_read_unlock();
 814         return ret;
 815 }
 816 EXPORT_SYMBOL(dev_getfirstbyhwtype);
 817
 818 /**
 819  *      dev_get_by_flags_rcu - find any device with given flags
 820  *      @net: the applicable net namespace
 821  *      @if_flags: IFF_* values
 822  *      @mask: bitmask of bits in if_flags to check
 823  *
 824  *      Search for any interface with the given flags. Returns NULL if a device
 825  *      is not found or a pointer to the device. Must be called inside
 826  *      rcu_read_lock(), and result refcount is unchanged.
 827  */
 828
 829 struct net_device *dev_get_by_flags_rcu(struct net *net, unsigned short if_flags,
 830                                     unsigned short mask)
 831 {
 832         struct net_device *dev, *ret;
 833
 834         ret = NULL;
 835         for_each_netdev_rcu(net, dev) {
 836                 if (((dev->flags ^ if_flags) & mask) == 0) {
 837                         ret = dev;
 838                         break;
 839                 }
 840         }
 841         return ret;
 842 }
 843 EXPORT_SYMBOL(dev_get_by_flags_rcu);
 844
 845 /**
 846  *      dev_valid_name - check if name is okay for network device
 847  *      @name: name string
 848  *
 849  *      Network device names need to be valid file names to
 850  *      to allow sysfs to work.  We also disallow any kind of
 851  *      whitespace.
 852  */
 853 int dev_valid_name(const char *name)
 854 {
 855         if (*name == '\0')
 856                 return 0;
 857         if (strlen(name) >= IFNAMSIZ)
 858                 return 0;
 859         if (!strcmp(name, ".") || !strcmp(name, ".."))
 860                 return 0;
 861
 862         while (*name) {
 863                 if (*name == '/' || isspace(*name))
 864                         return 0;
 865                 name++;
 866         }
 867         return 1;
 868 }
 869 EXPORT_SYMBOL(dev_valid_name);
 870
 871 /**
 872  *      __dev_alloc_name - allocate a name for a device
 873  *      @net: network namespace to allocate the device name in
 874  *      @name: name format string
 875  *      @buf:  scratch buffer and result name string
 876  *
 877  *      Passed a format string - eg "lt%d" it will try and find a suitable
 878  *      id. It scans list of devices to build up a free map, then chooses
 879  *      the first empty slot. The caller must hold the dev_base or rtnl lock
 880  *      while allocating the name and adding the device in order to avoid
 881  *      duplicates.
 882  *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 883  *      Returns the number of the unit assigned or a negative errno code.
 884  */
 885
 886 static int __dev_alloc_name(struct net *net, const char *name, char *buf)
 887 {
 888         int i = 0;
 889         const char *p;
 890         const int max_netdevices = 8*PAGE_SIZE;
 891         unsigned long *inuse;
 892         struct net_device *d;
 893
 894         p = strnchr(name, IFNAMSIZ-1, '%');
 895         if (p) {
 896                 /*
 897                  * Verify the string as this thing may have come from
 898                  * the user.  There must be either one "%d" and no other "%"
 899                  * characters.
 900                  */
 901                 if (p[1] != 'd' || strchr(p + 2, '%'))
 902                         return -EINVAL;
 903
 904                 /* Use one page as a bit array of possible slots */
 905                 inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
 906                 if (!inuse)
 907                         return -ENOMEM;
 908
 909                 for_each_netdev(net, d) {
 910                         if (!sscanf(d->name, name, &i))
 911                                 continue;
 912                         if (i < 0 || i >= max_netdevices)
 913                                 continue;
 914
 915                         /*  avoid cases where sscanf is not exact inverse of printf */
 916                         snprintf(buf, IFNAMSIZ, name, i);
 917                         if (!strncmp(buf, d->name, IFNAMSIZ))
 918                                 set_bit(i, inuse);
 919                 }
 920
 921                 i = find_first_zero_bit(inuse, max_netdevices);
 922                 free_page((unsigned long) inuse);
 923         }
 924
 925         if (buf != name)
 926                 snprintf(buf, IFNAMSIZ, name, i);
 927         if (!__dev_get_by_name(net, buf))
 928                 return i;
 929
 930         /* It is possible to run out of possible slots
 931          * when the name is long and there isn't enough space left
 932          * for the digits, or if all bits are used.
 933          */
 934         return -ENFILE;
 935 }
 936
 937 /**
 938  *      dev_alloc_name - allocate a name for a device
 939  *      @dev: device
 940  *      @name: name format string
 941  *
 942  *      Passed a format string - eg "lt%d" it will try and find a suitable
 943  *      id. It scans list of devices to build up a free map, then chooses
 944  *      the first empty slot. The caller must hold the dev_base or rtnl lock
 945  *      while allocating the name and adding the device in order to avoid
 946  *      duplicates.
 947  *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 948  *      Returns the number of the unit assigned or a negative errno code.
 949  */
 950
 951 int dev_alloc_name(struct net_device *dev, const char *name)
 952 {
 953         char buf[IFNAMSIZ];
 954         struct net *net;
 955         int ret;
 956
 957         BUG_ON(!dev_net(dev));
 958         net = dev_net(dev);
 959         ret = __dev_alloc_name(net, name, buf);
 960         if (ret >= 0)
 961                 strlcpy(dev->name, buf, IFNAMSIZ);
 962         return ret;
 963 }
 964 EXPORT_SYMBOL(dev_alloc_name);
 965
 966 static int dev_get_valid_name(struct net_device *dev, const char *name)
 967 {
 968         struct net *net;
 969
 970         BUG_ON(!dev_net(dev));
 971         net = dev_net(dev);
 972
 973         if (!dev_valid_name(name))
 974                 return -EINVAL;
 975
 976         if (strchr(name, '%'))
 977                 return dev_alloc_name(dev, name);
 978         else if (__dev_get_by_name(net, name))
 979                 return -EEXIST;
 980         else if (dev->name != name)
 981                 strlcpy(dev->name, name, IFNAMSIZ);
 982
 983         return 0;
 984 }
 985
 986 /**
 987  *      dev_change_name - change name of a device
 988  *      @dev: device
 989  *      @newname: name (or format string) must be at least IFNAMSIZ
 990  *
 991  *      Change name of a device, can pass format strings "eth%d".
 992  *      for wildcarding.
 993  */
 994 int dev_change_name(struct net_device *dev, const char *newname)
 995 {
 996         char oldname[IFNAMSIZ];
 997         int err = 0;
 998         int ret;
 999         struct net *net;
1000
1001         ASSERT_RTNL();
1002         BUG_ON(!dev_net(dev));
1003
1004         net = dev_net(dev);
1005         if (dev->flags & IFF_UP)
1006                 return -EBUSY;
1007
1008         if (strncmp(newname, dev->name, IFNAMSIZ) == 0)
1009                 return 0;
1010
1011         memcpy(oldname, dev->name, IFNAMSIZ);
1012
1013         err = dev_get_valid_name(dev, newname);
1014         if (err < 0)
1015                 return err;
1016
1017 rollback:
1018         ret = device_rename(&dev->dev, dev->name);
1019         if (ret) {
1020                 memcpy(dev->name, oldname, IFNAMSIZ);
1021                 return ret;
1022         }
1023
1024         write_lock_bh(&dev_base_lock);
1025         hlist_del_rcu(&dev->name_hlist);
1026         write_unlock_bh(&dev_base_lock);
1027
1028         synchronize_rcu();
1029
1030         write_lock_bh(&dev_base_lock);
1031         hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
1032         write_unlock_bh(&dev_base_lock);
1033
1034         ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
1035         ret = notifier_to_errno(ret);
1036
1037         if (ret) {
1038                 /* err >= 0 after dev_alloc_name() or stores the first errno */
1039                 if (err >= 0) {
1040                         err = ret;
1041                         memcpy(dev->name, oldname, IFNAMSIZ);
1042                         goto rollback;
1043                 } else {
1044                         printk(KERN_ERR
1045                                "%s: name change rollback failed: %d.\n",
1046                                dev->name, ret);
1047                 }
1048         }
1049
1050         return err;
1051 }
1052
1053 /**
1054  *      dev_set_alias - change ifalias of a device
1055  *      @dev: device
1056  *      @alias: name up to IFALIASZ
1057  *      @len: limit of bytes to copy from info
1058  *
1059  *      Set ifalias for a device,
1060  */
1061 int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1062 {
1063         ASSERT_RTNL();
1064
1065         if (len >= IFALIASZ)
1066                 return -EINVAL;
1067
1068         if (!len) {
1069                 if (dev->ifalias) {
1070                         kfree(dev->ifalias);
1071                         dev->ifalias = NULL;
1072                 }
1073                 return 0;
1074         }
1075
1076         dev->ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
1077         if (!dev->ifalias)
1078                 return -ENOMEM;
1079
1080         strlcpy(dev->ifalias, alias, len+1);
1081         return len;
1082 }
1083
1084
1085 /**
1086  *      netdev_features_change - device changes features
1087  *      @dev: device to cause notification
1088  *
1089  *      Called to indicate a device has changed features.
1090  */
1091 void netdev_features_change(struct net_device *dev)
1092 {
1093         call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
1094 }
1095 EXPORT_SYMBOL(netdev_features_change);
1096
1097 /**
1098  *      netdev_state_change - device changes state
1099  *      @dev: device to cause notification
1100  *
1101  *      Called to indicate a device has changed state. This function calls
1102  *      the notifier chains for netdev_chain and sends a NEWLINK message
1103  *      to the routing socket.
1104  */
1105 void netdev_state_change(struct net_device *dev)
1106 {
1107         if (dev->flags & IFF_UP) {
1108                 call_netdevice_notifiers(NETDEV_CHANGE, dev);
1109                 rtmsg_ifinfo(RTM_NEWLINK, dev, 0);
1110         }
1111 }
1112 EXPORT_SYMBOL(netdev_state_change);
1113
1114 int netdev_bonding_change(struct net_device *dev, unsigned long event)
1115 {
1116         return call_netdevice_notifiers(event, dev);
1117 }
1118 EXPORT_SYMBOL(netdev_bonding_change);
1119
1120 /**
1121  *      dev_load        - load a network module
1122  *      @net: the applicable net namespace
1123  *      @name: name of interface
1124  *
1125  *      If a network interface is not present and the process has suitable
1126  *      privileges this function loads the module. If module loading is not
1127  *      available in this kernel then it becomes a nop.
1128  */
1129
1130 void dev_load(struct net *net, const char *name)
1131 {
1132         struct net_device *dev;
1133         int no_module;
1134
1135         rcu_read_lock();
1136         dev = dev_get_by_name_rcu(net, name);
1137         rcu_read_unlock();
1138
1139         no_module = !dev;
1140         if (no_module && capable(CAP_NET_ADMIN))
1141                 no_module = request_module("netdev-%s", name);
1142         if (no_module && capable(CAP_SYS_MODULE)) {
1143                 if (!request_module("%s", name))
1144                         pr_err("Loading kernel module for a network device "
1145 "with CAP_SYS_MODULE (deprecated).  Use CAP_NET_ADMIN and alias netdev-%s "
1146 "instead\n", name);
1147         }
1148 }
1149 EXPORT_SYMBOL(dev_load);
1150
1151 static int __dev_open(struct net_device *dev)
1152 {
1153         const struct net_device_ops *ops = dev->netdev_ops;
1154         int ret;
1155
1156         ASSERT_RTNL();
1157
1158         if (!netif_device_present(dev))
1159                 return -ENODEV;
1160
1161         ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
1162         ret = notifier_to_errno(ret);
1163         if (ret)
1164                 return ret;
1165
1166         set_bit(__LINK_STATE_START, &dev->state);
1167
1168         if (ops->ndo_validate_addr)
1169                 ret = ops->ndo_validate_addr(dev);
1170
1171         if (!ret && ops->ndo_open)
1172                 ret = ops->ndo_open(dev);
1173
1174         if (ret)
1175                 clear_bit(__LINK_STATE_START, &dev->state);
1176         else {
1177                 dev->flags |= IFF_UP;
1178                 net_dmaengine_get();
1179                 dev_set_rx_mode(dev);
1180                 dev_activate(dev);
1181         }
1182
1183         return ret;
1184 }
1185
1186 /**
1187  *      dev_open        - prepare an interface for use.
1188  *      @dev:   device to open
1189  *
1190  *      Takes a device from down to up state. The device's private open
1191  *      function is invoked and then the multicast lists are loaded. Finally
1192  *      the device is moved into the up state and a %NETDEV_UP message is
1193  *      sent to the netdev notifier chain.
1194  *
1195  *      Calling this function on an active interface is a nop. On a failure
1196  *      a negative errno code is returned.
1197  */
1198 int dev_open(struct net_device *dev)
1199 {
1200         int ret;
1201
1202         if (dev->flags & IFF_UP)
1203                 return 0;
1204
1205         ret = __dev_open(dev);
1206         if (ret < 0)
1207                 return ret;
1208
1209         rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1210         call_netdevice_notifiers(NETDEV_UP, dev);
1211
1212         return ret;
1213 }
1214 EXPORT_SYMBOL(dev_open);
1215
1216 static int __dev_close_many(struct list_head *head)
1217 {
1218         struct net_device *dev;
1219
1220         ASSERT_RTNL();
1221         might_sleep();
1222
1223         list_for_each_entry(dev, head, unreg_list) {
1224                 call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1225
1226                 clear_bit(__LINK_STATE_START, &dev->state);
1227
1228                 /* Synchronize to scheduled poll. We cannot touch poll list, it
1229                  * can be even on different cpu. So just clear netif_running().
1230                  *
1231                  * dev->stop() will invoke napi_disable() on all of it's
1232                  * napi_struct instances on this device.
1233                  */
1234                 smp_mb__after_clear_bit(); /* Commit netif_running(). */
1235         }
1236
1237         dev_deactivate_many(head);
1238
1239         list_for_each_entry(dev, head, unreg_list) {
1240                 const struct net_device_ops *ops = dev->netdev_ops;
1241
1242                 /*
1243                  *      Call the device specific close. This cannot fail.
1244                  *      Only if device is UP
1245                  *
1246                  *      We allow it to be called even after a DETACH hot-plug
1247                  *      event.
1248                  */
1249                 if (ops->ndo_stop)
1250                         ops->ndo_stop(dev);
1251
1252                 dev->flags &= ~IFF_UP;
1253                 net_dmaengine_put();
1254         }
1255
1256         return 0;
1257 }
1258
1259 static int __dev_close(struct net_device *dev)
1260 {
1261         int retval;
1262         LIST_HEAD(single);
1263
1264         list_add(&dev->unreg_list, &single);
1265         retval = __dev_close_many(&single);
1266         list_del(&single);
1267         return retval;
1268 }
1269
1270 static int dev_close_many(struct list_head *head)
1271 {
1272         struct net_device *dev, *tmp;
1273         LIST_HEAD(tmp_list);
1274
1275         list_for_each_entry_safe(dev, tmp, head, unreg_list)
1276                 if (!(dev->flags & IFF_UP))
1277                         list_move(&dev->unreg_list, &tmp_list);
1278
1279         __dev_close_many(head);
1280
1281         list_for_each_entry(dev, head, unreg_list) {
1282                 rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1283                 call_netdevice_notifiers(NETDEV_DOWN, dev);
1284         }
1285
1286         /* rollback_registered_many needs the complete original list */
1287         list_splice(&tmp_list, head);
1288         return 0;
1289 }
1290
1291 /**
1292  *      dev_close - shutdown an interface.
1293  *      @dev: device to shutdown
1294  *
1295  *      This function moves an active device into down state. A
1296  *      %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1297  *      is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1298  *      chain.
1299  */
1300 int dev_close(struct net_device *dev)
1301 {
1302         if (dev->flags & IFF_UP) {
1303                 LIST_HEAD(single);
1304
1305                 list_add(&dev->unreg_list, &single);
1306                 dev_close_many(&single);
1307                 list_del(&single);
1308         }
1309         return 0;
1310 }
1311 EXPORT_SYMBOL(dev_close);
1312
1313
1314 /**
1315  *      dev_disable_lro - disable Large Receive Offload on a device
1316  *      @dev: device
1317  *
1318  *      Disable Large Receive Offload (LRO) on a net device.  Must be
1319  *      called under RTNL.  This is needed if received packets may be
1320  *      forwarded to another interface.
1321  */
1322 void dev_disable_lro(struct net_device *dev)
1323 {
1324         /*
1325          * If we're trying to disable lro on a vlan device
1326          * use the underlying physical device instead
1327          */
1328         if (is_vlan_dev(dev))
1329                 dev = vlan_dev_real_dev(dev);
1330
1331         dev->wanted_features &= ~NETIF_F_LRO;
1332         netdev_update_features(dev);
1333
1334         if (unlikely(dev->features & NETIF_F_LRO))
1335                 netdev_WARN(dev, "failed to disable LRO!\n");
1336 }
1337 EXPORT_SYMBOL(dev_disable_lro);
1338
1339
1340 static int dev_boot_phase = 1;
1341
1342 /**
1343  *      register_netdevice_notifier - register a network notifier block
1344  *      @nb: notifier
1345  *
1346  *      Register a notifier to be called when network device events occur.
1347  *      The notifier passed is linked into the kernel structures and must
1348  *      not be reused until it has been unregistered. A negative errno code
1349  *      is returned on a failure.
1350  *
1351  *      When registered all registration and up events are replayed
1352  *      to the new notifier to allow device to have a race free
1353  *      view of the network device list.
1354  */
1355
1356 int register_netdevice_notifier(struct notifier_block *nb)
1357 {
1358         struct net_device *dev;
1359         struct net_device *last;
1360         struct net *net;
1361         int err;
1362
1363         rtnl_lock();
1364         err = raw_notifier_chain_register(&netdev_chain, nb);
1365         if (err)
1366                 goto unlock;
1367         if (dev_boot_phase)
1368                 goto unlock;
1369         for_each_net(net) {
1370                 for_each_netdev(net, dev) {
1371                         err = nb->notifier_call(nb, NETDEV_REGISTER, dev);
1372                         err = notifier_to_errno(err);
1373                         if (err)
1374                                 goto rollback;
1375
1376                         if (!(dev->flags & IFF_UP))
1377                                 continue;
1378
1379                         nb->notifier_call(nb, NETDEV_UP, dev);
1380                 }
1381         }
1382
1383 unlock:
1384         rtnl_unlock();
1385         return err;
1386
1387 rollback:
1388         last = dev;
1389         for_each_net(net) {
1390                 for_each_netdev(net, dev) {
1391                         if (dev == last)
1392                                 break;
1393
1394                         if (dev->flags & IFF_UP) {
1395                                 nb->notifier_call(nb, NETDEV_GOING_DOWN, dev);
1396                                 nb->notifier_call(nb, NETDEV_DOWN, dev);
1397                         }
1398                         nb->notifier_call(nb, NETDEV_UNREGISTER, dev);
1399                         nb->notifier_call(nb, NETDEV_UNREGISTER_BATCH, dev);
1400                 }
1401         }
1402
1403         raw_notifier_chain_unregister(&netdev_chain, nb);
1404         goto unlock;
1405 }
1406 EXPORT_SYMBOL(register_netdevice_notifier);
1407
1408 /**
1409  *      unregister_netdevice_notifier - unregister a network notifier block
1410  *      @nb: notifier
1411  *
1412  *      Unregister a notifier previously registered by
1413  *      register_netdevice_notifier(). The notifier is unlinked into the
1414  *      kernel structures and may then be reused. A negative errno code
1415  *      is returned on a failure.
1416  */
1417
1418 int unregister_netdevice_notifier(struct notifier_block *nb)
1419 {
1420         int err;
1421
1422         rtnl_lock();
1423         err = raw_notifier_chain_unregister(&netdev_chain, nb);
1424         rtnl_unlock();
1425         return err;
1426 }
1427 EXPORT_SYMBOL(unregister_netdevice_notifier);
1428
1429 /**
1430  *      call_netdevice_notifiers - call all network notifier blocks
1431  *      @val: value passed unmodified to notifier function
1432  *      @dev: net_device pointer passed unmodified to notifier function
1433  *
1434  *      Call all network notifier blocks.  Parameters and return value
1435  *      are as for raw_notifier_call_chain().
1436  */
1437
1438 int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1439 {
1440         ASSERT_RTNL();
1441         return raw_notifier_call_chain(&netdev_chain, val, dev);
1442 }
1443 EXPORT_SYMBOL(call_netdevice_notifiers);
1444
1445 static struct jump_label_key netstamp_needed __read_mostly;
1446
1447 void net_enable_timestamp(void)
1448 {
1449         jump_label_inc(&netstamp_needed);
1450 }
1451 EXPORT_SYMBOL(net_enable_timestamp);
1452
1453 void net_disable_timestamp(void)
1454 {
1455         jump_label_dec(&netstamp_needed);
1456 }
1457 EXPORT_SYMBOL(net_disable_timestamp);
1458
1459 static inline void net_timestamp_set(struct sk_buff *skb)
1460 {
1461         skb->tstamp.tv64 = 0;
1462         if (static_branch(&netstamp_needed))
1463                 __net_timestamp(skb);
1464 }
1465
1466 #define net_timestamp_check(COND, SKB)                  \
1467         if (static_branch(&netstamp_needed)) {          \
1468                 if ((COND) && !(SKB)->tstamp.tv64)      \
1469                         __net_timestamp(SKB);           \
1470         }                                               \
1471
1472 static int net_hwtstamp_validate(struct ifreq *ifr)
1473 {
1474         struct hwtstamp_config cfg;
1475         enum hwtstamp_tx_types tx_type;
1476         enum hwtstamp_rx_filters rx_filter;
1477         int tx_type_valid = 0;
1478         int rx_filter_valid = 0;
1479
1480         if (copy_from_user(&cfg, ifr->ifr_data, sizeof(cfg)))
1481                 return -EFAULT;
1482
1483         if (cfg.flags) /* reserved for future extensions */
1484                 return -EINVAL;
1485
1486         tx_type = cfg.tx_type;
1487         rx_filter = cfg.rx_filter;
1488
1489         switch (tx_type) {
1490         case HWTSTAMP_TX_OFF:
1491         case HWTSTAMP_TX_ON:
1492         case HWTSTAMP_TX_ONESTEP_SYNC:
1493                 tx_type_valid = 1;
1494                 break;
1495         }
1496
1497         switch (rx_filter) {
1498         case HWTSTAMP_FILTER_NONE:
1499         case HWTSTAMP_FILTER_ALL:
1500         case HWTSTAMP_FILTER_SOME:
1501         case HWTSTAMP_FILTER_PTP_V1_L4_EVENT:
1502         case HWTSTAMP_FILTER_PTP_V1_L4_SYNC:
1503         case HWTSTAMP_FILTER_PTP_V1_L4_DELAY_REQ:
1504         case HWTSTAMP_FILTER_PTP_V2_L4_EVENT:
1505         case HWTSTAMP_FILTER_PTP_V2_L4_SYNC:
1506         case HWTSTAMP_FILTER_PTP_V2_L4_DELAY_REQ:
1507         case HWTSTAMP_FILTER_PTP_V2_L2_EVENT:
1508         case HWTSTAMP_FILTER_PTP_V2_L2_SYNC:
1509         case HWTSTAMP_FILTER_PTP_V2_L2_DELAY_REQ:
1510         case HWTSTAMP_FILTER_PTP_V2_EVENT:
1511         case HWTSTAMP_FILTER_PTP_V2_SYNC:
1512         case HWTSTAMP_FILTER_PTP_V2_DELAY_REQ:
1513                 rx_filter_valid = 1;
1514                 break;
1515         }
1516
1517         if (!tx_type_valid || !rx_filter_valid)
1518                 return -ERANGE;
1519
1520         return 0;
1521 }
1522
1523 static inline bool is_skb_forwardable(struct net_device *dev,
1524                                       struct sk_buff *skb)
1525 {
1526         unsigned int len;
1527
1528         if (!(dev->flags & IFF_UP))
1529                 return false;
1530
1531         len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
1532         if (skb->len <= len)
1533                 return true;
1534
1535         /* if TSO is enabled, we don't care about the length as the packet
1536          * could be forwarded without being segmented before
1537          */
1538         if (skb_is_gso(skb))
1539                 return true;
1540
1541         return false;
1542 }
1543
1544 /**
1545  * dev_forward_skb - loopback an skb to another netif
1546  *
1547  * @dev: destination network device
1548  * @skb: buffer to forward
1549  *
1550  * return values:
1551  *      NET_RX_SUCCESS  (no congestion)
1552  *      NET_RX_DROP     (packet was dropped, but freed)
1553  *
1554  * dev_forward_skb can be used for injecting an skb from the
1555  * start_xmit function of one device into the receive queue
1556  * of another device.
1557  *
1558  * The receiving device may be in another namespace, so
1559  * we have to clear all information in the skb that could
1560  * impact namespace isolation.
1561  */
1562 int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1563 {
1564         if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) {
1565                 if (skb_copy_ubufs(skb, GFP_ATOMIC)) {
1566                         atomic_long_inc(&dev->rx_dropped);
1567                         kfree_skb(skb);
1568                         return NET_RX_DROP;
1569                 }
1570         }
1571
1572         skb_orphan(skb);
1573         nf_reset(skb);
1574
1575         if (unlikely(!is_skb_forwardable(dev, skb))) {
1576                 atomic_long_inc(&dev->rx_dropped);
1577                 kfree_skb(skb);
1578                 return NET_RX_DROP;
1579         }
1580         skb_set_dev(skb, dev);
1581         skb->tstamp.tv64 = 0;
1582         skb->pkt_type = PACKET_HOST;
1583         skb->protocol = eth_type_trans(skb, dev);
1584         return netif_rx(skb);
1585 }
1586 EXPORT_SYMBOL_GPL(dev_forward_skb);
1587
1588 static inline int deliver_skb(struct sk_buff *skb,
1589                               struct packet_type *pt_prev,
1590                               struct net_device *orig_dev)
1591 {
1592         atomic_inc(&skb->users);
1593         return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1594 }
1595
1596 /*
1597  *      Support routine. Sends outgoing frames to any network
1598  *      taps currently in use.
1599  */
1600
1601 static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1602 {
1603         struct packet_type *ptype;
1604         struct sk_buff *skb2 = NULL;
1605         struct packet_type *pt_prev = NULL;
1606
1607         rcu_read_lock();
1608         list_for_each_entry_rcu(ptype, &ptype_all, list) {
1609                 /* Never send packets back to the socket
1610                  * they originated from - MvS (miquels@drinkel.ow.org)
1611                  */
1612                 if ((ptype->dev == dev || !ptype->dev) &&
1613                     (ptype->af_packet_priv == NULL ||
1614                      (struct sock *)ptype->af_packet_priv != skb->sk)) {
1615                         if (pt_prev) {
1616                                 deliver_skb(skb2, pt_prev, skb->dev);
1617                                 pt_prev = ptype;
1618                                 continue;
1619                         }
1620
1621                         skb2 = skb_clone(skb, GFP_ATOMIC);
1622                         if (!skb2)
1623                                 break;
1624
1625                         net_timestamp_set(skb2);
1626
1627                         /* skb->nh should be correctly
1628                            set by sender, so that the second statement is
1629                            just protection against buggy protocols.
1630                          */
1631                         skb_reset_mac_header(skb2);
1632
1633                         if (skb_network_header(skb2) < skb2->data ||
1634                             skb2->network_header > skb2->tail) {
1635                                 if (net_ratelimit())
1636                                         printk(KERN_CRIT "protocol %04x is "
1637                                                "buggy, dev %s\n",
1638                                                ntohs(skb2->protocol),
1639                                                dev->name);
1640                                 skb_reset_network_header(skb2);
1641                         }
1642
1643                         skb2->transport_header = skb2->network_header;
1644                         skb2->pkt_type = PACKET_OUTGOING;
1645                         pt_prev = ptype;
1646                 }
1647         }
1648         if (pt_prev)
1649                 pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
1650         rcu_read_unlock();
1651 }
1652
1653 /* netif_setup_tc - Handle tc mappings on real_num_tx_queues change
1654  * @dev: Network device
1655  * @txq: number of queues available
1656  *
1657  * If real_num_tx_queues is changed the tc mappings may no longer be
1658  * valid. To resolve this verify the tc mapping remains valid and if
1659  * not NULL the mapping. With no priorities mapping to this
1660  * offset/count pair it will no longer be used. In the worst case TC0
1661  * is invalid nothing can be done so disable priority mappings. If is
1662  * expected that drivers will fix this mapping if they can before
1663  * calling netif_set_real_num_tx_queues.
1664  */
1665 static void netif_setup_tc(struct net_device *dev, unsigned int txq)
1666 {
1667         int i;
1668         struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
1669
1670         /* If TC0 is invalidated disable TC mapping */
1671         if (tc->offset + tc->count > txq) {
1672                 pr_warning("Number of in use tx queues changed "
1673                            "invalidating tc mappings. Priority "
1674                            "traffic classification disabled!\n");
1675                 dev->num_tc = 0;
1676                 return;
1677         }
1678
1679         /* Invalidated prio to tc mappings set to TC0 */
1680         for (i = 1; i < TC_BITMASK + 1; i++) {
1681                 int q = netdev_get_prio_tc_map(dev, i);
1682
1683                 tc = &dev->tc_to_txq[q];
1684                 if (tc->offset + tc->count > txq) {
1685                         pr_warning("Number of in use tx queues "
1686                                    "changed. Priority %i to tc "
1687                                    "mapping %i is no longer valid "
1688                                    "setting map to 0\n",
1689                                    i, q);
1690                         netdev_set_prio_tc_map(dev, i, 0);
1691                 }
1692         }
1693 }
1694
1695 /*
1696  * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
1697  * greater then real_num_tx_queues stale skbs on the qdisc must be flushed.
1698  */
1699 int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
1700 {
1701         int rc;
1702
1703         if (txq < 1 || txq > dev->num_tx_queues)
1704                 return -EINVAL;
1705
1706         if (dev->reg_state == NETREG_REGISTERED ||
1707             dev->reg_state == NETREG_UNREGISTERING) {
1708                 ASSERT_RTNL();
1709
1710                 rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
1711                                                   txq);
1712                 if (rc)
1713                         return rc;
1714
1715                 if (dev->num_tc)
1716                         netif_setup_tc(dev, txq);
1717
1718                 if (txq < dev->real_num_tx_queues)
1719                         qdisc_reset_all_tx_gt(dev, txq);
1720         }
1721
1722         dev->real_num_tx_queues = txq;
1723         return 0;
1724 }
1725 EXPORT_SYMBOL(netif_set_real_num_tx_queues);
1726
1727 #ifdef CONFIG_RPS
1728 /**
1729  *      netif_set_real_num_rx_queues - set actual number of RX queues used
1730  *      @dev: Network device
1731  *      @rxq: Actual number of RX queues
1732  *
1733  *      This must be called either with the rtnl_lock held or before
1734  *      registration of the net device.  Returns 0 on success, or a
1735  *      negative error code.  If called before registration, it always
1736  *      succeeds.
1737  */
1738 int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
1739 {
1740         int rc;
1741
1742         if (rxq < 1 || rxq > dev->num_rx_queues)
1743                 return -EINVAL;
1744
1745         if (dev->reg_state == NETREG_REGISTERED) {
1746                 ASSERT_RTNL();
1747
1748                 rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
1749                                                   rxq);
1750                 if (rc)
1751                         return rc;
1752         }
1753
1754         dev->real_num_rx_queues = rxq;
1755         return 0;
1756 }
1757 EXPORT_SYMBOL(netif_set_real_num_rx_queues);
1758 #endif
1759
1760 static inline void __netif_reschedule(struct Qdisc *q)
1761 {
1762         struct softnet_data *sd;
1763         unsigned long flags;
1764
1765         local_irq_save(flags);
1766         sd = &__get_cpu_var(softnet_data);
1767         q->next_sched = NULL;
1768         *sd->output_queue_tailp = q;
1769         sd->output_queue_tailp = &q->next_sched;
1770         raise_softirq_irqoff(NET_TX_SOFTIRQ);
1771         local_irq_restore(flags);
1772 }
1773
1774 void __netif_schedule(struct Qdisc *q)
1775 {
1776         if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
1777                 __netif_reschedule(q);
1778 }
1779 EXPORT_SYMBOL(__netif_schedule);
1780
1781 void dev_kfree_skb_irq(struct sk_buff *skb)
1782 {
1783         if (atomic_dec_and_test(&skb->users)) {
1784                 struct softnet_data *sd;
1785                 unsigned long flags;
1786
1787                 local_irq_save(flags);
1788                 sd = &__get_cpu_var(softnet_data);
1789                 skb->next = sd->completion_queue;
1790                 sd->completion_queue = skb;
1791                 raise_softirq_irqoff(NET_TX_SOFTIRQ);
1792                 local_irq_restore(flags);
1793         }
1794 }
1795 EXPORT_SYMBOL(dev_kfree_skb_irq);
1796
1797 void dev_kfree_skb_any(struct sk_buff *skb)
1798 {
1799         if (in_irq() || irqs_disabled())
1800                 dev_kfree_skb_irq(skb);
1801         else
1802                 dev_kfree_skb(skb);
1803 }
1804 EXPORT_SYMBOL(dev_kfree_skb_any);
1805
1806
1807 /**
1808  * netif_device_detach - mark device as removed
1809  * @dev: network device
1810  *
1811  * Mark device as removed from system and therefore no longer available.
1812  */
1813 void netif_device_detach(struct net_device *dev)
1814 {
1815         if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
1816             netif_running(dev)) {
1817                 netif_tx_stop_all_queues(dev);
1818         }
1819 }
1820 EXPORT_SYMBOL(netif_device_detach);
1821
1822 /**
1823  * netif_device_attach - mark device as attached
1824  * @dev: network device
1825  *
1826  * Mark device as attached from system and restart if needed.
1827  */
1828 void netif_device_attach(struct net_device *dev)
1829 {
1830         if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
1831             netif_running(dev)) {
1832                 netif_tx_wake_all_queues(dev);
1833                 __netdev_watchdog_up(dev);
1834         }
1835 }
1836 EXPORT_SYMBOL(netif_device_attach);
1837
1838 /**
1839  * skb_dev_set -- assign a new device to a buffer
1840  * @skb: buffer for the new device
1841  * @dev: network device
1842  *
1843  * If an skb is owned by a device already, we have to reset
1844  * all data private to the namespace a device belongs to
1845  * before assigning it a new device.
1846  */
1847 #ifdef CONFIG_NET_NS
1848 void skb_set_dev(struct sk_buff *skb, struct net_device *dev)
1849 {
1850         skb_dst_drop(skb);
1851         if (skb->dev && !net_eq(dev_net(skb->dev), dev_net(dev))) {
1852                 secpath_reset(skb);
1853                 nf_reset(skb);
1854                 skb_init_secmark(skb);
1855                 skb->mark = 0;
1856                 skb->priority = 0;
1857                 skb->nf_trace = 0;
1858                 skb->ipvs_property = 0;
1859 #ifdef CONFIG_NET_SCHED
1860                 skb->tc_index = 0;
1861 #endif
1862         }
1863         skb->dev = dev;
1864 }
1865 EXPORT_SYMBOL(skb_set_dev);
1866 #endif /* CONFIG_NET_NS */
1867
1868 /*
1869  * Invalidate hardware checksum when packet is to be mangled, and
1870  * complete checksum manually on outgoing path.
1871  */
1872 int skb_checksum_help(struct sk_buff *skb)
1873 {
1874         __wsum csum;
1875         int ret = 0, offset;
1876
1877         if (skb->ip_summed == CHECKSUM_COMPLETE)
1878                 goto out_set_summed;
1879
1880         if (unlikely(skb_shinfo(skb)->gso_size)) {
1881                 /* Let GSO fix up the checksum. */
1882                 goto out_set_summed;
1883         }
1884
1885         offset = skb_checksum_start_offset(skb);
1886         BUG_ON(offset >= skb_headlen(skb));
1887         csum = skb_checksum(skb, offset, skb->len - offset, 0);
1888
1889         offset += skb->csum_offset;
1890         BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
1891
1892         if (skb_cloned(skb) &&
1893             !skb_clone_writable(skb, offset + sizeof(__sum16))) {
1894                 ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
1895                 if (ret)
1896                         goto out;
1897         }
1898
1899         *(__sum16 *)(skb->data + offset) = csum_fold(csum);
1900 out_set_summed:
1901         skb->ip_summed = CHECKSUM_NONE;
1902 out:
1903         return ret;
1904 }
1905 EXPORT_SYMBOL(skb_checksum_help);
1906
1907 /**
1908  *      skb_gso_segment - Perform segmentation on skb.
1909  *      @skb: buffer to segment
1910  *      @features: features for the output path (see dev->features)
1911  *
1912  *      This function segments the given skb and returns a list of segments.
1913  *
1914  *      It may return NULL if the skb requires no segmentation.  This is
1915  *      only possible when GSO is used for verifying header integrity.
1916  */
1917 struct sk_buff *skb_gso_segment(struct sk_buff *skb,
1918         netdev_features_t features)
1919 {
1920         struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
1921         struct packet_type *ptype;
1922         __be16 type = skb->protocol;
1923         int vlan_depth = ETH_HLEN;
1924         int err;
1925
1926         while (type == htons(ETH_P_8021Q)) {
1927                 struct vlan_hdr *vh;
1928
1929                 if (unlikely(!pskb_may_pull(skb, vlan_depth + VLAN_HLEN)))
1930                         return ERR_PTR(-EINVAL);
1931
1932                 vh = (struct vlan_hdr *)(skb->data + vlan_depth);
1933                 type = vh->h_vlan_encapsulated_proto;
1934                 vlan_depth += VLAN_HLEN;
1935         }
1936
1937         skb_reset_mac_header(skb);
1938         skb->mac_len = skb->network_header - skb->mac_header;
1939         __skb_pull(skb, skb->mac_len);
1940
1941         if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
1942                 struct net_device *dev = skb->dev;
1943                 struct ethtool_drvinfo info = {};
1944
1945                 if (dev && dev->ethtool_ops && dev->ethtool_ops->get_drvinfo)
1946                         dev->ethtool_ops->get_drvinfo(dev, &info);
1947
1948                 WARN(1, "%s: caps=(%pNF, %pNF) len=%d data_len=%d ip_summed=%d\n",
1949                      info.driver, dev ? &dev->features : NULL,
1950                      skb->sk ? &skb->sk->sk_route_caps : NULL,
1951                      skb->len, skb->data_len, skb->ip_summed);
1952
1953                 if (skb_header_cloned(skb) &&
1954                     (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
1955                         return ERR_PTR(err);
1956         }
1957
1958         rcu_read_lock();
1959         list_for_each_entry_rcu(ptype,
1960                         &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
1961                 if (ptype->type == type && !ptype->dev && ptype->gso_segment) {
1962                         if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
1963                                 err = ptype->gso_send_check(skb);
1964                                 segs = ERR_PTR(err);
1965                                 if (err || skb_gso_ok(skb, features))
1966                                         break;
1967                                 __skb_push(skb, (skb->data -
1968                                                  skb_network_header(skb)));
1969                         }
1970                         segs = ptype->gso_segment(skb, features);
1971                         break;
1972                 }
1973         }
1974         rcu_read_unlock();
1975
1976         __skb_push(skb, skb->data - skb_mac_header(skb));
1977
1978         return segs;
1979 }
1980 EXPORT_SYMBOL(skb_gso_segment);
1981
1982 /* Take action when hardware reception checksum errors are detected. */
1983 #ifdef CONFIG_BUG
1984 void netdev_rx_csum_fault(struct net_device *dev)
1985 {
1986         if (net_ratelimit()) {
1987                 printk(KERN_ERR "%s: hw csum failure.\n",
1988                         dev ? dev->name : "<unknown>");
1989                 dump_stack();
1990         }
1991 }
1992 EXPORT_SYMBOL(netdev_rx_csum_fault);
1993 #endif
1994
1995 /* Actually, we should eliminate this check as soon as we know, that:
1996  * 1. IOMMU is present and allows to map all the memory.
1997  * 2. No high memory really exists on this machine.
1998  */
1999
2000 static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
2001 {
2002 #ifdef CONFIG_HIGHMEM
2003         int i;
2004         if (!(dev->features & NETIF_F_HIGHDMA)) {
2005                 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
2006                         skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2007                         if (PageHighMem(skb_frag_page(frag)))
2008                                 return 1;
2009                 }
2010         }
2011
2012         if (PCI_DMA_BUS_IS_PHYS) {
2013                 struct device *pdev = dev->dev.parent;
2014
2015                 if (!pdev)
2016                         return 0;
2017                 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
2018                         skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2019                         dma_addr_t addr = page_to_phys(skb_frag_page(frag));
2020                         if (!pdev->dma_mask || addr + PAGE_SIZE - 1 > *pdev->dma_mask)
2021                                 return 1;
2022                 }
2023         }
2024 #endif
2025         return 0;
2026 }
2027
2028 struct dev_gso_cb {
2029         void (*destructor)(struct sk_buff *skb);
2030 };
2031
2032 #define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb)
2033
2034 static void dev_gso_skb_destructor(struct sk_buff *skb)
2035 {
2036         struct dev_gso_cb *cb;
2037
2038         do {
2039                 struct sk_buff *nskb = skb->next;
2040
2041                 skb->next = nskb->next;
2042                 nskb->next = NULL;
2043                 kfree_skb(nskb);
2044         } while (skb->next);
2045
2046         cb = DEV_GSO_CB(skb);
2047         if (cb->destructor)
2048                 cb->destructor(skb);
2049 }
2050
2051 /**
2052  *      dev_gso_segment - Perform emulated hardware segmentation on skb.
2053  *      @skb: buffer to segment
2054  *      @features: device features as applicable to this skb
2055  *
2056  *      This function segments the given skb and stores the list of segments
2057  *      in skb->next.
2058  */
2059 static int dev_gso_segment(struct sk_buff *skb, netdev_features_t features)
2060 {
2061         struct sk_buff *segs;
2062
2063         segs = skb_gso_segment(skb, features);
2064
2065         /* Verifying header integrity only. */
2066         if (!segs)
2067                 return 0;
2068
2069         if (IS_ERR(segs))
2070                 return PTR_ERR(segs);
2071
2072         skb->next = segs;
2073         DEV_GSO_CB(skb)->destructor = skb->destructor;
2074         skb->destructor = dev_gso_skb_destructor;
2075
2076         return 0;
2077 }
2078
2079 /*
2080  * Try to orphan skb early, right before transmission by the device.
2081  * We cannot orphan skb if tx timestamp is requested or the sk-reference
2082  * is needed on driver level for other reasons, e.g. see net/can/raw.c
2083  */
2084 static inline void skb_orphan_try(struct sk_buff *skb)
2085 {
2086         struct sock *sk = skb->sk;
2087
2088         if (sk && !skb_shinfo(skb)->tx_flags) {
2089                 /* skb_tx_hash() wont be able to get sk.
2090                  * We copy sk_hash into skb->rxhash
2091                  */
2092                 if (!skb->rxhash)
2093                         skb->rxhash = sk->sk_hash;
2094                 skb_orphan(skb);
2095         }
2096 }
2097
2098 static bool can_checksum_protocol(netdev_features_t features, __be16 protocol)
2099 {
2100         return ((features & NETIF_F_GEN_CSUM) ||
2101                 ((features & NETIF_F_V4_CSUM) &&
2102                  protocol == htons(ETH_P_IP)) ||
2103                 ((features & NETIF_F_V6_CSUM) &&
2104                  protocol == htons(ETH_P_IPV6)) ||
2105                 ((features & NETIF_F_FCOE_CRC) &&
2106                  protocol == htons(ETH_P_FCOE)));
2107 }
2108
2109 static netdev_features_t harmonize_features(struct sk_buff *skb,
2110         __be16 protocol, netdev_features_t features)
2111 {
2112         if (!can_checksum_protocol(features, protocol)) {
2113                 features &= ~NETIF_F_ALL_CSUM;
2114                 features &= ~NETIF_F_SG;
2115         } else if (illegal_highdma(skb->dev, skb)) {
2116                 features &= ~NETIF_F_SG;
2117         }
2118
2119         return features;
2120 }
2121
2122 netdev_features_t netif_skb_features(struct sk_buff *skb)
2123 {
2124         __be16 protocol = skb->protocol;
2125         netdev_features_t features = skb->dev->features;
2126
2127         if (protocol == htons(ETH_P_8021Q)) {
2128                 struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
2129                 protocol = veh->h_vlan_encapsulated_proto;
2130         } else if (!vlan_tx_tag_present(skb)) {
2131                 return harmonize_features(skb, protocol, features);
2132         }
2133
2134         features &= (skb->dev->vlan_features | NETIF_F_HW_VLAN_TX);
2135
2136         if (protocol != htons(ETH_P_8021Q)) {
2137                 return harmonize_features(skb, protocol, features);
2138         } else {
2139                 features &= NETIF_F_SG | NETIF_F_HIGHDMA | NETIF_F_FRAGLIST |
2140                                 NETIF_F_GEN_CSUM | NETIF_F_HW_VLAN_TX;
2141                 return harmonize_features(skb, protocol, features);
2142         }
2143 }
2144 EXPORT_SYMBOL(netif_skb_features);
2145
2146 /*
2147  * Returns true if either:
2148  *      1. skb has frag_list and the device doesn't support FRAGLIST, or
2149  *      2. skb is fragmented and the device does not support SG, or if
2150  *         at least one of fragments is in highmem and device does not
2151  *         support DMA from it.
2152  */
2153 static inline int skb_needs_linearize(struct sk_buff *skb,
2154                                       int features)
2155 {
2156         return skb_is_nonlinear(skb) &&
2157                         ((skb_has_frag_list(skb) &&
2158                                 !(features & NETIF_F_FRAGLIST)) ||
2159                         (skb_shinfo(skb)->nr_frags &&
2160                                 !(features & NETIF_F_SG)));
2161 }
2162
2163 int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
2164                         struct netdev_queue *txq)
2165 {
2166         const struct net_device_ops *ops = dev->netdev_ops;
2167         int rc = NETDEV_TX_OK;
2168         unsigned int skb_len;
2169
2170         if (likely(!skb->next)) {
2171                 netdev_features_t features;
2172
2173                 /*
2174                  * If device doesn't need skb->dst, release it right now while
2175                  * its hot in this cpu cache
2176                  */
2177                 if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
2178                         skb_dst_drop(skb);
2179
2180                 if (!list_empty(&ptype_all))
2181                         dev_queue_xmit_nit(skb, dev);
2182
2183                 skb_orphan_try(skb);
2184
2185                 features = netif_skb_features(skb);
2186
2187                 if (vlan_tx_tag_present(skb) &&
2188                     !(features & NETIF_F_HW_VLAN_TX)) {
2189                         skb = __vlan_put_tag(skb, vlan_tx_tag_get(skb));
2190                         if (unlikely(!skb))
2191                                 goto out;
2192
2193                         skb->vlan_tci = 0;
2194                 }
2195
2196                 if (netif_needs_gso(skb, features)) {
2197                         if (unlikely(dev_gso_segment(skb, features)))
2198                                 goto out_kfree_skb;
2199                         if (skb->next)
2200                                 goto gso;
2201                 } else {
2202                         if (skb_needs_linearize(skb, features) &&
2203                             __skb_linearize(skb))
2204                                 goto out_kfree_skb;
2205
2206                         /* If packet is not checksummed and device does not
2207                          * support checksumming for this protocol, complete
2208                          * checksumming here.
2209                          */
2210                         if (skb->ip_summed == CHECKSUM_PARTIAL) {
2211                                 skb_set_transport_header(skb,
2212                                         skb_checksum_start_offset(skb));
2213                                 if (!(features & NETIF_F_ALL_CSUM) &&
2214                                      skb_checksum_help(skb))
2215                                         goto out_kfree_skb;
2216                         }
2217                 }
2218
2219                 skb_len = skb->len;
2220                 rc = ops->ndo_start_xmit(skb, dev);
2221                 trace_net_dev_xmit(skb, rc, dev, skb_len);
2222                 if (rc == NETDEV_TX_OK)
2223                         txq_trans_update(txq);
2224                 return rc;
2225         }
2226
2227 gso:
2228         do {
2229                 struct sk_buff *nskb = skb->next;
2230
2231                 skb->next = nskb->next;
2232                 nskb->next = NULL;
2233
2234                 /*
2235                  * If device doesn't need nskb->dst, release it right now while
2236                  * its hot in this cpu cache
2237                  */
2238                 if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
2239                         skb_dst_drop(nskb);
2240
2241                 skb_len = nskb->len;
2242                 rc = ops->ndo_start_xmit(nskb, dev);
2243                 trace_net_dev_xmit(nskb, rc, dev, skb_len);
2244                 if (unlikely(rc != NETDEV_TX_OK)) {
2245                         if (rc & ~NETDEV_TX_MASK)
2246                                 goto out_kfree_gso_skb;
2247                         nskb->next = skb->next;
2248                         skb->next = nskb;
2249                         return rc;
2250                 }
2251                 txq_trans_update(txq);
2252                 if (unlikely(netif_tx_queue_stopped(txq) && skb->next))
2253                         return NETDEV_TX_BUSY;
2254         } while (skb->next);
2255
2256 out_kfree_gso_skb:
2257         if (likely(skb->next == NULL))
2258                 skb->destructor = DEV_GSO_CB(skb)->destructor;
2259 out_kfree_skb:
2260         kfree_skb(skb);
2261 out:
2262         return rc;
2263 }
2264
2265 static u32 hashrnd __read_mostly;
2266
2267 /*
2268  * Returns a Tx hash based on the given packet descriptor a Tx queues' number
2269  * to be used as a distribution range.
2270  */
2271 u16 __skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb,
2272                   unsigned int num_tx_queues)
2273 {
2274         u32 hash;
2275         u16 qoffset = 0;
2276         u16 qcount = num_tx_queues;
2277
2278         if (skb_rx_queue_recorded(skb)) {
2279                 hash = skb_get_rx_queue(skb);
2280                 while (unlikely(hash >= num_tx_queues))
2281                         hash -= num_tx_queues;
2282                 return hash;
2283         }
2284
2285         if (dev->num_tc) {
2286                 u8 tc = netdev_get_prio_tc_map(dev, skb->priority);
2287                 qoffset = dev->tc_to_txq[tc].offset;
2288                 qcount = dev->tc_to_txq[tc].count;
2289         }
2290
2291         if (skb->sk && skb->sk->sk_hash)
2292                 hash = skb->sk->sk_hash;
2293         else
2294                 hash = (__force u16) skb->protocol ^ skb->rxhash;
2295         hash = jhash_1word(hash, hashrnd);
2296
2297         return (u16) (((u64) hash * qcount) >> 32) + qoffset;
2298 }
2299 EXPORT_SYMBOL(__skb_tx_hash);
2300
2301 static inline u16 dev_cap_txqueue(struct net_device *dev, u16 queue_index)
2302 {
2303         if (unlikely(queue_index >= dev->real_num_tx_queues)) {
2304                 if (net_ratelimit()) {
2305                         pr_warning("%s selects TX queue %d, but "
2306                                 "real number of TX queues is %d\n",
2307                                 dev->name, queue_index, dev->real_num_tx_queues);
2308                 }
2309                 return 0;
2310         }
2311         return queue_index;
2312 }
2313
2314 static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
2315 {
2316 #ifdef CONFIG_XPS
2317         struct xps_dev_maps *dev_maps;
2318         struct xps_map *map;
2319         int queue_index = -1;
2320
2321         rcu_read_lock();
2322         dev_maps = rcu_dereference(dev->xps_maps);
2323         if (dev_maps) {
2324                 map = rcu_dereference(
2325                     dev_maps->cpu_map[raw_smp_processor_id()]);
2326                 if (map) {
2327                         if (map->len == 1)
2328                                 queue_index = map->queues[0];
2329                         else {
2330                                 u32 hash;
2331                                 if (skb->sk && skb->sk->sk_hash)
2332                                         hash = skb->sk->sk_hash;
2333                                 else
2334                                         hash = (__force u16) skb->protocol ^
2335                                             skb->rxhash;
2336                                 hash = jhash_1word(hash, hashrnd);
2337                                 queue_index = map->queues[
2338                                     ((u64)hash * map->len) >> 32];
2339                         }
2340                         if (unlikely(queue_index >= dev->real_num_tx_queues))
2341                                 queue_index = -1;
2342                 }
2343         }
2344         rcu_read_unlock();
2345
2346         return queue_index;
2347 #else
2348         return -1;
2349 #endif
2350 }
2351
2352 static struct netdev_queue *dev_pick_tx(struct net_device *dev,
2353                                         struct sk_buff *skb)
2354 {
2355         int queue_index;
2356         const struct net_device_ops *ops = dev->netdev_ops;
2357
2358         if (dev->real_num_tx_queues == 1)
2359                 queue_index = 0;
2360         else if (ops->ndo_select_queue) {
2361                 queue_index = ops->ndo_select_queue(dev, skb);
2362                 queue_index = dev_cap_txqueue(dev, queue_index);
2363         } else {
2364                 struct sock *sk = skb->sk;
2365                 queue_index = sk_tx_queue_get(sk);
2366
2367                 if (queue_index < 0 || skb->ooo_okay ||
2368                     queue_index >= dev->real_num_tx_queues) {
2369                         int old_index = queue_index;
2370
2371                         queue_index = get_xps_queue(dev, skb);
2372                         if (queue_index < 0)
2373                                 queue_index = skb_tx_hash(dev, skb);
2374
2375                         if (queue_index != old_index && sk) {
2376                                 struct dst_entry *dst =
2377                                     rcu_dereference_check(sk->sk_dst_cache, 1);
2378
2379                                 if (dst && skb_dst(skb) == dst)
2380                                         sk_tx_queue_set(sk, queue_index);
2381                         }
2382                 }
2383         }
2384
2385         skb_set_queue_mapping(skb, queue_index);
2386         return netdev_get_tx_queue(dev, queue_index);
2387 }
2388
2389 static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
2390                                  struct net_device *dev,
2391                                  struct netdev_queue *txq)
2392 {
2393         spinlock_t *root_lock = qdisc_lock(q);
2394         bool contended;
2395         int rc;
2396
2397         qdisc_skb_cb(skb)->pkt_len = skb->len;
2398         qdisc_calculate_pkt_len(skb, q);
2399         /*
2400          * Heuristic to force contended enqueues to serialize on a
2401          * separate lock before trying to get qdisc main lock.
2402          * This permits __QDISC_STATE_RUNNING owner to get the lock more often
2403          * and dequeue packets faster.
2404          */
2405         contended = qdisc_is_running(q);
2406         if (unlikely(contended))
2407                 spin_lock(&q->busylock);
2408
2409         spin_lock(root_lock);
2410         if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
2411                 kfree_skb(skb);
2412                 rc = NET_XMIT_DROP;
2413         } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
2414                    qdisc_run_begin(q)) {
2415                 /*
2416                  * This is a work-conserving queue; there are no old skbs
2417                  * waiting to be sent out; and the qdisc is not running -
2418                  * xmit the skb directly.
2419                  */
2420                 if (!(dev->priv_flags & IFF_XMIT_DST_RELEASE))
2421                         skb_dst_force(skb);
2422
2423                 qdisc_bstats_update(q, skb);
2424
2425                 if (sch_direct_xmit(skb, q, dev, txq, root_lock)) {
2426                         if (unlikely(contended)) {
2427                                 spin_unlock(&q->busylock);
2428                                 contended = false;
2429                         }
2430                         __qdisc_run(q);
2431                 } else
2432                         qdisc_run_end(q);
2433
2434                 rc = NET_XMIT_SUCCESS;
2435         } else {
2436                 skb_dst_force(skb);
2437                 rc = q->enqueue(skb, q) & NET_XMIT_MASK;
2438                 if (qdisc_run_begin(q)) {
2439                         if (unlikely(contended)) {
2440                                 spin_unlock(&q->busylock);
2441                                 contended = false;
2442                         }
2443                         __qdisc_run(q);
2444                 }
2445         }
2446         spin_unlock(root_lock);
2447         if (unlikely(contended))
2448                 spin_unlock(&q->busylock);
2449         return rc;
2450 }
2451
2452 static DEFINE_PER_CPU(int, xmit_recursion);
2453 #define RECURSION_LIMIT 10
2454
2455 /**
2456  *      dev_queue_xmit - transmit a buffer
2457  *      @skb: buffer to transmit
2458  *
2459  *      Queue a buffer for transmission to a network device. The caller must
2460  *      have set the device and priority and built the buffer before calling
2461  *      this function. The function can be called from an interrupt.
2462  *
2463  *      A negative errno code is returned on a failure. A success does not
2464  *      guarantee the frame will be transmitted as it may be dropped due
2465  *      to congestion or traffic shaping.
2466  *
2467  * -----------------------------------------------------------------------------------
2468  *      I notice this method can also return errors from the queue disciplines,
2469  *      including NET_XMIT_DROP, which is a positive value.  So, errors can also
2470  *      be positive.
2471  *
2472  *      Regardless of the return value, the skb is consumed, so it is currently
2473  *      difficult to retry a send to this method.  (You can bump the ref count
2474  *      before sending to hold a reference for retry if you are careful.)
2475  *
2476  *      When calling this method, interrupts MUST be enabled.  This is because
2477  *      the BH enable code must have IRQs enabled so that it will not deadlock.
2478  *          --BLG
2479  */
2480 int dev_queue_xmit(struct sk_buff *skb)
2481 {
2482         struct net_device *dev = skb->dev;
2483         struct netdev_queue *txq;
2484         struct Qdisc *q;
2485         int rc = -ENOMEM;
2486
2487         /* Disable soft irqs for various locks below. Also
2488          * stops preemption for RCU.
2489          */
2490         rcu_read_lock_bh();
2491
2492         txq = dev_pick_tx(dev, skb);
2493         q = rcu_dereference_bh(txq->qdisc);
2494
2495 #ifdef CONFIG_NET_CLS_ACT
2496         skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
2497 #endif
2498         trace_net_dev_queue(skb);
2499         if (q->enqueue) {
2500                 rc = __dev_xmit_skb(skb, q, dev, txq);
2501                 goto out;
2502         }
2503
2504         /* The device has no queue. Common case for software devices:
2505            loopback, all the sorts of tunnels...
2506
2507            Really, it is unlikely that netif_tx_lock protection is necessary
2508            here.  (f.e. loopback and IP tunnels are clean ignoring statistics
2509            counters.)
2510            However, it is possible, that they rely on protection
2511            made by us here.
2512
2513            Check this and shot the lock. It is not prone from deadlocks.
2514            Either shot noqueue qdisc, it is even simpler 8)
2515          */
2516         if (dev->flags & IFF_UP) {
2517                 int cpu = smp_processor_id(); /* ok because BHs are off */
2518
2519                 if (txq->xmit_lock_owner != cpu) {
2520
2521                         if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT)
2522                                 goto recursion_alert;
2523
2524                         HARD_TX_LOCK(dev, txq, cpu);
2525
2526                         if (!netif_tx_queue_stopped(txq)) {
2527                                 __this_cpu_inc(xmit_recursion);
2528                                 rc = dev_hard_start_xmit(skb, dev, txq);
2529                                 __this_cpu_dec(xmit_recursion);
2530                                 if (dev_xmit_complete(rc)) {
2531                                         HARD_TX_UNLOCK(dev, txq);
2532                                         goto out;
2533                                 }
2534                         }
2535                         HARD_TX_UNLOCK(dev, txq);
2536                         if (net_ratelimit())
2537                                 printk(KERN_CRIT "Virtual device %s asks to "
2538                                        "queue packet!\n", dev->name);
2539                 } else {
2540                         /* Recursion is detected! It is possible,
2541                          * unfortunately
2542                          */
2543 recursion_alert:
2544                         if (net_ratelimit())
2545                                 printk(KERN_CRIT "Dead loop on virtual device "
2546                                        "%s, fix it urgently!\n", dev->name);
2547                 }
2548         }
2549
2550         rc = -ENETDOWN;
2551         rcu_read_unlock_bh();
2552
2553         kfree_skb(skb);
2554         return rc;
2555 out:
2556         rcu_read_unlock_bh();
2557         return rc;
2558 }
2559 EXPORT_SYMBOL(dev_queue_xmit);
2560
2561
2562 /*=======================================================================
2563                         Receiver routines
2564   =======================================================================*/
2565
2566 int netdev_max_backlog __read_mostly = 1000;
2567 int netdev_tstamp_prequeue __read_mostly = 1;
2568 int netdev_budget __read_mostly = 300;
2569 int weight_p __read_mostly = 64;            /* old backlog weight */
2570
2571 /* Called with irq disabled */
2572 static inline void ____napi_schedule(struct softnet_data *sd,
2573                                      struct napi_struct *napi)
2574 {
2575         list_add_tail(&napi->poll_list, &sd->poll_list);
2576         __raise_softirq_irqoff(NET_RX_SOFTIRQ);
2577 }
2578
2579 /*
2580  * __skb_get_rxhash: calculate a flow hash based on src/dst addresses
2581  * and src/dst port numbers.  Sets rxhash in skb to non-zero hash value
2582  * on success, zero indicates no valid hash.  Also, sets l4_rxhash in skb
2583  * if hash is a canonical 4-tuple hash over transport ports.
2584  */
2585 void __skb_get_rxhash(struct sk_buff *skb)
2586 {
2587         int nhoff, hash = 0, poff;
2588         const struct ipv6hdr *ip6;
2589         const struct iphdr *ip;
2590         const struct vlan_hdr *vlan;
2591         u8 ip_proto;
2592         u32 addr1, addr2;
2593         u16 proto;
2594         union {
2595                 u32 v32;
2596                 u16 v16[2];
2597         } ports;
2598
2599         nhoff = skb_network_offset(skb);
2600         proto = skb->protocol;
2601
2602 again:
2603         switch (proto) {
2604         case __constant_htons(ETH_P_IP):
2605 ip:
2606                 if (!pskb_may_pull(skb, sizeof(*ip) + nhoff))
2607                         goto done;
2608
2609                 ip = (const struct iphdr *) (skb->data + nhoff);
2610                 if (ip_is_fragment(ip))
2611                         ip_proto = 0;
2612                 else
2613                         ip_proto = ip->protocol;
2614                 addr1 = (__force u32) ip->saddr;
2615                 addr2 = (__force u32) ip->daddr;
2616                 nhoff += ip->ihl * 4;
2617                 break;
2618         case __constant_htons(ETH_P_IPV6):
2619 ipv6:
2620                 if (!pskb_may_pull(skb, sizeof(*ip6) + nhoff))
2621                         goto done;
2622
2623                 ip6 = (const struct ipv6hdr *) (skb->data + nhoff);
2624                 ip_proto = ip6->nexthdr;
2625                 addr1 = (__force u32) ip6->saddr.s6_addr32[3];
2626                 addr2 = (__force u32) ip6->daddr.s6_addr32[3];
2627                 nhoff += 40;
2628                 break;
2629         case __constant_htons(ETH_P_8021Q):
2630                 if (!pskb_may_pull(skb, sizeof(*vlan) + nhoff))
2631                         goto done;
2632                 vlan = (const struct vlan_hdr *) (skb->data + nhoff);
2633                 proto = vlan->h_vlan_encapsulated_proto;
2634                 nhoff += sizeof(*vlan);
2635                 goto again;
2636         case __constant_htons(ETH_P_PPP_SES):
2637                 if (!pskb_may_pull(skb, PPPOE_SES_HLEN + nhoff))
2638                         goto done;
2639                 proto = *((__be16 *) (skb->data + nhoff +
2640                                       sizeof(struct pppoe_hdr)));
2641                 nhoff += PPPOE_SES_HLEN;
2642                 switch (proto) {
2643                 case __constant_htons(PPP_IP):
2644                         goto ip;
2645                 case __constant_htons(PPP_IPV6):
2646                         goto ipv6;
2647                 default:
2648                         goto done;
2649                 }
2650         default:
2651                 goto done;
2652         }
2653
2654         switch (ip_proto) {
2655         case IPPROTO_GRE:
2656                 if (pskb_may_pull(skb, nhoff + 16)) {
2657                         u8 *h = skb->data + nhoff;
2658                         __be16 flags = *(__be16 *)h;
2659
2660                         /*
2661                          * Only look inside GRE if version zero and no
2662                          * routing
2663                          */
2664                         if (!(flags & (GRE_VERSION|GRE_ROUTING))) {
2665                                 proto = *(__be16 *)(h + 2);
2666                                 nhoff += 4;
2667                                 if (flags & GRE_CSUM)
2668                                         nhoff += 4;
2669                                 if (flags & GRE_KEY)
2670                                         nhoff += 4;
2671                                 if (flags & GRE_SEQ)
2672                                         nhoff += 4;
2673                                 goto again;
2674                         }
2675                 }
2676                 break;
2677         case IPPROTO_IPIP:
2678                 goto again;
2679         default:
2680                 break;
2681         }
2682
2683         ports.v32 = 0;
2684         poff = proto_ports_offset(ip_proto);
2685         if (poff >= 0) {
2686                 nhoff += poff;
2687                 if (pskb_may_pull(skb, nhoff + 4)) {
2688                         ports.v32 = * (__force u32 *) (skb->data + nhoff);
2689                         if (ports.v16[1] < ports.v16[0])
2690                                 swap(ports.v16[0], ports.v16[1]);
2691                         skb->l4_rxhash = 1;
2692                 }
2693         }
2694
2695         /* get a consistent hash (same value on both flow directions) */
2696         if (addr2 < addr1)
2697                 swap(addr1, addr2);
2698
2699         hash = jhash_3words(addr1, addr2, ports.v32, hashrnd);
2700         if (!hash)
2701                 hash = 1;
2702
2703 done:
2704         skb->rxhash = hash;
2705 }
2706 EXPORT_SYMBOL(__skb_get_rxhash);
2707
2708 #ifdef CONFIG_RPS
2709
2710 /* One global table that all flow-based protocols share. */
2711 struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
2712 EXPORT_SYMBOL(rps_sock_flow_table);
2713
2714 static struct rps_dev_flow *
2715 set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2716             struct rps_dev_flow *rflow, u16 next_cpu)
2717 {
2718         if (next_cpu != RPS_NO_CPU) {
2719 #ifdef CONFIG_RFS_ACCEL
2720                 struct netdev_rx_queue *rxqueue;
2721                 struct rps_dev_flow_table *flow_table;
2722                 struct rps_dev_flow *old_rflow;
2723                 u32 flow_id;
2724                 u16 rxq_index;
2725                 int rc;
2726
2727                 /* Should we steer this flow to a different hardware queue? */
2728                 if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
2729                     !(dev->features & NETIF_F_NTUPLE))
2730                         goto out;
2731                 rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
2732                 if (rxq_index == skb_get_rx_queue(skb))
2733                         goto out;
2734
2735                 rxqueue = dev->_rx + rxq_index;
2736                 flow_table = rcu_dereference(rxqueue->rps_flow_table);
2737                 if (!flow_table)
2738                         goto out;
2739                 flow_id = skb->rxhash & flow_table->mask;
2740                 rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
2741                                                         rxq_index, flow_id);
2742                 if (rc < 0)
2743                         goto out;
2744                 old_rflow = rflow;
2745                 rflow = &flow_table->flows[flow_id];
2746                 rflow->filter = rc;
2747                 if (old_rflow->filter == rflow->filter)
2748                         old_rflow->filter = RPS_NO_FILTER;
2749         out:
2750 #endif
2751                 rflow->last_qtail =
2752                         per_cpu(softnet_data, next_cpu).input_queue_head;
2753         }
2754
2755         rflow->cpu = next_cpu;
2756         return rflow;
2757 }
2758
2759 /*
2760  * get_rps_cpu is called from netif_receive_skb and returns the target
2761  * CPU from the RPS map of the receiving queue for a given skb.
2762  * rcu_read_lock must be held on entry.
2763  */
2764 static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2765                        struct rps_dev_flow **rflowp)
2766 {
2767         struct netdev_rx_queue *rxqueue;
2768         struct rps_map *map;
2769         struct rps_dev_flow_table *flow_table;
2770         struct rps_sock_flow_table *sock_flow_table;
2771         int cpu = -1;
2772         u16 tcpu;
2773
2774         if (skb_rx_queue_recorded(skb)) {
2775                 u16 index = skb_get_rx_queue(skb);
2776                 if (unlikely(index >= dev->real_num_rx_queues)) {
2777                         WARN_ONCE(dev->real_num_rx_queues > 1,
2778                                   "%s received packet on queue %u, but number "
2779                                   "of RX queues is %u\n",
2780                                   dev->name, index, dev->real_num_rx_queues);
2781                         goto done;
2782                 }
2783                 rxqueue = dev->_rx + index;
2784         } else
2785                 rxqueue = dev->_rx;
2786
2787         map = rcu_dereference(rxqueue->rps_map);
2788         if (map) {
2789                 if (map->len == 1 &&
2790                     !rcu_access_pointer(rxqueue->rps_flow_table)) {
2791                         tcpu = map->cpus[0];
2792                         if (cpu_online(tcpu))
2793                                 cpu = tcpu;
2794                         goto done;
2795                 }
2796         } else if (!rcu_access_pointer(rxqueue->rps_flow_table)) {
2797                 goto done;
2798         }
2799
2800         skb_reset_network_header(skb);
2801         if (!skb_get_rxhash(skb))
2802                 goto done;
2803
2804         flow_table = rcu_dereference(rxqueue->rps_flow_table);
2805         sock_flow_table = rcu_dereference(rps_sock_flow_table);
2806         if (flow_table && sock_flow_table) {
2807                 u16 next_cpu;
2808                 struct rps_dev_flow *rflow;
2809
2810                 rflow = &flow_table->flows[skb->rxhash & flow_table->mask];
2811                 tcpu = rflow->cpu;
2812
2813                 next_cpu = sock_flow_table->ents[skb->rxhash &
2814                     sock_flow_table->mask];
2815
2816                 /*
2817                  * If the desired CPU (where last recvmsg was done) is
2818                  * different from current CPU (one in the rx-queue flow
2819                  * table entry), switch if one of the following holds:
2820                  *   - Current CPU is unset (equal to RPS_NO_CPU).
2821                  *   - Current CPU is offline.
2822                  *   - The current CPU's queue tail has advanced beyond the
2823                  *     last packet that was enqueued using this table entry.
2824                  *     This guarantees that all previous packets for the flow
2825                  *     have been dequeued, thus preserving in order delivery.
2826                  */
2827                 if (unlikely(tcpu != next_cpu) &&
2828                     (tcpu == RPS_NO_CPU || !cpu_online(tcpu) ||
2829                      ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
2830                       rflow->last_qtail)) >= 0))
2831                         rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
2832
2833                 if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) {
2834                         *rflowp = rflow;
2835                         cpu = tcpu;
2836                         goto done;
2837                 }
2838         }
2839
2840         if (map) {
2841                 tcpu = map->cpus[((u64) skb->rxhash * map->len) >> 32];
2842
2843                 if (cpu_online(tcpu)) {
2844                         cpu = tcpu;
2845                         goto done;
2846                 }
2847         }
2848
2849 done:
2850         return cpu;
2851 }
2852
2853 #ifdef CONFIG_RFS_ACCEL
2854
2855 /**
2856  * rps_may_expire_flow - check whether an RFS hardware filter may be removed
2857  * @dev: Device on which the filter was set
2858  * @rxq_index: RX queue index
2859  * @flow_id: Flow ID passed to ndo_rx_flow_steer()
2860  * @filter_id: Filter ID returned by ndo_rx_flow_steer()
2861  *
2862  * Drivers that implement ndo_rx_flow_steer() should periodically call
2863  * this function for each installed filter and remove the filters for
2864  * which it returns %true.
2865  */
2866 bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
2867                          u32 flow_id, u16 filter_id)
2868 {
2869         struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
2870         struct rps_dev_flow_table *flow_table;
2871         struct rps_dev_flow *rflow;
2872         bool expire = true;
2873         int cpu;
2874
2875         rcu_read_lock();
2876         flow_table = rcu_dereference(rxqueue->rps_flow_table);
2877         if (flow_table && flow_id <= flow_table->mask) {
2878                 rflow = &flow_table->flows[flow_id];
2879                 cpu = ACCESS_ONCE(rflow->cpu);
2880                 if (rflow->filter == filter_id && cpu != RPS_NO_CPU &&
2881                     ((int)(per_cpu(softnet_data, cpu).input_queue_head -
2882                            rflow->last_qtail) <
2883                      (int)(10 * flow_table->mask)))
2884                         expire = false;
2885         }
2886         rcu_read_unlock();
2887         return expire;
2888 }
2889 EXPORT_SYMBOL(rps_may_expire_flow);
2890
2891 #endif /* CONFIG_RFS_ACCEL */
2892
2893 /* Called from hardirq (IPI) context */
2894 static void rps_trigger_softirq(void *data)
2895 {
2896         struct softnet_data *sd = data;
2897
2898         ____napi_schedule(sd, &sd->backlog);
2899         sd->received_rps++;
2900 }
2901
2902 #endif /* CONFIG_RPS */
2903
2904 /*
2905  * Check if this softnet_data structure is another cpu one
2906  * If yes, queue it to our IPI list and return 1
2907  * If no, return 0
2908  */
2909 static int rps_ipi_queued(struct softnet_data *sd)
2910 {
2911 #ifdef CONFIG_RPS
2912         struct softnet_data *mysd = &__get_cpu_var(softnet_data);
2913
2914         if (sd != mysd) {
2915                 sd->rps_ipi_next = mysd->rps_ipi_list;
2916                 mysd->rps_ipi_list = sd;
2917
2918                 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
2919                 return 1;
2920         }
2921 #endif /* CONFIG_RPS */
2922         return 0;
2923 }
2924
2925 /*
2926  * enqueue_to_backlog is called to queue an skb to a per CPU backlog
2927  * queue (may be a remote CPU queue).
2928  */
2929 static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
2930                               unsigned int *qtail)
2931 {
2932         struct softnet_data *sd;
2933         unsigned long flags;
2934
2935         sd = &per_cpu(softnet_data, cpu);
2936
2937         local_irq_save(flags);
2938
2939         rps_lock(sd);
2940         if (skb_queue_len(&sd->input_pkt_queue) <= netdev_max_backlog) {
2941                 if (skb_queue_len(&sd->input_pkt_queue)) {
2942 enqueue:
2943                         __skb_queue_tail(&sd->input_pkt_queue, skb);
2944                         input_queue_tail_incr_save(sd, qtail);
2945                         rps_unlock(sd);
2946                         local_irq_restore(flags);
2947                         return NET_RX_SUCCESS;
2948                 }
2949
2950                 /* Schedule NAPI for backlog device
2951                  * We can use non atomic operation since we own the queue lock
2952                  */
2953                 if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
2954                         if (!rps_ipi_queued(sd))
2955                                 ____napi_schedule(sd, &sd->backlog);
2956                 }
2957                 goto enqueue;
2958         }
2959
2960         sd->dropped++;
2961         rps_unlock(sd);
2962
2963         local_irq_restore(flags);
2964
2965         atomic_long_inc(&skb->dev->rx_dropped);
2966         kfree_skb(skb);
2967         return NET_RX_DROP;
2968 }
2969
2970 /**
2971  *      netif_rx        -       post buffer to the network code
2972  *      @skb: buffer to post
2973  *
2974  *      This function receives a packet from a device driver and queues it for
2975  *      the upper (protocol) levels to process.  It always succeeds. The buffer
2976  *      may be dropped during processing for congestion control or by the
2977  *      protocol layers.
2978  *
2979  *      return values:
2980  *      NET_RX_SUCCESS  (no congestion)
2981  *      NET_RX_DROP     (packet was dropped)
2982  *
2983  */
2984
2985 int netif_rx(struct sk_buff *skb)
2986 {
2987         int ret;
2988
2989         /* if netpoll wants it, pretend we never saw it */
2990         if (netpoll_rx(skb))
2991                 return NET_RX_DROP;
2992
2993         net_timestamp_check(netdev_tstamp_prequeue, skb);
2994
2995         trace_netif_rx(skb);
2996 #ifdef CONFIG_RPS
2997         {
2998                 struct rps_dev_flow voidflow, *rflow = &voidflow;
2999                 int cpu;
3000
3001                 preempt_disable();
3002                 rcu_read_lock();
3003
3004                 cpu = get_rps_cpu(skb->dev, skb, &rflow);
3005                 if (cpu < 0)
3006                         cpu = smp_processor_id();
3007
3008                 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3009
3010                 rcu_read_unlock();
3011                 preempt_enable();
3012         }
3013 #else
3014         {
3015                 unsigned int qtail;
3016                 ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
3017                 put_cpu();
3018         }
3019 #endif
3020         return ret;
3021 }
3022 EXPORT_SYMBOL(netif_rx);
3023
3024 int netif_rx_ni(struct sk_buff *skb)
3025 {
3026         int err;
3027
3028         preempt_disable();
3029         err = netif_rx(skb);
3030         if (local_softirq_pending())
3031                 do_softirq();
3032         preempt_enable();
3033
3034         return err;
3035 }
3036 EXPORT_SYMBOL(netif_rx_ni);
3037
3038 static void net_tx_action(struct softirq_action *h)
3039 {
3040         struct softnet_data *sd = &__get_cpu_var(softnet_data);
3041
3042         if (sd->completion_queue) {
3043                 struct sk_buff *clist;
3044
3045                 local_irq_disable();
3046                 clist = sd->completion_queue;
3047                 sd->completion_queue = NULL;
3048                 local_irq_enable();
3049
3050                 while (clist) {
3051                         struct sk_buff *skb = clist;
3052                         clist = clist->next;
3053
3054                         WARN_ON(atomic_read(&skb->users));
3055                         trace_kfree_skb(skb, net_tx_action);
3056                         __kfree_skb(skb);
3057                 }
3058         }
3059
3060         if (sd->output_queue) {
3061                 struct Qdisc *head;
3062
3063                 local_irq_disable();
3064                 head = sd->output_queue;
3065                 sd->output_queue = NULL;
3066                 sd->output_queue_tailp = &sd->output_queue;
3067                 local_irq_enable();
3068
3069                 while (head) {
3070                         struct Qdisc *q = head;
3071                         spinlock_t *root_lock;
3072
3073                         head = head->next_sched;
3074
3075                         root_lock = qdisc_lock(q);
3076                         if (spin_trylock(root_lock)) {
3077                                 smp_mb__before_clear_bit();
3078                                 clear_bit(__QDISC_STATE_SCHED,
3079                                           &q->state);
3080                                 qdisc_run(q);
3081                                 spin_unlock(root_lock);
3082                         } else {
3083                                 if (!test_bit(__QDISC_STATE_DEACTIVATED,
3084                                               &q->state)) {
3085                                         __netif_reschedule(q);
3086                                 } else {
3087                                         smp_mb__before_clear_bit();
3088                                         clear_bit(__QDISC_STATE_SCHED,
3089                                                   &q->state);
3090                                 }
3091                         }
3092                 }
3093         }
3094 }
3095
3096 #if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \
3097     (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE))
3098 /* This hook is defined here for ATM LANE */
3099 int (*br_fdb_test_addr_hook)(struct net_device *dev,
3100                              unsigned char *addr) __read_mostly;
3101 EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
3102 #endif
3103
3104 #ifdef CONFIG_NET_CLS_ACT
3105 /* TODO: Maybe we should just force sch_ingress to be compiled in
3106  * when CONFIG_NET_CLS_ACT is? otherwise some useless instructions
3107  * a compare and 2 stores extra right now if we dont have it on
3108  * but have CONFIG_NET_CLS_ACT
3109  * NOTE: This doesn't stop any functionality; if you dont have
3110  * the ingress scheduler, you just can't add policies on ingress.
3111  *
3112  */
3113 static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq)
3114 {
3115         struct net_device *dev = skb->dev;
3116         u32 ttl = G_TC_RTTL(skb->tc_verd);
3117         int result = TC_ACT_OK;
3118         struct Qdisc *q;
3119
3120         if (unlikely(MAX_RED_LOOP < ttl++)) {
3121                 if (net_ratelimit())
3122                         pr_warning( "Redir loop detected Dropping packet (%d->%d)\n",
3123                                skb->skb_iif, dev->ifindex);
3124                 return TC_ACT_SHOT;
3125         }
3126
3127         skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
3128         skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
3129
3130         q = rxq->qdisc;
3131         if (q != &noop_qdisc) {
3132                 spin_lock(qdisc_lock(q));
3133                 if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
3134                         result = qdisc_enqueue_root(skb, q);
3135                 spin_unlock(qdisc_lock(q));
3136         }
3137
3138         return result;
3139 }
3140
3141 static inline struct sk_buff *handle_ing(struct sk_buff *skb,
3142                                          struct packet_type **pt_prev,
3143                                          int *ret, struct net_device *orig_dev)
3144 {
3145         struct netdev_queue *rxq = rcu_dereference(skb->dev->ingress_queue);
3146
3147         if (!rxq || rxq->qdisc == &noop_qdisc)
3148                 goto out;
3149
3150         if (*pt_prev) {
3151                 *ret = deliver_skb(skb, *pt_prev, orig_dev);
3152                 *pt_prev = NULL;
3153         }
3154
3155         switch (ing_filter(skb, rxq)) {
3156         case TC_ACT_SHOT:
3157         case TC_ACT_STOLEN:
3158                 kfree_skb(skb);
3159                 return NULL;
3160         }
3161
3162 out:
3163         skb->tc_verd = 0;
3164         return skb;
3165 }
3166 #endif
3167
3168 /**
3169  *      netdev_rx_handler_register - register receive handler
3170  *      @dev: device to register a handler for
3171  *      @rx_handler: receive handler to register
3172  *      @rx_handler_data: data pointer that is used by rx handler
3173  *
3174  *      Register a receive hander for a device. This handler will then be
3175  *      called from __netif_receive_skb. A negative errno code is returned
3176  *      on a failure.
3177  *
3178  *      The caller must hold the rtnl_mutex.
3179  *
3180  *      For a general description of rx_handler, see enum rx_handler_result.
3181  */
3182 int netdev_rx_handler_register(struct net_device *dev,
3183                                rx_handler_func_t *rx_handler,
3184                                void *rx_handler_data)
3185 {
3186         ASSERT_RTNL();
3187
3188         if (dev->rx_handler)
3189                 return -EBUSY;
3190
3191         rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
3192         rcu_assign_pointer(dev->rx_handler, rx_handler);
3193
3194         return 0;
3195 }
3196 EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
3197
3198 /**
3199  *      netdev_rx_handler_unregister - unregister receive handler
3200  *      @dev: device to unregister a handler from
3201  *
3202  *      Unregister a receive hander from a device.
3203  *
3204  *      The caller must hold the rtnl_mutex.
3205  */
3206 void netdev_rx_handler_unregister(struct net_device *dev)
3207 {
3208
3209         ASSERT_RTNL();
3210         RCU_INIT_POINTER(dev->rx_handler, NULL);
3211         RCU_INIT_POINTER(dev->rx_handler_data, NULL);
3212 }
3213 EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
3214
3215 static int __netif_receive_skb(struct sk_buff *skb)
3216 {
3217         struct packet_type *ptype, *pt_prev;
3218         rx_handler_func_t *rx_handler;
3219         struct net_device *orig_dev;
3220         struct net_device *null_or_dev;
3221         bool deliver_exact = false;
3222         int ret = NET_RX_DROP;
3223         __be16 type;
3224
3225         net_timestamp_check(!netdev_tstamp_prequeue, skb);
3226
3227         trace_netif_receive_skb(skb);
3228
3229         /* if we've gotten here through NAPI, check netpoll */
3230         if (netpoll_receive_skb(skb))
3231                 return NET_RX_DROP;
3232
3233         if (!skb->skb_iif)
3234                 skb->skb_iif = skb->dev->ifindex;
3235         orig_dev = skb->dev;
3236
3237         skb_reset_network_header(skb);
3238         skb_reset_transport_header(skb);
3239         skb_reset_mac_len(skb);
3240
3241         pt_prev = NULL;
3242
3243         rcu_read_lock();
3244
3245 another_round:
3246
3247         __this_cpu_inc(softnet_data.processed);
3248
3249         if (skb->protocol == cpu_to_be16(ETH_P_8021Q)) {
3250                 skb = vlan_untag(skb);
3251                 if (unlikely(!skb))
3252                         goto out;
3253         }
3254
3255 #ifdef CONFIG_NET_CLS_ACT
3256         if (skb->tc_verd & TC_NCLS) {
3257                 skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
3258                 goto ncls;
3259         }
3260 #endif
3261
3262         list_for_each_entry_rcu(ptype, &ptype_all, list) {
3263                 if (!ptype->dev || ptype->dev == skb->dev) {
3264                         if (pt_prev)
3265                                 ret = deliver_skb(skb, pt_prev, orig_dev);
3266                         pt_prev = ptype;
3267                 }
3268         }
3269
3270 #ifdef CONFIG_NET_CLS_ACT
3271         skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
3272         if (!skb)
3273                 goto out;
3274 ncls:
3275 #endif
3276
3277         rx_handler = rcu_dereference(skb->dev->rx_handler);
3278         if (vlan_tx_tag_present(skb)) {
3279                 if (pt_prev) {
3280                         ret = deliver_skb(skb, pt_prev, orig_dev);
3281                         pt_prev = NULL;
3282                 }
3283                 if (vlan_do_receive(&skb, !rx_handler))
3284                         goto another_round;
3285                 else if (unlikely(!skb))
3286                         goto out;
3287         }
3288
3289         if (rx_handler) {
3290                 if (pt_prev) {
3291                         ret = deliver_skb(skb, pt_prev, orig_dev);
3292                         pt_prev = NULL;
3293                 }
3294                 switch (rx_handler(&skb)) {
3295                 case RX_HANDLER_CONSUMED:
3296                         goto out;
3297                 case RX_HANDLER_ANOTHER:
3298                         goto another_round;
3299                 case RX_HANDLER_EXACT:
3300                         deliver_exact = true;
3301                 case RX_HANDLER_PASS:
3302                         break;
3303                 default:
3304                         BUG();
3305                 }
3306         }
3307
3308         /* deliver only exact match when indicated */
3309         null_or_dev = deliver_exact ? skb->dev : NULL;
3310
3311         type = skb->protocol;
3312         list_for_each_entry_rcu(ptype,
3313                         &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
3314                 if (ptype->type == type &&
3315                     (ptype->dev == null_or_dev || ptype->dev == skb->dev ||
3316                      ptype->dev == orig_dev)) {
3317                         if (pt_prev)
3318                                 ret = deliver_skb(skb, pt_prev, orig_dev);
3319                         pt_prev = ptype;
3320                 }
3321         }
3322
3323         if (pt_prev) {
3324                 ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
3325         } else {
3326                 atomic_long_inc(&skb->dev->rx_dropped);
3327                 kfree_skb(skb);
3328                 /* Jamal, now you will not able to escape explaining
3329                  * me how you were going to use this. :-)
3330                  */
3331                 ret = NET_RX_DROP;
3332         }
3333
3334 out:
3335         rcu_read_unlock();
3336         return ret;
3337 }
3338
3339 /**
3340  *      netif_receive_skb - process receive buffer from network
3341  *      @skb: buffer to process
3342  *
3343  *      netif_receive_skb() is the main receive data processing function.
3344  *      It always succeeds. The buffer may be dropped during processing
3345  *      for congestion control or by the protocol layers.
3346  *
3347  *      This function may only be called from softirq context and interrupts
3348  *      should be enabled.
3349  *
3350  *      Return values (usually ignored):
3351  *      NET_RX_SUCCESS: no congestion
3352  *      NET_RX_DROP: packet was dropped
3353  */
3354 int netif_receive_skb(struct sk_buff *skb)
3355 {
3356         net_timestamp_check(netdev_tstamp_prequeue, skb);
3357
3358         if (skb_defer_rx_timestamp(skb))
3359                 return NET_RX_SUCCESS;
3360
3361 #ifdef CONFIG_RPS
3362         {
3363                 struct rps_dev_flow voidflow, *rflow = &voidflow;
3364                 int cpu, ret;
3365
3366                 rcu_read_lock();
3367
3368                 cpu = get_rps_cpu(skb->dev, skb, &rflow);
3369
3370                 if (cpu >= 0) {
3371                         ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3372                         rcu_read_unlock();
3373                 } else {
3374                         rcu_read_unlock();
3375                         ret = __netif_receive_skb(skb);
3376                 }
3377
3378                 return ret;
3379         }
3380 #else
3381         return __netif_receive_skb(skb);
3382 #endif
3383 }
3384 EXPORT_SYMBOL(netif_receive_skb);
3385
3386 /* Network device is going away, flush any packets still pending
3387  * Called with irqs disabled.
3388  */
3389 static void flush_backlog(void *arg)
3390 {
3391         struct net_device *dev = arg;
3392         struct softnet_data *sd = &__get_cpu_var(softnet_data);
3393         struct sk_buff *skb, *tmp;
3394
3395         rps_lock(sd);
3396         skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
3397                 if (skb->dev == dev) {
3398                         __skb_unlink(skb, &sd->input_pkt_queue);
3399                         kfree_skb(skb);
3400                         input_queue_head_incr(sd);
3401                 }
3402         }
3403         rps_unlock(sd);
3404
3405         skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
3406                 if (skb->dev == dev) {
3407                         __skb_unlink(skb, &sd->process_queue);
3408                         kfree_skb(skb);
3409                         input_queue_head_incr(sd);
3410                 }
3411         }
3412 }
3413
3414 static int napi_gro_complete(struct sk_buff *skb)
3415 {
3416         struct packet_type *ptype;
3417         __be16 type = skb->protocol;
3418         struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
3419         int err = -ENOENT;
3420
3421         if (NAPI_GRO_CB(skb)->count == 1) {
3422                 skb_shinfo(skb)->gso_size = 0;
3423                 goto out;
3424         }
3425
3426         rcu_read_lock();
3427         list_for_each_entry_rcu(ptype, head, list) {
3428                 if (ptype->type != type || ptype->dev || !ptype->gro_complete)
3429                         continue;
3430
3431                 err = ptype->gro_complete(skb);
3432                 break;
3433         }
3434         rcu_read_unlock();
3435
3436         if (err) {
3437                 WARN_ON(&ptype->list == head);
3438                 kfree_skb(skb);
3439                 return NET_RX_SUCCESS;
3440         }
3441
3442 out:
3443         return netif_receive_skb(skb);
3444 }
3445
3446 inline void napi_gro_flush(struct napi_struct *napi)
3447 {
3448         struct sk_buff *skb, *next;
3449
3450         for (skb = napi->gro_list; skb; skb = next) {
3451                 next = skb->next;
3452                 skb->next = NULL;
3453                 napi_gro_complete(skb);
3454         }
3455
3456         napi->gro_count = 0;
3457         napi->gro_list = NULL;
3458 }
3459 EXPORT_SYMBOL(napi_gro_flush);
3460
3461 enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3462 {
3463         struct sk_buff **pp = NULL;
3464         struct packet_type *ptype;
3465         __be16 type = skb->protocol;
3466         struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
3467         int same_flow;
3468         int mac_len;
3469         enum gro_result ret;
3470
3471         if (!(skb->dev->features & NETIF_F_GRO) || netpoll_rx_on(skb))
3472                 goto normal;
3473
3474         if (skb_is_gso(skb) || skb_has_frag_list(skb))
3475                 goto normal;
3476
3477         rcu_read_lock();
3478         list_for_each_entry_rcu(ptype, head, list) {
3479                 if (ptype->type != type || ptype->dev || !ptype->gro_receive)
3480                         continue;
3481
3482                 skb_set_network_header(skb, skb_gro_offset(skb));
3483                 mac_len = skb->network_header - skb->mac_header;
3484                 skb->mac_len = mac_len;
3485                 NAPI_GRO_CB(skb)->same_flow = 0;
3486                 NAPI_GRO_CB(skb)->flush = 0;
3487                 NAPI_GRO_CB(skb)->free = 0;
3488
3489                 pp = ptype->gro_receive(&napi->gro_list, skb);
3490                 break;
3491         }
3492         rcu_read_unlock();
3493
3494         if (&ptype->list == head)
3495                 goto normal;
3496
3497         same_flow = NAPI_GRO_CB(skb)->same_flow;
3498         ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
3499
3500         if (pp) {
3501                 struct sk_buff *nskb = *pp;
3502
3503                 *pp = nskb->next;
3504                 nskb->next = NULL;
3505                 napi_gro_complete(nskb);
3506                 napi->gro_count--;
3507         }
3508
3509         if (same_flow)
3510                 goto ok;
3511
3512         if (NAPI_GRO_CB(skb)->flush || napi->gro_count >= MAX_GRO_SKBS)
3513                 goto normal;
3514
3515         napi->gro_count++;
3516         NAPI_GRO_CB(skb)->count = 1;
3517         skb_shinfo(skb)->gso_size = skb_gro_len(skb);
3518         skb->next = napi->gro_list;
3519         napi->gro_list = skb;
3520         ret = GRO_HELD;
3521
3522 pull:
3523         if (skb_headlen(skb) < skb_gro_offset(skb)) {
3524                 int grow = skb_gro_offset(skb) - skb_headlen(skb);
3525
3526                 BUG_ON(skb->end - skb->tail < grow);
3527
3528                 memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
3529
3530                 skb->tail += grow;
3531                 skb->data_len -= grow;
3532
3533                 skb_shinfo(skb)->frags[0].page_offset += grow;
3534                 skb_frag_size_sub(&skb_shinfo(skb)->frags[0], grow);
3535
3536                 if (unlikely(!skb_frag_size(&skb_shinfo(skb)->frags[0]))) {
3537                         skb_frag_unref(skb, 0);
3538                         memmove(skb_shinfo(skb)->frags,
3539                                 skb_shinfo(skb)->frags + 1,
3540                                 --skb_shinfo(skb)->nr_frags * sizeof(skb_frag_t));
3541                 }
3542         }
3543
3544 ok:
3545         return ret;
3546
3547 normal:
3548         ret = GRO_NORMAL;
3549         goto pull;
3550 }
3551 EXPORT_SYMBOL(dev_gro_receive);
3552
3553 static inline gro_result_t
3554 __napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3555 {
3556         struct sk_buff *p;
3557
3558         for (p = napi->gro_list; p; p = p->next) {
3559                 unsigned long diffs;
3560
3561                 diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
3562                 diffs |= p->vlan_tci ^ skb->vlan_tci;
3563                 diffs |= compare_ether_header(skb_mac_header(p),
3564                                               skb_gro_mac_header(skb));
3565                 NAPI_GRO_CB(p)->same_flow = !diffs;
3566                 NAPI_GRO_CB(p)->flush = 0;
3567         }
3568
3569         return dev_gro_receive(napi, skb);
3570 }
3571
3572 gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
3573 {
3574         switch (ret) {
3575         case GRO_NORMAL:
3576                 if (netif_receive_skb(skb))
3577                         ret = GRO_DROP;
3578                 break;
3579
3580         case GRO_DROP:
3581         case GRO_MERGED_FREE:
3582                 kfree_skb(skb);
3583                 break;
3584
3585         case GRO_HELD:
3586         case GRO_MERGED:
3587                 break;
3588         }
3589
3590         return ret;
3591 }
3592 EXPORT_SYMBOL(napi_skb_finish);
3593
3594 void skb_gro_reset_offset(struct sk_buff *skb)
3595 {
3596         NAPI_GRO_CB(skb)->data_offset = 0;
3597         NAPI_GRO_CB(skb)->frag0 = NULL;
3598         NAPI_GRO_CB(skb)->frag0_len = 0;
3599
3600         if (skb->mac_header == skb->tail &&
3601             !PageHighMem(skb_frag_page(&skb_shinfo(skb)->frags[0]))) {
3602                 NAPI_GRO_CB(skb)->frag0 =
3603                         skb_frag_address(&skb_shinfo(skb)->frags[0]);
3604                 NAPI_GRO_CB(skb)->frag0_len = skb_frag_size(&skb_shinfo(skb)->frags[0]);
3605         }
3606 }
3607 EXPORT_SYMBOL(skb_gro_reset_offset);
3608
3609 gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3610 {
3611         skb_gro_reset_offset(skb);
3612
3613         return napi_skb_finish(__napi_gro_receive(napi, skb), skb);
3614 }
3615 EXPORT_SYMBOL(napi_gro_receive);
3616
3617 static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
3618 {
3619         __skb_pull(skb, skb_headlen(skb));
3620         skb_reserve(skb, NET_IP_ALIGN - skb_headroom(skb));
3621         skb->vlan_tci = 0;
3622         skb->dev = napi->dev;
3623         skb->skb_iif = 0;
3624
3625         napi->skb = skb;
3626 }
3627
3628 struct sk_buff *napi_get_frags(struct napi_struct *napi)
3629 {
3630         struct sk_buff *skb = napi->skb;
3631
3632         if (!skb) {
3633                 skb = netdev_alloc_skb_ip_align(napi->dev, GRO_MAX_HEAD);
3634                 if (skb)
3635                         napi->skb = skb;
3636         }
3637         return skb;
3638 }
3639 EXPORT_SYMBOL(napi_get_frags);
3640
3641 gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb,
3642                                gro_result_t ret)
3643 {
3644         switch (ret) {
3645         case GRO_NORMAL:
3646         case GRO_HELD:
3647                 skb->protocol = eth_type_trans(skb, skb->dev);
3648
3649                 if (ret == GRO_HELD)
3650                         skb_gro_pull(skb, -ETH_HLEN);
3651                 else if (netif_receive_skb(skb))
3652                         ret = GRO_DROP;
3653                 break;
3654
3655         case GRO_DROP:
3656         case GRO_MERGED_FREE:
3657                 napi_reuse_skb(napi, skb);
3658                 break;
3659
3660         case GRO_MERGED:
3661                 break;
3662         }
3663
3664         return ret;
3665 }
3666 EXPORT_SYMBOL(napi_frags_finish);
3667
3668 struct sk_buff *napi_frags_skb(struct napi_struct *napi)
3669 {
3670         struct sk_buff *skb = napi->skb;
3671         struct ethhdr *eth;
3672         unsigned int hlen;
3673         unsigned int off;
3674
3675         napi->skb = NULL;
3676
3677         skb_reset_mac_header(skb);
3678         skb_gro_reset_offset(skb);
3679
3680         off = skb_gro_offset(skb);
3681         hlen = off + sizeof(*eth);
3682         eth = skb_gro_header_fast(skb, off);
3683         if (skb_gro_header_hard(skb, hlen)) {
3684                 eth = skb_gro_header_slow(skb, hlen, off);
3685                 if (unlikely(!eth)) {
3686                         napi_reuse_skb(napi, skb);
3687                         skb = NULL;
3688                         goto out;
3689                 }
3690         }
3691
3692         skb_gro_pull(skb, sizeof(*eth));
3693
3694         /*
3695          * This works because the only protocols we care about don't require
3696          * special handling.  We'll fix it up properly at the end.
3697          */
3698         skb->protocol = eth->h_proto;
3699
3700 out:
3701         return skb;
3702 }
3703 EXPORT_SYMBOL(napi_frags_skb);
3704
3705 gro_result_t napi_gro_frags(struct napi_struct *napi)
3706 {
3707         struct sk_buff *skb = napi_frags_skb(napi);
3708
3709         if (!skb)
3710                 return GRO_DROP;
3711
3712         return napi_frags_finish(napi, skb, __napi_gro_receive(napi, skb));
3713 }
3714 EXPORT_SYMBOL(napi_gro_frags);
3715
3716 /*
3717  * net_rps_action sends any pending IPI's for rps.
3718  * Note: called with local irq disabled, but exits with local irq enabled.
3719  */
3720 static void net_rps_action_and_irq_enable(struct softnet_data *sd)
3721 {
3722 #ifdef CONFIG_RPS
3723         struct softnet_data *remsd = sd->rps_ipi_list;
3724
3725         if (remsd) {
3726                 sd->rps_ipi_list = NULL;
3727
3728                 local_irq_enable();
3729
3730                 /* Send pending IPI's to kick RPS processing on remote cpus. */
3731                 while (remsd) {
3732                         struct softnet_data *next = remsd->rps_ipi_next;
3733
3734                         if (cpu_online(remsd->cpu))
3735                                 __smp_call_function_single(remsd->cpu,
3736                                                            &remsd->csd, 0);
3737                         remsd = next;
3738                 }
3739         } else
3740 #endif
3741                 local_irq_enable();
3742 }
3743
3744 static int process_backlog(struct napi_struct *napi, int quota)
3745 {
3746         int work = 0;
3747         struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
3748
3749 #ifdef CONFIG_RPS
3750         /* Check if we have pending ipi, its better to send them now,
3751          * not waiting net_rx_action() end.
3752          */
3753         if (sd->rps_ipi_list) {
3754                 local_irq_disable();
3755                 net_rps_action_and_irq_enable(sd);
3756         }
3757 #endif
3758         napi->weight = weight_p;
3759         local_irq_disable();
3760         while (work < quota) {
3761                 struct sk_buff *skb;
3762                 unsigned int qlen;
3763
3764                 while ((skb = __skb_dequeue(&sd->process_queue))) {
3765                         local_irq_enable();
3766                         __netif_receive_skb(skb);
3767                         local_irq_disable();
3768                         input_queue_head_incr(sd);
3769                         if (++work >= quota) {
3770                                 local_irq_enable();
3771                                 return work;
3772                         }
3773                 }
3774
3775                 rps_lock(sd);
3776                 qlen = skb_queue_len(&sd->input_pkt_queue);
3777                 if (qlen)
3778                         skb_queue_splice_tail_init(&sd->input_pkt_queue,
3779                                                    &sd->process_queue);
3780
3781                 if (qlen < quota - work) {
3782                         /*
3783                          * Inline a custom version of __napi_complete().
3784                          * only current cpu owns and manipulates this napi,
3785                          * and NAPI_STATE_SCHED is the only possible flag set on backlog.
3786                          * we can use a plain write instead of clear_bit(),
3787                          * and we dont need an smp_mb() memory barrier.
3788                          */
3789                         list_del(&napi->poll_list);
3790                         napi->state = 0;
3791
3792                         quota = work + qlen;
3793                 }
3794                 rps_unlock(sd);
3795         }
3796         local_irq_enable();
3797
3798         return work;
3799 }
3800
3801 /**
3802  * __napi_schedule - schedule for receive
3803  * @n: entry to schedule
3804  *
3805  * The entry's receive function will be scheduled to run
3806  */
3807 void __napi_schedule(struct napi_struct *n)
3808 {
3809         unsigned long flags;
3810
3811         local_irq_save(flags);
3812         ____napi_schedule(&__get_cpu_var(softnet_data), n);
3813         local_irq_restore(flags);
3814 }
3815 EXPORT_SYMBOL(__napi_schedule);
3816
3817 void __napi_complete(struct napi_struct *n)
3818 {
3819         BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
3820         BUG_ON(n->gro_list);
3821
3822         list_del(&n->poll_list);
3823         smp_mb__before_clear_bit();
3824         clear_bit(NAPI_STATE_SCHED, &n->state);
3825 }
3826 EXPORT_SYMBOL(__napi_complete);
3827
3828 void napi_complete(struct napi_struct *n)
3829 {
3830         unsigned long flags;
3831
3832         /*
3833          * don't let napi dequeue from the cpu poll list
3834          * just in case its running on a different cpu
3835          */
3836         if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
3837                 return;
3838
3839         napi_gro_flush(n);
3840         local_irq_save(flags);
3841         __napi_complete(n);
3842         local_irq_restore(flags);
3843 }
3844 EXPORT_SYMBOL(napi_complete);
3845
3846 void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
3847                     int (*poll)(struct napi_struct *, int), int weight)
3848 {
3849         INIT_LIST_HEAD(&napi->poll_list);
3850         napi->gro_count = 0;
3851         napi->gro_list = NULL;
3852         napi->skb = NULL;
3853         napi->poll = poll;
3854         napi->weight = weight;
3855         list_add(&napi->dev_list, &dev->napi_list);
3856         napi->dev = dev;
3857 #ifdef CONFIG_NETPOLL
3858         spin_lock_init(&napi->poll_lock);
3859         napi->poll_owner = -1;
3860 #endif
3861         set_bit(NAPI_STATE_SCHED, &napi->state);
3862 }
3863 EXPORT_SYMBOL(netif_napi_add);
3864
3865 void netif_napi_del(struct napi_struct *napi)
3866 {
3867         struct sk_buff *skb, *next;
3868
3869         list_del_init(&napi->dev_list);
3870         napi_free_frags(napi);
3871
3872         for (skb = napi->gro_list; skb; skb = next) {
3873                 next = skb->next;
3874                 skb->next = NULL;
3875                 kfree_skb(skb);
3876         }
3877
3878         napi->gro_list = NULL;
3879         napi->gro_count = 0;
3880 }
3881 EXPORT_SYMBOL(netif_napi_del);
3882
3883 static void net_rx_action(struct softirq_action *h)
3884 {
3885         struct softnet_data *sd = &__get_cpu_var(softnet_data);
3886         unsigned long time_limit = jiffies + 2;
3887         int budget = netdev_budget;
3888         void *have;
3889
3890         local_irq_disable();
3891
3892         while (!list_empty(&sd->poll_list)) {
3893                 struct napi_struct *n;
3894                 int work, weight;
3895
3896                 /* If softirq window is exhuasted then punt.
3897                  * Allow this to run for 2 jiffies since which will allow
3898                  * an average latency of 1.5/HZ.
3899                  */
3900                 if (unlikely(budget <= 0 || time_after(jiffies, time_limit)))
3901                         goto softnet_break;
3902
3903                 local_irq_enable();
3904
3905                 /* Even though interrupts have been re-enabled, this
3906                  * access is safe because interrupts can only add new
3907                  * entries to the tail of this list, and only ->poll()
3908                  * calls can remove this head entry from the list.
3909                  */
3910                 n = list_first_entry(&sd->poll_list, struct napi_struct, poll_list);
3911
3912                 have = netpoll_poll_lock(n);
3913
3914                 weight = n->weight;
3915
3916                 /* This NAPI_STATE_SCHED test is for avoiding a race
3917                  * with netpoll's poll_napi().  Only the entity which
3918                  * obtains the lock and sees NAPI_STATE_SCHED set will
3919                  * actually make the ->poll() call.  Therefore we avoid
3920                  * accidentally calling ->poll() when NAPI is not scheduled.
3921                  */
3922                 work = 0;
3923                 if (test_bit(NAPI_STATE_SCHED, &n->state)) {
3924                         work = n->poll(n, weight);
3925                         trace_napi_poll(n);
3926                 }
3927
3928                 WARN_ON_ONCE(work > weight);
3929
3930                 budget -= work;
3931
3932                 local_irq_disable();
3933
3934                 /* Drivers must not modify the NAPI state if they
3935                  * consume the entire weight.  In such cases this code
3936                  * still "owns" the NAPI instance and therefore can
3937                  * move the instance around on the list at-will.
3938                  */
3939                 if (unlikely(work == weight)) {
3940                         if (unlikely(napi_disable_pending(n))) {
3941                                 local_irq_enable();
3942                                 napi_complete(n);
3943                                 local_irq_disable();
3944                         } else
3945                                 list_move_tail(&n->poll_list, &sd->poll_list);
3946                 }
3947
3948                 netpoll_poll_unlock(have);
3949         }
3950 out:
3951         net_rps_action_and_irq_enable(sd);
3952
3953 #ifdef CONFIG_NET_DMA
3954         /*
3955          * There may not be any more sk_buffs coming right now, so push
3956          * any pending DMA copies to hardware
3957          */
3958         dma_issue_pending_all();
3959 #endif
3960
3961         return;
3962
3963 softnet_break:
3964         sd->time_squeeze++;
3965         __raise_softirq_irqoff(NET_RX_SOFTIRQ);
3966         goto out;
3967 }
3968
3969 static gifconf_func_t *gifconf_list[NPROTO];
3970
3971 /**
3972  *      register_gifconf        -       register a SIOCGIF handler
3973  *      @family: Address family
3974  *      @gifconf: Function handler
3975  *
3976  *      Register protocol dependent address dumping routines. The handler
3977  *      that is passed must not be freed or reused until it has been replaced
3978  *      by another handler.
3979  */
3980 int register_gifconf(unsigned int family, gifconf_func_t *gifconf)
3981 {
3982         if (family >= NPROTO)
3983                 return -EINVAL;
3984         gifconf_list[family] = gifconf;
3985         return 0;
3986 }
3987 EXPORT_SYMBOL(register_gifconf);
3988
3989
3990 /*
3991  *      Map an interface index to its name (SIOCGIFNAME)
3992  */
3993
3994 /*
3995  *      We need this ioctl for efficient implementation of the
3996  *      if_indextoname() function required by the IPv6 API.  Without
3997  *      it, we would have to search all the interfaces to find a
3998  *      match.  --pb
3999  */
4000
4001 static int dev_ifname(struct net *net, struct ifreq __user *arg)
4002 {
4003         struct net_device *dev;
4004         struct ifreq ifr;
4005
4006         /*
4007          *      Fetch the caller's info block.
4008          */
4009
4010         if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
4011                 return -EFAULT;
4012
4013         rcu_read_lock();
4014         dev = dev_get_by_index_rcu(net, ifr.ifr_ifindex);
4015         if (!dev) {
4016                 rcu_read_unlock();
4017                 return -ENODEV;
4018         }
4019
4020         strcpy(ifr.ifr_name, dev->name);
4021         rcu_read_unlock();
4022
4023         if (copy_to_user(arg, &ifr, sizeof(struct ifreq)))
4024                 return -EFAULT;
4025         return 0;
4026 }
4027
4028 /*
4029  *      Perform a SIOCGIFCONF call. This structure will change
4030  *      size eventually, and there is nothing I can do about it.
4031  *      Thus we will need a 'compatibility mode'.
4032  */
4033
4034 static int dev_ifconf(struct net *net, char __user *arg)
4035 {
4036         struct ifconf ifc;
4037         struct net_device *dev;
4038         char __user *pos;
4039         int len;
4040         int total;
4041         int i;
4042
4043         /*
4044          *      Fetch the caller's info block.
4045          */
4046
4047         if (copy_from_user(&ifc, arg, sizeof(struct ifconf)))
4048                 return -EFAULT;
4049
4050         pos = ifc.ifc_buf;
4051         len = ifc.ifc_len;
4052
4053         /*
4054          *      Loop over the interfaces, and write an info block for each.
4055          */
4056
4057         total = 0;
4058         for_each_netdev(net, dev) {
4059                 for (i = 0; i < NPROTO; i++) {
4060                         if (gifconf_list[i]) {
4061                                 int done;
4062                                 if (!pos)
4063                                         done = gifconf_list[i](dev, NULL, 0);
4064                                 else
4065                                         done = gifconf_list[i](dev, pos + total,
4066                                                                len - total);
4067                                 if (done < 0)
4068                                         return -EFAULT;
4069                                 total += done;
4070                         }
4071                 }
4072         }
4073
4074         /*
4075          *      All done.  Write the updated control block back to the caller.
4076          */
4077         ifc.ifc_len = total;
4078
4079         /*
4080          *      Both BSD and Solaris return 0 here, so we do too.
4081          */
4082         return copy_to_user(arg, &ifc, sizeof(struct ifconf)) ? -EFAULT : 0;
4083 }
4084
4085 #ifdef CONFIG_PROC_FS
4086
4087 #define BUCKET_SPACE (32 - NETDEV_HASHBITS)
4088
4089 struct dev_iter_state {
4090         struct seq_net_private p;
4091         unsigned int pos; /* bucket << BUCKET_SPACE + offset */
4092 };
4093
4094 #define get_bucket(x) ((x) >> BUCKET_SPACE)
4095 #define get_offset(x) ((x) & ((1 << BUCKET_SPACE) - 1))
4096 #define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o))
4097
4098 static inline struct net_device *dev_from_same_bucket(struct seq_file *seq)
4099 {
4100         struct dev_iter_state *state = seq->private;
4101         struct net *net = seq_file_net(seq);
4102         struct net_device *dev;
4103         struct hlist_node *p;
4104         struct hlist_head *h;
4105         unsigned int count, bucket, offset;
4106
4107         bucket = get_bucket(state->pos);
4108         offset = get_offset(state->pos);
4109         h = &net->dev_name_head[bucket];
4110         count = 0;
4111         hlist_for_each_entry_rcu(dev, p, h, name_hlist) {
4112                 if (count++ == offset) {
4113                         state->pos = set_bucket_offset(bucket, count);
4114                         return dev;
4115                 }
4116         }
4117
4118         return NULL;
4119 }
4120
4121 static inline struct net_device *dev_from_new_bucket(struct seq_file *seq)
4122 {
4123         struct dev_iter_state *state = seq->private;
4124         struct net_device *dev;
4125         unsigned int bucket;
4126
4127         bucket = get_bucket(state->pos);
4128         do {
4129                 dev = dev_from_same_bucket(seq);
4130                 if (dev)
4131                         return dev;
4132
4133                 bucket++;
4134                 state->pos = set_bucket_offset(bucket, 0);
4135         } while (bucket < NETDEV_HASHENTRIES);
4136
4137         return NULL;
4138 }
4139
4140 /*
4141  *      This is invoked by the /proc filesystem handler to display a device
4142  *      in detail.
4143  */
4144 void *dev_seq_start(struct seq_file *seq, loff_t *pos)
4145         __acquires(RCU)
4146 {
4147         struct dev_iter_state *state = seq->private;
4148
4149         rcu_read_lock();
4150         if (!*pos)
4151                 return SEQ_START_TOKEN;
4152
4153         /* check for end of the hash */
4154         if (state->pos == 0 && *pos > 1)
4155                 return NULL;
4156
4157         return dev_from_new_bucket(seq);
4158 }
4159
4160 void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4161 {
4162         struct net_device *dev;
4163
4164         ++*pos;
4165
4166         if (v == SEQ_START_TOKEN)
4167                 return dev_from_new_bucket(seq);
4168
4169         dev = dev_from_same_bucket(seq);
4170         if (dev)
4171                 return dev;
4172
4173         return dev_from_new_bucket(seq);
4174 }
4175
4176 void dev_seq_stop(struct seq_file *seq, void *v)
4177         __releases(RCU)
4178 {
4179         rcu_read_unlock();
4180 }
4181
4182 static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev)
4183 {
4184         struct rtnl_link_stats64 temp;
4185         const struct rtnl_link_stats64 *stats = dev_get_stats(dev, &temp);
4186
4187         seq_printf(seq, "%6s: %7llu %7llu %4llu %4llu %4llu %5llu %10llu %9llu "
4188                    "%8llu %7llu %4llu %4llu %4llu %5llu %7llu %10llu\n",
4189                    dev->name, stats->rx_bytes, stats->rx_packets,
4190                    stats->rx_errors,
4191                    stats->rx_dropped + stats->rx_missed_errors,
4192                    stats->rx_fifo_errors,
4193                    stats->rx_length_errors + stats->rx_over_errors +
4194                     stats->rx_crc_errors + stats->rx_frame_errors,
4195                    stats->rx_compressed, stats->multicast,
4196                    stats->tx_bytes, stats->tx_packets,
4197                    stats->tx_errors, stats->tx_dropped,
4198                    stats->tx_fifo_errors, stats->collisions,
4199                    stats->tx_carrier_errors +
4200                     stats->tx_aborted_errors +
4201                     stats->tx_window_errors +
4202                     stats->tx_heartbeat_errors,
4203                    stats->tx_compressed);
4204 }
4205
4206 /*
4207  *      Called from the PROCfs module. This now uses the new arbitrary sized
4208  *      /proc/net interface to create /proc/net/dev
4209  */
4210 static int dev_seq_show(struct seq_file *seq, void *v)
4211 {
4212         if (v == SEQ_START_TOKEN)
4213                 seq_puts(seq, "Inter-|   Receive                            "
4214                               "                    |  Transmit\n"
4215                               " face |bytes    packets errs drop fifo frame "
4216                               "compressed multicast|bytes    packets errs "
4217                               "drop fifo colls carrier compressed\n");
4218         else
4219                 dev_seq_printf_stats(seq, v);
4220         return 0;
4221 }
4222
4223 static struct softnet_data *softnet_get_online(loff_t *pos)
4224 {
4225         struct softnet_data *sd = NULL;
4226
4227         while (*pos < nr_cpu_ids)
4228                 if (cpu_online(*pos)) {
4229                         sd = &per_cpu(softnet_data, *pos);
4230                         break;
4231                 } else
4232                         ++*pos;
4233         return sd;
4234 }
4235
4236 static void *softnet_seq_start(struct seq_file *seq, loff_t *pos)
4237 {
4238         return softnet_get_online(pos);
4239 }
4240
4241 static void *softnet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4242 {
4243         ++*pos;
4244         return softnet_get_online(pos);
4245 }
4246
4247 static void softnet_seq_stop(struct seq_file *seq, void *v)
4248 {
4249 }
4250
4251 static int softnet_seq_show(struct seq_file *seq, void *v)
4252 {
4253         struct softnet_data *sd = v;
4254
4255         seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
4256                    sd->processed, sd->dropped, sd->time_squeeze, 0,
4257                    0, 0, 0, 0, /* was fastroute */
4258                    sd->cpu_collision, sd->received_rps);
4259         return 0;
4260 }
4261
4262 static const struct seq_operations dev_seq_ops = {
4263         .start = dev_seq_start,
4264         .next  = dev_seq_next,
4265         .stop  = dev_seq_stop,
4266         .show  = dev_seq_show,
4267 };
4268
4269 static int dev_seq_open(struct inode *inode, struct file *file)
4270 {
4271         return seq_open_net(inode, file, &dev_seq_ops,
4272                             sizeof(struct dev_iter_state));
4273 }
4274
4275 static const struct file_operations dev_seq_fops = {
4276         .owner   = THIS_MODULE,
4277         .open    = dev_seq_open,
4278         .read    = seq_read,
4279         .llseek  = seq_lseek,
4280         .release = seq_release_net,
4281 };
4282
4283 static const struct seq_operations softnet_seq_ops = {
4284         .start = softnet_seq_start,
4285         .next  = softnet_seq_next,
4286         .stop  = softnet_seq_stop,
4287         .show  = softnet_seq_show,
4288 };
4289
4290 static int softnet_seq_open(struct inode *inode, struct file *file)
4291 {
4292         return seq_open(file, &softnet_seq_ops);
4293 }
4294
4295 static const struct file_operations softnet_seq_fops = {
4296         .owner   = THIS_MODULE,
4297         .open    = softnet_seq_open,
4298         .read    = seq_read,
4299         .llseek  = seq_lseek,
4300         .release = seq_release,
4301 };
4302
4303 static void *ptype_get_idx(loff_t pos)
4304 {
4305         struct packet_type *pt = NULL;
4306         loff_t i = 0;
4307         int t;
4308
4309         list_for_each_entry_rcu(pt, &ptype_all, list) {
4310                 if (i == pos)
4311                         return pt;
4312                 ++i;
4313         }
4314
4315         for (t = 0; t < PTYPE_HASH_SIZE; t++) {
4316                 list_for_each_entry_rcu(pt, &ptype_base[t], list) {
4317                         if (i == pos)
4318                                 return pt;
4319                         ++i;
4320                 }
4321         }
4322         return NULL;
4323 }
4324
4325 static void *ptype_seq_start(struct seq_file *seq, loff_t *pos)
4326         __acquires(RCU)
4327 {
4328         rcu_read_lock();
4329         return *pos ? ptype_get_idx(*pos - 1) : SEQ_START_TOKEN;
4330 }
4331
4332 static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4333 {
4334         struct packet_type *pt;
4335         struct list_head *nxt;
4336         int hash;
4337
4338         ++*pos;
4339         if (v == SEQ_START_TOKEN)
4340                 return ptype_get_idx(0);
4341
4342         pt = v;
4343         nxt = pt->list.next;
4344         if (pt->type == htons(ETH_P_ALL)) {
4345                 if (nxt != &ptype_all)
4346                         goto found;
4347                 hash = 0;
4348                 nxt = ptype_base[0].next;
4349         } else
4350                 hash = ntohs(pt->type) & PTYPE_HASH_MASK;
4351
4352         while (nxt == &ptype_base[hash]) {
4353                 if (++hash >= PTYPE_HASH_SIZE)
4354                         return NULL;
4355                 nxt = ptype_base[hash].next;
4356         }
4357 found:
4358         return list_entry(nxt, struct packet_type, list);
4359 }
4360
4361 static void ptype_seq_stop(struct seq_file *seq, void *v)
4362         __releases(RCU)
4363 {
4364         rcu_read_unlock();
4365 }
4366
4367 static int ptype_seq_show(struct seq_file *seq, void *v)
4368 {
4369         struct packet_type *pt = v;
4370
4371         if (v == SEQ_START_TOKEN)
4372                 seq_puts(seq, "Type Device      Function\n");
4373         else if (pt->dev == NULL || dev_net(pt->dev) == seq_file_net(seq)) {
4374                 if (pt->type == htons(ETH_P_ALL))
4375                         seq_puts(seq, "ALL ");
4376                 else
4377                         seq_printf(seq, "%04x", ntohs(pt->type));
4378
4379                 seq_printf(seq, " %-8s %pF\n",
4380                            pt->dev ? pt->dev->name : "", pt->func);
4381         }
4382
4383         return 0;
4384 }
4385
4386 static const struct seq_operations ptype_seq_ops = {
4387         .start = ptype_seq_start,
4388         .next  = ptype_seq_next,
4389         .stop  = ptype_seq_stop,
4390         .show  = ptype_seq_show,
4391 };
4392
4393 static int ptype_seq_open(struct inode *inode, struct file *file)
4394 {
4395         return seq_open_net(inode, file, &ptype_seq_ops,
4396                         sizeof(struct seq_net_private));
4397 }
4398
4399 static const struct file_operations ptype_seq_fops = {
4400         .owner   = THIS_MODULE,
4401         .open    = ptype_seq_open,
4402         .read    = seq_read,
4403         .llseek  = seq_lseek,
4404         .release = seq_release_net,
4405 };
4406
4407
4408 static int __net_init dev_proc_net_init(struct net *net)
4409 {
4410         int rc = -ENOMEM;
4411
4412         if (!proc_net_fops_create(net, "dev", S_IRUGO, &dev_seq_fops))
4413                 goto out;
4414         if (!proc_net_fops_create(net, "softnet_stat", S_IRUGO, &softnet_seq_fops))
4415                 goto out_dev;
4416         if (!proc_net_fops_create(net, "ptype", S_IRUGO, &ptype_seq_fops))
4417                 goto out_softnet;
4418
4419         if (wext_proc_init(net))
4420                 goto out_ptype;
4421         rc = 0;
4422 out:
4423         return rc;
4424 out_ptype:
4425         proc_net_remove(net, "ptype");
4426 out_softnet:
4427         proc_net_remove(net, "softnet_stat");
4428 out_dev:
4429         proc_net_remove(net, "dev");
4430         goto out;
4431 }
4432
4433 static void __net_exit dev_proc_net_exit(struct net *net)
4434 {
4435         wext_proc_exit(net);
4436
4437         proc_net_remove(net, "ptype");
4438         proc_net_remove(net, "softnet_stat");
4439         proc_net_remove(net, "dev");
4440 }
4441
4442 static struct pernet_operations __net_initdata dev_proc_ops = {
4443         .init = dev_proc_net_init,
4444         .exit = dev_proc_net_exit,
4445 };
4446
4447 static int __init dev_proc_init(void)
4448 {
4449         return register_pernet_subsys(&dev_proc_ops);
4450 }
4451 #else
4452 #define dev_proc_init() 0
4453 #endif  /* CONFIG_PROC_FS */
4454
4455
4456 /**
4457  *      netdev_set_master       -       set up master pointer
4458  *      @slave: slave device
4459  *      @master: new master device
4460  *
4461  *      Changes the master device of the slave. Pass %NULL to break the
4462  *      bonding. The caller must hold the RTNL semaphore. On a failure
4463  *      a negative errno code is returned. On success the reference counts
4464  *      are adjusted and the function returns zero.
4465  */
4466 int netdev_set_master(struct net_device *slave, struct net_device *master)
4467 {
4468         struct net_device *old = slave->master;
4469
4470         ASSERT_RTNL();
4471
4472         if (master) {
4473                 if (old)
4474                         return -EBUSY;
4475                 dev_hold(master);
4476         }
4477
4478         slave->master = master;
4479
4480         if (old)
4481                 dev_put(old);
4482         return 0;
4483 }
4484 EXPORT_SYMBOL(netdev_set_master);
4485
4486 /**
4487  *      netdev_set_bond_master  -       set up bonding master/slave pair
4488  *      @slave: slave device
4489  *      @master: new master device
4490  *
4491  *      Changes the master device of the slave. Pass %NULL to break the
4492  *      bonding. The caller must hold the RTNL semaphore. On a failure
4493  *      a negative errno code is returned. On success %RTM_NEWLINK is sent
4494  *      to the routing socket and the function returns zero.
4495  */
4496 int netdev_set_bond_master(struct net_device *slave, struct net_device *master)
4497 {
4498         int err;
4499
4500         ASSERT_RTNL();
4501
4502         err = netdev_set_master(slave, master);
4503         if (err)
4504                 return err;
4505         if (master)
4506                 slave->flags |= IFF_SLAVE;
4507         else
4508                 slave->flags &= ~IFF_SLAVE;
4509
4510         rtmsg_ifinfo(RTM_NEWLINK, slave, IFF_SLAVE);
4511         return 0;
4512 }
4513 EXPORT_SYMBOL(netdev_set_bond_master);
4514
4515 static void dev_change_rx_flags(struct net_device *dev, int flags)
4516 {
4517         const struct net_device_ops *ops = dev->netdev_ops;
4518
4519         if ((dev->flags & IFF_UP) && ops->ndo_change_rx_flags)
4520                 ops->ndo_change_rx_flags(dev, flags);
4521 }
4522
4523 static int __dev_set_promiscuity(struct net_device *dev, int inc)
4524 {
4525         unsigned short old_flags = dev->flags;
4526         uid_t uid;
4527         gid_t gid;
4528
4529         ASSERT_RTNL();
4530
4531         dev->flags |= IFF_PROMISC;
4532         dev->promiscuity += inc;
4533         if (dev->promiscuity == 0) {
4534                 /*
4535                  * Avoid overflow.
4536                  * If inc causes overflow, untouch promisc and return error.
4537                  */
4538                 if (inc < 0)
4539                         dev->flags &= ~IFF_PROMISC;
4540                 else {
4541                         dev->promiscuity -= inc;
4542                         printk(KERN_WARNING "%s: promiscuity touches roof, "
4543                                 "set promiscuity failed, promiscuity feature "
4544                                 "of device might be broken.\n", dev->name);
4545                         return -EOVERFLOW;
4546                 }
4547         }
4548         if (dev->flags != old_flags) {
4549                 printk(KERN_INFO "device %s %s promiscuous mode\n",
4550                        dev->name, (dev->flags & IFF_PROMISC) ? "entered" :
4551                                                                "left");
4552                 if (audit_enabled) {
4553                         current_uid_gid(&uid, &gid);
4554                         audit_log(current->audit_context, GFP_ATOMIC,
4555                                 AUDIT_ANOM_PROMISCUOUS,
4556                                 "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
4557                                 dev->name, (dev->flags & IFF_PROMISC),
4558                                 (old_flags & IFF_PROMISC),
4559                                 audit_get_loginuid(current),
4560                                 uid, gid,
4561                                 audit_get_sessionid(current));
4562                 }
4563
4564                 dev_change_rx_flags(dev, IFF_PROMISC);
4565         }
4566         return 0;
4567 }
4568
4569 /**
4570  *      dev_set_promiscuity     - update promiscuity count on a device
4571  *      @dev: device
4572  *      @inc: modifier
4573  *
4574  *      Add or remove promiscuity from a device. While the count in the device
4575  *      remains above zero the interface remains promiscuous. Once it hits zero
4576  *      the device reverts back to normal filtering operation. A negative inc
4577  *      value is used to drop promiscuity on the device.
4578  *      Return 0 if successful or a negative errno code on error.
4579  */
4580 int dev_set_promiscuity(struct net_device *dev, int inc)
4581 {
4582         unsigned short old_flags = dev->flags;
4583         int err;
4584
4585         err = __dev_set_promiscuity(dev, inc);
4586         if (err < 0)
4587                 return err;
4588         if (dev->flags != old_flags)
4589                 dev_set_rx_mode(dev);
4590         return err;
4591 }
4592 EXPORT_SYMBOL(dev_set_promiscuity);
4593
4594 /**
4595  *      dev_set_allmulti        - update allmulti count on a device
4596  *      @dev: device
4597  *      @inc: modifier
4598  *
4599  *      Add or remove reception of all multicast frames to a device. While the
4600  *      count in the device remains above zero the interface remains listening
4601  *      to all interfaces. Once it hits zero the device reverts back to normal
4602  *      filtering operation. A negative @inc value is used to drop the counter
4603  *      when releasing a resource needing all multicasts.
4604  *      Return 0 if successful or a negative errno code on error.
4605  */
4606
4607 int dev_set_allmulti(struct net_device *dev, int inc)
4608 {
4609         unsigned short old_flags = dev->flags;
4610
4611         ASSERT_RTNL();
4612
4613         dev->flags |= IFF_ALLMULTI;
4614         dev->allmulti += inc;
4615         if (dev->allmulti == 0) {
4616                 /*
4617                  * Avoid overflow.
4618                  * If inc causes overflow, untouch allmulti and return error.
4619                  */
4620                 if (inc < 0)
4621                         dev->flags &= ~IFF_ALLMULTI;
4622                 else {
4623                         dev->allmulti -= inc;
4624                         printk(KERN_WARNING "%s: allmulti touches roof, "
4625                                 "set allmulti failed, allmulti feature of "
4626                                 "device might be broken.\n", dev->name);
4627                         return -EOVERFLOW;
4628                 }
4629         }
4630         if (dev->flags ^ old_flags) {
4631                 dev_change_rx_flags(dev, IFF_ALLMULTI);
4632                 dev_set_rx_mode(dev);
4633         }
4634         return 0;
4635 }
4636 EXPORT_SYMBOL(dev_set_allmulti);
4637
4638 /*
4639  *      Upload unicast and multicast address lists to device and
4640  *      configure RX filtering. When the device doesn't support unicast
4641  *      filtering it is put in promiscuous mode while unicast addresses
4642  *      are present.
4643  */
4644 void __dev_set_rx_mode(struct net_device *dev)
4645 {
4646         const struct net_device_ops *ops = dev->netdev_ops;
4647
4648         /* dev_open will call this function so the list will stay sane. */
4649         if (!(dev->flags&IFF_UP))
4650                 return;
4651
4652         if (!netif_device_present(dev))
4653                 return;
4654
4655         if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
4656                 /* Unicast addresses changes may only happen under the rtnl,
4657                  * therefore calling __dev_set_promiscuity here is safe.
4658                  */
4659                 if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
4660                         __dev_set_promiscuity(dev, 1);
4661                         dev->uc_promisc = true;
4662                 } else if (netdev_uc_empty(dev) && dev->uc_promisc) {
4663                         __dev_set_promiscuity(dev, -1);
4664                         dev->uc_promisc = false;
4665                 }
4666         }
4667
4668         if (ops->ndo_set_rx_mode)
4669                 ops->ndo_set_rx_mode(dev);
4670 }
4671
4672 void dev_set_rx_mode(struct net_device *dev)
4673 {
4674         netif_addr_lock_bh(dev);
4675         __dev_set_rx_mode(dev);
4676         netif_addr_unlock_bh(dev);
4677 }
4678
4679 /**
4680  *      dev_get_flags - get flags reported to userspace
4681  *      @dev: device
4682  *
4683  *      Get the combination of flag bits exported through APIs to userspace.
4684  */
4685 unsigned dev_get_flags(const struct net_device *dev)
4686 {
4687         unsigned flags;
4688
4689         flags = (dev->flags & ~(IFF_PROMISC |
4690                                 IFF_ALLMULTI |
4691                                 IFF_RUNNING |
4692                                 IFF_LOWER_UP |
4693                                 IFF_DORMANT)) |
4694                 (dev->gflags & (IFF_PROMISC |
4695                                 IFF_ALLMULTI));
4696
4697         if (netif_running(dev)) {
4698                 if (netif_oper_up(dev))
4699                         flags |= IFF_RUNNING;
4700                 if (netif_carrier_ok(dev))
4701                         flags |= IFF_LOWER_UP;
4702                 if (netif_dormant(dev))
4703                         flags |= IFF_DORMANT;
4704         }
4705
4706         return flags;
4707 }
4708 EXPORT_SYMBOL(dev_get_flags);
4709
4710 int __dev_change_flags(struct net_device *dev, unsigned int flags)
4711 {
4712         int old_flags = dev->flags;
4713         int ret;
4714
4715         ASSERT_RTNL();
4716
4717         /*
4718          *      Set the flags on our device.
4719          */
4720
4721         dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
4722                                IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
4723                                IFF_AUTOMEDIA)) |
4724                      (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
4725                                     IFF_ALLMULTI));
4726
4727         /*
4728          *      Load in the correct multicast list now the flags have changed.
4729          */
4730
4731         if ((old_flags ^ flags) & IFF_MULTICAST)
4732                 dev_change_rx_flags(dev, IFF_MULTICAST);
4733
4734         dev_set_rx_mode(dev);
4735
4736         /*
4737          *      Have we downed the interface. We handle IFF_UP ourselves
4738          *      according to user attempts to set it, rather than blindly
4739          *      setting it.
4740          */
4741
4742         ret = 0;
4743         if ((old_flags ^ flags) & IFF_UP) {     /* Bit is different  ? */
4744                 ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
4745
4746                 if (!ret)
4747                         dev_set_rx_mode(dev);
4748         }
4749
4750         if ((flags ^ dev->gflags) & IFF_PROMISC) {
4751                 int inc = (flags & IFF_PROMISC) ? 1 : -1;
4752
4753                 dev->gflags ^= IFF_PROMISC;
4754                 dev_set_promiscuity(dev, inc);
4755         }
4756
4757         /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
4758            is important. Some (broken) drivers set IFF_PROMISC, when
4759            IFF_ALLMULTI is requested not asking us and not reporting.
4760          */
4761         if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
4762                 int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
4763
4764                 dev->gflags ^= IFF_ALLMULTI;
4765                 dev_set_allmulti(dev, inc);
4766         }
4767
4768         return ret;
4769 }
4770
4771 void __dev_notify_flags(struct net_device *dev, unsigned int old_flags)
4772 {
4773         unsigned int changes = dev->flags ^ old_flags;
4774
4775         if (changes & IFF_UP) {
4776                 if (dev->flags & IFF_UP)
4777                         call_netdevice_notifiers(NETDEV_UP, dev);
4778                 else
4779                         call_netdevice_notifiers(NETDEV_DOWN, dev);
4780         }
4781
4782         if (dev->flags & IFF_UP &&
4783             (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE)))
4784                 call_netdevice_notifiers(NETDEV_CHANGE, dev);
4785 }
4786
4787 /**
4788  *      dev_change_flags - change device settings
4789  *      @dev: device
4790  *      @flags: device state flags
4791  *
4792  *      Change settings on device based state flags. The flags are
4793  *      in the userspace exported format.
4794  */
4795 int dev_change_flags(struct net_device *dev, unsigned flags)
4796 {
4797         int ret, changes;
4798         int old_flags = dev->flags;
4799
4800         ret = __dev_change_flags(dev, flags);
4801         if (ret < 0)
4802                 return ret;
4803
4804         changes = old_flags ^ dev->flags;
4805         if (changes)
4806                 rtmsg_ifinfo(RTM_NEWLINK, dev, changes);
4807
4808         __dev_notify_flags(dev, old_flags);
4809         return ret;
4810 }
4811 EXPORT_SYMBOL(dev_change_flags);
4812
4813 /**
4814  *      dev_set_mtu - Change maximum transfer unit
4815  *      @dev: device
4816  *      @new_mtu: new transfer unit
4817  *
4818  *      Change the maximum transfer size of the network device.
4819  */
4820 int dev_set_mtu(struct net_device *dev, int new_mtu)
4821 {
4822         const struct net_device_ops *ops = dev->netdev_ops;
4823         int err;
4824
4825         if (new_mtu == dev->mtu)
4826                 return 0;
4827
4828         /*      MTU must be positive.    */
4829         if (new_mtu < 0)
4830                 return -EINVAL;
4831
4832         if (!netif_device_present(dev))
4833                 return -ENODEV;
4834
4835         err = 0;
4836         if (ops->ndo_change_mtu)
4837                 err = ops->ndo_change_mtu(dev, new_mtu);
4838         else
4839                 dev->mtu = new_mtu;
4840
4841         if (!err && dev->flags & IFF_UP)
4842                 call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
4843         return err;
4844 }
4845 EXPORT_SYMBOL(dev_set_mtu);
4846
4847 /**
4848  *      dev_set_group - Change group this device belongs to
4849  *      @dev: device
4850  *      @new_group: group this device should belong to
4851  */
4852 void dev_set_group(struct net_device *dev, int new_group)
4853 {
4854         dev->group = new_group;
4855 }
4856 EXPORT_SYMBOL(dev_set_group);
4857
4858 /**
4859  *      dev_set_mac_address - Change Media Access Control Address
4860  *      @dev: device
4861  *      @sa: new address
4862  *
4863  *      Change the hardware (MAC) address of the device
4864  */
4865 int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
4866 {
4867         const struct net_device_ops *ops = dev->netdev_ops;
4868         int err;
4869
4870         if (!ops->ndo_set_mac_address)
4871                 return -EOPNOTSUPP;
4872         if (sa->sa_family != dev->type)
4873                 return -EINVAL;
4874         if (!netif_device_present(dev))
4875                 return -ENODEV;
4876         err = ops->ndo_set_mac_address(dev, sa);
4877         if (!err)
4878                 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
4879         return err;
4880 }
4881 EXPORT_SYMBOL(dev_set_mac_address);
4882
4883 /*
4884  *      Perform the SIOCxIFxxx calls, inside rcu_read_lock()
4885  */
4886 static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cmd)
4887 {
4888         int err;
4889         struct net_device *dev = dev_get_by_name_rcu(net, ifr->ifr_name);
4890
4891         if (!dev)
4892                 return -ENODEV;
4893
4894         switch (cmd) {
4895         case SIOCGIFFLAGS:      /* Get interface flags */
4896                 ifr->ifr_flags = (short) dev_get_flags(dev);
4897                 return 0;
4898
4899         case SIOCGIFMETRIC:     /* Get the metric on the interface
4900                                    (currently unused) */
4901                 ifr->ifr_metric = 0;
4902                 return 0;
4903
4904         case SIOCGIFMTU:        /* Get the MTU of a device */
4905                 ifr->ifr_mtu = dev->mtu;
4906                 return 0;
4907
4908         case SIOCGIFHWADDR:
4909                 if (!dev->addr_len)
4910                         memset(ifr->ifr_hwaddr.sa_data, 0, sizeof ifr->ifr_hwaddr.sa_data);
4911                 else
4912                         memcpy(ifr->ifr_hwaddr.sa_data, dev->dev_addr,
4913                                min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
4914                 ifr->ifr_hwaddr.sa_family = dev->type;
4915                 return 0;
4916
4917         case SIOCGIFSLAVE:
4918                 err = -EINVAL;
4919                 break;
4920
4921         case SIOCGIFMAP:
4922                 ifr->ifr_map.mem_start = dev->mem_start;
4923                 ifr->ifr_map.mem_end   = dev->mem_end;
4924                 ifr->ifr_map.base_addr = dev->base_addr;
4925                 ifr->ifr_map.irq       = dev->irq;
4926                 ifr->ifr_map.dma       = dev->dma;
4927                 ifr->ifr_map.port      = dev->if_port;
4928                 return 0;
4929
4930         case SIOCGIFINDEX:
4931                 ifr->ifr_ifindex = dev->ifindex;
4932                 return 0;
4933
4934         case SIOCGIFTXQLEN:
4935                 ifr->ifr_qlen = dev->tx_queue_len;
4936                 return 0;
4937
4938         default:
4939                 /* dev_ioctl() should ensure this case
4940                  * is never reached
4941                  */
4942                 WARN_ON(1);
4943                 err = -ENOTTY;
4944                 break;
4945
4946         }
4947         return err;
4948 }
4949
4950 /*
4951  *      Perform the SIOCxIFxxx calls, inside rtnl_lock()
4952  */
4953 static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd)
4954 {
4955         int err;
4956         struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name);
4957         const struct net_device_ops *ops;
4958
4959         if (!dev)
4960                 return -ENODEV;
4961
4962         ops = dev->netdev_ops;
4963
4964         switch (cmd) {
4965         case SIOCSIFFLAGS:      /* Set interface flags */
4966                 return dev_change_flags(dev, ifr->ifr_flags);
4967
4968         case SIOCSIFMETRIC:     /* Set the metric on the interface
4969                                    (currently unused) */
4970                 return -EOPNOTSUPP;
4971
4972         case SIOCSIFMTU:        /* Set the MTU of a device */
4973                 return dev_set_mtu(dev, ifr->ifr_mtu);
4974
4975         case SIOCSIFHWADDR:
4976                 return dev_set_mac_address(dev, &ifr->ifr_hwaddr);
4977
4978         case SIOCSIFHWBROADCAST:
4979                 if (ifr->ifr_hwaddr.sa_family != dev->type)
4980                         return -EINVAL;
4981                 memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data,
4982                        min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
4983                 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
4984                 return 0;
4985
4986         case SIOCSIFMAP:
4987                 if (ops->ndo_set_config) {
4988                         if (!netif_device_present(dev))
4989                                 return -ENODEV;
4990                         return ops->ndo_set_config(dev, &ifr->ifr_map);
4991                 }
4992                 return -EOPNOTSUPP;
4993
4994         case SIOCADDMULTI:
4995                 if (!ops->ndo_set_rx_mode ||
4996                     ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
4997                         return -EINVAL;
4998                 if (!netif_device_present(dev))
4999                         return -ENODEV;
5000                 return dev_mc_add_global(dev, ifr->ifr_hwaddr.sa_data);
5001
5002         case SIOCDELMULTI:
5003                 if (!ops->ndo_set_rx_mode ||
5004                     ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
5005                         return -EINVAL;
5006                 if (!netif_device_present(dev))
5007                         return -ENODEV;
5008                 return dev_mc_del_global(dev, ifr->ifr_hwaddr.sa_data);
5009
5010         case SIOCSIFTXQLEN:
5011                 if (ifr->ifr_qlen < 0)
5012                         return -EINVAL;
5013                 dev->tx_queue_len = ifr->ifr_qlen;
5014                 return 0;
5015
5016         case SIOCSIFNAME:
5017                 ifr->ifr_newname[IFNAMSIZ-1] = '\0';
5018                 return dev_change_name(dev, ifr->ifr_newname);
5019
5020         case SIOCSHWTSTAMP:
5021                 err = net_hwtstamp_validate(ifr);
5022                 if (err)
5023                         return err;
5024                 /* fall through */
5025
5026         /*
5027          *      Unknown or private ioctl
5028          */
5029         default:
5030                 if ((cmd >= SIOCDEVPRIVATE &&
5031                     cmd <= SIOCDEVPRIVATE + 15) ||
5032                     cmd == SIOCBONDENSLAVE ||
5033                     cmd == SIOCBONDRELEASE ||
5034                     cmd == SIOCBONDSETHWADDR ||
5035                     cmd == SIOCBONDSLAVEINFOQUERY ||
5036                     cmd == SIOCBONDINFOQUERY ||
5037                     cmd == SIOCBONDCHANGEACTIVE ||
5038                     cmd == SIOCGMIIPHY ||
5039                     cmd == SIOCGMIIREG ||
5040                     cmd == SIOCSMIIREG ||
5041                     cmd == SIOCBRADDIF ||
5042                     cmd == SIOCBRDELIF ||
5043                     cmd == SIOCSHWTSTAMP ||
5044                     cmd == SIOCWANDEV) {
5045                         err = -EOPNOTSUPP;
5046                         if (ops->ndo_do_ioctl) {
5047                                 if (netif_device_present(dev))
5048                                         err = ops->ndo_do_ioctl(dev, ifr, cmd);
5049                                 else
5050                                         err = -ENODEV;
5051                         }
5052                 } else
5053                         err = -EINVAL;
5054
5055         }
5056         return err;
5057 }
5058
5059 /*
5060  *      This function handles all "interface"-type I/O control requests. The actual
5061  *      'doing' part of this is dev_ifsioc above.
5062  */
5063
5064 /**
5065  *      dev_ioctl       -       network device ioctl
5066  *      @net: the applicable net namespace
5067  *      @cmd: command to issue
5068  *      @arg: pointer to a struct ifreq in user space
5069  *
5070  *      Issue ioctl functions to devices. This is normally called by the
5071  *      user space syscall interfaces but can sometimes be useful for
5072  *      other purposes. The return value is the return from the syscall if
5073  *      positive or a negative errno code on error.
5074  */
5075
5076 int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg)
5077 {
5078         struct ifreq ifr;
5079         int ret;
5080         char *colon;
5081
5082         /* One special case: SIOCGIFCONF takes ifconf argument
5083            and requires shared lock, because it sleeps writing
5084            to user space.
5085          */
5086
5087         if (cmd == SIOCGIFCONF) {
5088                 rtnl_lock();
5089                 ret = dev_ifconf(net, (char __user *) arg);
5090                 rtnl_unlock();
5091                 return ret;
5092         }
5093         if (cmd == SIOCGIFNAME)
5094                 return dev_ifname(net, (struct ifreq __user *)arg);
5095
5096         if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
5097                 return -EFAULT;
5098
5099         ifr.ifr_name[IFNAMSIZ-1] = 0;
5100
5101         colon = strchr(ifr.ifr_name, ':');
5102         if (colon)
5103                 *colon = 0;
5104
5105         /*
5106          *      See which interface the caller is talking about.
5107          */
5108
5109         switch (cmd) {
5110         /*
5111          *      These ioctl calls:
5112          *      - can be done by all.
5113          *      - atomic and do not require locking.
5114          *      - return a value
5115          */
5116         case SIOCGIFFLAGS:
5117         case SIOCGIFMETRIC:
5118         case SIOCGIFMTU:
5119         case SIOCGIFHWADDR:
5120         case SIOCGIFSLAVE:
5121         case SIOCGIFMAP:
5122         case SIOCGIFINDEX:
5123         case SIOCGIFTXQLEN:
5124                 dev_load(net, ifr.ifr_name);
5125                 rcu_read_lock();
5126                 ret = dev_ifsioc_locked(net, &ifr, cmd);
5127                 rcu_read_unlock();
5128                 if (!ret) {
5129                         if (colon)
5130                                 *colon = ':';
5131                         if (copy_to_user(arg, &ifr,
5132                                          sizeof(struct ifreq)))
5133                                 ret = -EFAULT;
5134                 }
5135                 return ret;
5136
5137         case SIOCETHTOOL:
5138                 dev_load(net, ifr.ifr_name);
5139                 rtnl_lock();
5140                 ret = dev_ethtool(net, &ifr);
5141                 rtnl_unlock();
5142                 if (!ret) {
5143                         if (colon)
5144                                 *colon = ':';
5145                         if (copy_to_user(arg, &ifr,
5146                                          sizeof(struct ifreq)))
5147                                 ret = -EFAULT;
5148                 }
5149                 return ret;
5150
5151         /*
5152          *      These ioctl calls:
5153          *      - require superuser power.
5154          *      - require strict serialization.
5155          *      - return a value
5156          */
5157         case SIOCGMIIPHY:
5158         case SIOCGMIIREG:
5159         case SIOCSIFNAME:
5160                 if (!capable(CAP_NET_ADMIN))
5161                         return -EPERM;
5162                 dev_load(net, ifr.ifr_name);
5163                 rtnl_lock();
5164                 ret = dev_ifsioc(net, &ifr, cmd);
5165                 rtnl_unlock();
5166                 if (!ret) {
5167                         if (colon)
5168                                 *colon = ':';
5169                         if (copy_to_user(arg, &ifr,
5170                                          sizeof(struct ifreq)))
5171                                 ret = -EFAULT;
5172                 }
5173                 return ret;
5174
5175         /*
5176          *      These ioctl calls:
5177          *      - require superuser power.
5178          *      - require strict serialization.
5179          *      - do not return a value
5180          */
5181         case SIOCSIFFLAGS:
5182         case SIOCSIFMETRIC:
5183         case SIOCSIFMTU:
5184         case SIOCSIFMAP:
5185         case SIOCSIFHWADDR:
5186         case SIOCSIFSLAVE:
5187         case SIOCADDMULTI:
5188         case SIOCDELMULTI:
5189         case SIOCSIFHWBROADCAST:
5190         case SIOCSIFTXQLEN:
5191         case SIOCSMIIREG:
5192         case SIOCBONDENSLAVE:
5193         case SIOCBONDRELEASE:
5194         case SIOCBONDSETHWADDR:
5195         case SIOCBONDCHANGEACTIVE:
5196         case SIOCBRADDIF:
5197         case SIOCBRDELIF:
5198         case SIOCSHWTSTAMP:
5199                 if (!capable(CAP_NET_ADMIN))
5200                         return -EPERM;
5201                 /* fall through */
5202         case SIOCBONDSLAVEINFOQUERY:
5203         case SIOCBONDINFOQUERY:
5204                 dev_load(net, ifr.ifr_name);
5205                 rtnl_lock();
5206                 ret = dev_ifsioc(net, &ifr, cmd);
5207                 rtnl_unlock();
5208                 return ret;
5209
5210         case SIOCGIFMEM:
5211                 /* Get the per device memory space. We can add this but
5212                  * currently do not support it */
5213         case SIOCSIFMEM:
5214                 /* Set the per device memory buffer space.
5215                  * Not applicable in our case */
5216         case SIOCSIFLINK:
5217                 return -ENOTTY;
5218
5219         /*
5220          *      Unknown or private ioctl.
5221          */
5222         default:
5223                 if (cmd == SIOCWANDEV ||
5224                     (cmd >= SIOCDEVPRIVATE &&
5225                      cmd <= SIOCDEVPRIVATE + 15)) {
5226                         dev_load(net, ifr.ifr_name);
5227                         rtnl_lock();
5228                         ret = dev_ifsioc(net, &ifr, cmd);
5229                         rtnl_unlock();
5230                         if (!ret && copy_to_user(arg, &ifr,
5231                                                  sizeof(struct ifreq)))
5232                                 ret = -EFAULT;
5233                         return ret;
5234                 }
5235                 /* Take care of Wireless Extensions */
5236                 if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST)
5237                         return wext_handle_ioctl(net, &ifr, cmd, arg);
5238                 return -ENOTTY;
5239         }
5240 }
5241
5242
5243 /**
5244  *      dev_new_index   -       allocate an ifindex
5245  *      @net: the applicable net namespace
5246  *
5247  *      Returns a suitable unique value for a new device interface
5248  *      number.  The caller must hold the rtnl semaphore or the
5249  *      dev_base_lock to be sure it remains unique.
5250  */
5251 static int dev_new_index(struct net *net)
5252 {
5253         static int ifindex;
5254         for (;;) {
5255                 if (++ifindex <= 0)
5256                         ifindex = 1;
5257                 if (!__dev_get_by_index(net, ifindex))
5258                         return ifindex;
5259         }
5260 }
5261
5262 /* Delayed registration/unregisteration */
5263 static LIST_HEAD(net_todo_list);
5264
5265 static void net_set_todo(struct net_device *dev)
5266 {
5267         list_add_tail(&dev->todo_list, &net_todo_list);
5268 }
5269
5270 static void rollback_registered_many(struct list_head *head)
5271 {
5272         struct net_device *dev, *tmp;
5273
5274         BUG_ON(dev_boot_phase);
5275         ASSERT_RTNL();
5276
5277         list_for_each_entry_safe(dev, tmp, head, unreg_list) {
5278                 /* Some devices call without registering
5279                  * for initialization unwind. Remove those
5280                  * devices and proceed with the remaining.
5281                  */
5282                 if (dev->reg_state == NETREG_UNINITIALIZED) {
5283                         pr_debug("unregister_netdevice: device %s/%p never "
5284                                  "was registered\n", dev->name, dev);
5285
5286                         WARN_ON(1);
5287                         list_del(&dev->unreg_list);
5288                         continue;
5289                 }
5290                 dev->dismantle = true;
5291                 BUG_ON(dev->reg_state != NETREG_REGISTERED);
5292         }
5293
5294         /* If device is running, close it first. */
5295         dev_close_many(head);
5296
5297         list_for_each_entry(dev, head, unreg_list) {
5298                 /* And unlink it from device chain. */
5299                 unlist_netdevice(dev);
5300
5301                 dev->reg_state = NETREG_UNREGISTERING;
5302         }
5303
5304         synchronize_net();
5305
5306         list_for_each_entry(dev, head, unreg_list) {
5307                 /* Shutdown queueing discipline. */
5308                 dev_shutdown(dev);
5309
5310
5311                 /* Notify protocols, that we are about to destroy
5312                    this device. They should clean all the things.
5313                 */
5314                 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5315
5316                 if (!dev->rtnl_link_ops ||
5317                     dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5318                         rtmsg_ifinfo(RTM_DELLINK, dev, ~0U);
5319
5320                 /*
5321                  *      Flush the unicast and multicast chains
5322                  */
5323                 dev_uc_flush(dev);
5324                 dev_mc_flush(dev);
5325
5326                 if (dev->netdev_ops->ndo_uninit)
5327                         dev->netdev_ops->ndo_uninit(dev);
5328
5329                 /* Notifier chain MUST detach us from master device. */
5330                 WARN_ON(dev->master);
5331
5332                 /* Remove entries from kobject tree */
5333                 netdev_unregister_kobject(dev);
5334         }
5335
5336         /* Process any work delayed until the end of the batch */
5337         dev = list_first_entry(head, struct net_device, unreg_list);
5338         call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev);
5339
5340         synchronize_net();
5341
5342         list_for_each_entry(dev, head, unreg_list)
5343                 dev_put(dev);
5344 }
5345
5346 static void rollback_registered(struct net_device *dev)
5347 {
5348         LIST_HEAD(single);
5349
5350         list_add(&dev->unreg_list, &single);
5351         rollback_registered_many(&single);
5352         list_del(&single);
5353 }
5354
5355 static netdev_features_t netdev_fix_features(struct net_device *dev,
5356         netdev_features_t features)
5357 {
5358         /* Fix illegal checksum combinations */
5359         if ((features & NETIF_F_HW_CSUM) &&
5360             (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5361                 netdev_warn(dev, "mixed HW and IP checksum settings.\n");
5362                 features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
5363         }
5364
5365         /* Fix illegal SG+CSUM combinations. */
5366         if ((features & NETIF_F_SG) &&
5367             !(features & NETIF_F_ALL_CSUM)) {
5368                 netdev_dbg(dev,
5369                         "Dropping NETIF_F_SG since no checksum feature.\n");
5370                 features &= ~NETIF_F_SG;
5371         }
5372
5373         /* TSO requires that SG is present as well. */
5374         if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
5375                 netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
5376                 features &= ~NETIF_F_ALL_TSO;
5377         }
5378
5379         /* TSO ECN requires that TSO is present as well. */
5380         if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
5381                 features &= ~NETIF_F_TSO_ECN;
5382
5383         /* Software GSO depends on SG. */
5384         if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
5385                 netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
5386                 features &= ~NETIF_F_GSO;
5387         }
5388
5389         /* UFO needs SG and checksumming */
5390         if (features & NETIF_F_UFO) {
5391                 /* maybe split UFO into V4 and V6? */
5392                 if (!((features & NETIF_F_GEN_CSUM) ||
5393                     (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))
5394                             == (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5395                         netdev_dbg(dev,
5396                                 "Dropping NETIF_F_UFO since no checksum offload features.\n");
5397                         features &= ~NETIF_F_UFO;
5398                 }
5399
5400                 if (!(features & NETIF_F_SG)) {
5401                         netdev_dbg(dev,
5402                                 "Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n");
5403                         features &= ~NETIF_F_UFO;
5404                 }
5405         }
5406
5407         return features;
5408 }
5409
5410 int __netdev_update_features(struct net_device *dev)
5411 {
5412         netdev_features_t features;
5413         int err = 0;
5414
5415         ASSERT_RTNL();
5416
5417         features = netdev_get_wanted_features(dev);
5418
5419         if (dev->netdev_ops->ndo_fix_features)
5420                 features = dev->netdev_ops->ndo_fix_features(dev, features);
5421
5422         /* driver might be less strict about feature dependencies */
5423         features = netdev_fix_features(dev, features);
5424
5425         if (dev->features == features)
5426                 return 0;
5427
5428         netdev_dbg(dev, "Features changed: %pNF -> %pNF\n",
5429                 &dev->features, &features);
5430
5431         if (dev->netdev_ops->ndo_set_features)
5432                 err = dev->netdev_ops->ndo_set_features(dev, features);
5433
5434         if (unlikely(err < 0)) {
5435                 netdev_err(dev,
5436                         "set_features() failed (%d); wanted %pNF, left %pNF\n",
5437                         err, &features, &dev->features);
5438                 return -1;
5439         }
5440
5441         if (!err)
5442                 dev->features = features;
5443
5444         return 1;
5445 }
5446
5447 /**
5448  *      netdev_update_features - recalculate device features
5449  *      @dev: the device to check
5450  *
5451  *      Recalculate dev->features set and send notifications if it
5452  *      has changed. Should be called after driver or hardware dependent
5453  *      conditions might have changed that influence the features.
5454  */
5455 void netdev_update_features(struct net_device *dev)
5456 {
5457         if (__netdev_update_features(dev))
5458                 netdev_features_change(dev);
5459 }
5460 EXPORT_SYMBOL(netdev_update_features);
5461
5462 /**
5463  *      netdev_change_features - recalculate device features
5464  *      @dev: the device to check
5465  *
5466  *      Recalculate dev->features set and send notifications even
5467  *      if they have not changed. Should be called instead of
5468  *      netdev_update_features() if also dev->vlan_features might
5469  *      have changed to allow the changes to be propagated to stacked
5470  *      VLAN devices.
5471  */
5472 void netdev_change_features(struct net_device *dev)
5473 {
5474         __netdev_update_features(dev);
5475         netdev_features_change(dev);
5476 }
5477 EXPORT_SYMBOL(netdev_change_features);
5478
5479 /**
5480  *      netif_stacked_transfer_operstate -      transfer operstate
5481  *      @rootdev: the root or lower level device to transfer state from
5482  *      @dev: the device to transfer operstate to
5483  *
5484  *      Transfer operational state from root to device. This is normally
5485  *      called when a stacking relationship exists between the root
5486  *      device and the device(a leaf device).
5487  */
5488 void netif_stacked_transfer_operstate(const struct net_device *rootdev,
5489                                         struct net_device *dev)
5490 {
5491         if (rootdev->operstate == IF_OPER_DORMANT)
5492                 netif_dormant_on(dev);
5493         else
5494                 netif_dormant_off(dev);
5495
5496         if (netif_carrier_ok(rootdev)) {
5497                 if (!netif_carrier_ok(dev))
5498                         netif_carrier_on(dev);
5499         } else {
5500                 if (netif_carrier_ok(dev))
5501                         netif_carrier_off(dev);
5502         }
5503 }
5504 EXPORT_SYMBOL(netif_stacked_transfer_operstate);
5505
5506 #ifdef CONFIG_RPS
5507 static int netif_alloc_rx_queues(struct net_device *dev)
5508 {
5509         unsigned int i, count = dev->num_rx_queues;
5510         struct netdev_rx_queue *rx;
5511
5512         BUG_ON(count < 1);
5513
5514         rx = kcalloc(count, sizeof(struct netdev_rx_queue), GFP_KERNEL);
5515         if (!rx) {
5516                 pr_err("netdev: Unable to allocate %u rx queues.\n", count);
5517                 return -ENOMEM;
5518         }
5519         dev->_rx = rx;
5520
5521         for (i = 0; i < count; i++)
5522                 rx[i].dev = dev;
5523         return 0;
5524 }
5525 #endif
5526
5527 static void netdev_init_one_queue(struct net_device *dev,
5528                                   struct netdev_queue *queue, void *_unused)
5529 {
5530         /* Initialize queue lock */
5531         spin_lock_init(&queue->_xmit_lock);
5532         netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
5533         queue->xmit_lock_owner = -1;
5534         netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
5535         queue->dev = dev;
5536 }
5537
5538 static int netif_alloc_netdev_queues(struct net_device *dev)
5539 {
5540         unsigned int count = dev->num_tx_queues;
5541         struct netdev_queue *tx;
5542
5543         BUG_ON(count < 1);
5544
5545         tx = kcalloc(count, sizeof(struct netdev_queue), GFP_KERNEL);
5546         if (!tx) {
5547                 pr_err("netdev: Unable to allocate %u tx queues.\n",
5548                        count);
5549                 return -ENOMEM;
5550         }
5551         dev->_tx = tx;
5552
5553         netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
5554         spin_lock_init(&dev->tx_global_lock);
5555
5556         return 0;
5557 }
5558
5559 /**
5560  *      register_netdevice      - register a network device
5561  *      @dev: device to register
5562  *
5563  *      Take a completed network device structure and add it to the kernel
5564  *      interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
5565  *      chain. 0 is returned on success. A negative errno code is returned
5566  *      on a failure to set up the device, or if the name is a duplicate.
5567  *
5568  *      Callers must hold the rtnl semaphore. You may want
5569  *      register_netdev() instead of this.
5570  *
5571  *      BUGS:
5572  *      The locking appears insufficient to guarantee two parallel registers
5573  *      will not get the same name.
5574  */
5575
5576 int register_netdevice(struct net_device *dev)
5577 {
5578         int ret;
5579         struct net *net = dev_net(dev);
5580
5581         BUG_ON(dev_boot_phase);
5582         ASSERT_RTNL();
5583
5584         might_sleep();
5585
5586         /* When net_device's are persistent, this will be fatal. */
5587         BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
5588         BUG_ON(!net);
5589
5590         spin_lock_init(&dev->addr_list_lock);
5591         netdev_set_addr_lockdep_class(dev);
5592
5593         dev->iflink = -1;
5594
5595         ret = dev_get_valid_name(dev, dev->name);
5596         if (ret < 0)
5597                 goto out;
5598
5599         /* Init, if this function is available */
5600         if (dev->netdev_ops->ndo_init) {
5601                 ret = dev->netdev_ops->ndo_init(dev);
5602                 if (ret) {
5603                         if (ret > 0)
5604                                 ret = -EIO;
5605                         goto out;
5606                 }
5607         }
5608
5609         dev->ifindex = dev_new_index(net);
5610         if (dev->iflink == -1)
5611                 dev->iflink = dev->ifindex;
5612
5613         /* Transfer changeable features to wanted_features and enable
5614          * software offloads (GSO and GRO).
5615          */
5616         dev->hw_features |= NETIF_F_SOFT_FEATURES;
5617         dev->features |= NETIF_F_SOFT_FEATURES;
5618         dev->wanted_features = dev->features & dev->hw_features;
5619
5620         /* Turn on no cache copy if HW is doing checksum */
5621         if (!(dev->flags & IFF_LOOPBACK)) {
5622                 dev->hw_features |= NETIF_F_NOCACHE_COPY;
5623                 if (dev->features & NETIF_F_ALL_CSUM) {
5624                         dev->wanted_features |= NETIF_F_NOCACHE_COPY;
5625                         dev->features |= NETIF_F_NOCACHE_COPY;
5626                 }
5627         }
5628
5629         /* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
5630          */
5631         dev->vlan_features |= NETIF_F_HIGHDMA;
5632
5633         ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
5634         ret = notifier_to_errno(ret);
5635         if (ret)
5636                 goto err_uninit;
5637
5638         ret = netdev_register_kobject(dev);
5639         if (ret)
5640                 goto err_uninit;
5641         dev->reg_state = NETREG_REGISTERED;
5642
5643         __netdev_update_features(dev);
5644
5645         /*
5646          *      Default initial state at registry is that the
5647          *      device is present.
5648          */
5649
5650         set_bit(__LINK_STATE_PRESENT, &dev->state);
5651
5652         dev_init_scheduler(dev);
5653         dev_hold(dev);
5654         list_netdevice(dev);
5655
5656         /* Notify protocols, that a new device appeared. */
5657         ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
5658         ret = notifier_to_errno(ret);
5659         if (ret) {
5660                 rollback_registered(dev);
5661                 dev->reg_state = NETREG_UNREGISTERED;
5662         }
5663         /*
5664          *      Prevent userspace races by waiting until the network
5665          *      device is fully setup before sending notifications.
5666          */
5667         if (!dev->rtnl_link_ops ||
5668             dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5669                 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
5670
5671 out:
5672         return ret;
5673
5674 err_uninit:
5675         if (dev->netdev_ops->ndo_uninit)
5676                 dev->netdev_ops->ndo_uninit(dev);
5677         goto out;
5678 }
5679 EXPORT_SYMBOL(register_netdevice);
5680
5681 /**
5682  *      init_dummy_netdev       - init a dummy network device for NAPI
5683  *      @dev: device to init
5684  *
5685  *      This takes a network device structure and initialize the minimum
5686  *      amount of fields so it can be used to schedule NAPI polls without
5687  *      registering a full blown interface. This is to be used by drivers
5688  *      that need to tie several hardware interfaces to a single NAPI
5689  *      poll scheduler due to HW limitations.
5690  */
5691 int init_dummy_netdev(struct net_device *dev)
5692 {
5693         /* Clear everything. Note we don't initialize spinlocks
5694          * are they aren't supposed to be taken by any of the
5695          * NAPI code and this dummy netdev is supposed to be
5696          * only ever used for NAPI polls
5697          */
5698         memset(dev, 0, sizeof(struct net_device));
5699
5700         /* make sure we BUG if trying to hit standard
5701          * register/unregister code path
5702          */
5703         dev->reg_state = NETREG_DUMMY;
5704
5705         /* NAPI wants this */
5706         INIT_LIST_HEAD(&dev->napi_list);
5707
5708         /* a dummy interface is started by default */
5709         set_bit(__LINK_STATE_PRESENT, &dev->state);
5710         set_bit(__LINK_STATE_START, &dev->state);
5711
5712         /* Note : We dont allocate pcpu_refcnt for dummy devices,
5713          * because users of this 'device' dont need to change
5714          * its refcount.
5715          */
5716
5717         return 0;
5718 }
5719 EXPORT_SYMBOL_GPL(init_dummy_netdev);
5720
5721
5722 /**
5723  *      register_netdev - register a network device
5724  *      @dev: device to register
5725  *
5726  *      Take a completed network device structure and add it to the kernel
5727  *      interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
5728  *      chain. 0 is returned on success. A negative errno code is returned
5729  *      on a failure to set up the device, or if the name is a duplicate.
5730  *
5731  *      This is a wrapper around register_netdevice that takes the rtnl semaphore
5732  *      and expands the device name if you passed a format string to
5733  *      alloc_netdev.
5734  */
5735 int register_netdev(struct net_device *dev)
5736 {
5737         int err;
5738
5739         rtnl_lock();
5740         err = register_netdevice(dev);
5741         rtnl_unlock();
5742         return err;
5743 }
5744 EXPORT_SYMBOL(register_netdev);
5745
5746 int netdev_refcnt_read(const struct net_device *dev)
5747 {
5748         int i, refcnt = 0;
5749
5750         for_each_possible_cpu(i)
5751                 refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
5752         return refcnt;
5753 }
5754 EXPORT_SYMBOL(netdev_refcnt_read);
5755
5756 /*
5757  * netdev_wait_allrefs - wait until all references are gone.
5758  *
5759  * This is called when unregistering network devices.
5760  *
5761  * Any protocol or device that holds a reference should register
5762  * for netdevice notification, and cleanup and put back the
5763  * reference if they receive an UNREGISTER event.
5764  * We can get stuck here if buggy protocols don't correctly
5765  * call dev_put.
5766  */
5767 static void netdev_wait_allrefs(struct net_device *dev)
5768 {
5769         unsigned long rebroadcast_time, warning_time;
5770         int refcnt;
5771
5772         linkwatch_forget_dev(dev);
5773
5774         rebroadcast_time = warning_time = jiffies;
5775         refcnt = netdev_refcnt_read(dev);
5776
5777         while (refcnt != 0) {
5778                 if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
5779                         rtnl_lock();
5780
5781                         /* Rebroadcast unregister notification */
5782                         call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5783                         /* don't resend NETDEV_UNREGISTER_BATCH, _BATCH users
5784                          * should have already handle it the first time */
5785
5786                         if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
5787                                      &dev->state)) {
5788                                 /* We must not have linkwatch events
5789                                  * pending on unregister. If this
5790                                  * happens, we simply run the queue
5791                                  * unscheduled, resulting in a noop
5792                                  * for this device.
5793                                  */
5794                                 linkwatch_run_queue();
5795                         }
5796
5797                         __rtnl_unlock();
5798
5799                         rebroadcast_time = jiffies;
5800                 }
5801
5802                 msleep(250);
5803
5804                 refcnt = netdev_refcnt_read(dev);
5805
5806                 if (time_after(jiffies, warning_time + 10 * HZ)) {
5807                         printk(KERN_EMERG "unregister_netdevice: "
5808                                "waiting for %s to become free. Usage "
5809                                "count = %d\n",
5810                                dev->name, refcnt);
5811                         warning_time = jiffies;
5812                 }
5813         }
5814 }
5815
5816 /* The sequence is:
5817  *
5818  *      rtnl_lock();
5819  *      ...
5820  *      register_netdevice(x1);
5821  *      register_netdevice(x2);
5822  *      ...
5823  *      unregister_netdevice(y1);
5824  *      unregister_netdevice(y2);
5825  *      ...
5826  *      rtnl_unlock();
5827  *      free_netdev(y1);
5828  *      free_netdev(y2);
5829  *
5830  * We are invoked by rtnl_unlock().
5831  * This allows us to deal with problems:
5832  * 1) We can delete sysfs objects which invoke hotplug
5833  *    without deadlocking with linkwatch via keventd.
5834  * 2) Since we run with the RTNL semaphore not held, we can sleep
5835  *    safely in order to wait for the netdev refcnt to drop to zero.
5836  *
5837  * We must not return until all unregister events added during
5838  * the interval the lock was held have been completed.
5839  */
5840 void netdev_run_todo(void)
5841 {
5842         struct list_head list;
5843
5844         /* Snapshot list, allow later requests */
5845         list_replace_init(&net_todo_list, &list);
5846
5847         __rtnl_unlock();
5848
5849         /* Wait for rcu callbacks to finish before attempting to drain
5850          * the device list.  This usually avoids a 250ms wait.
5851          */
5852         if (!list_empty(&list))
5853                 rcu_barrier();
5854
5855         while (!list_empty(&list)) {
5856                 struct net_device *dev
5857                         = list_first_entry(&list, struct net_device, todo_list);
5858                 list_del(&dev->todo_list);
5859
5860                 if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
5861                         printk(KERN_ERR "network todo '%s' but state %d\n",
5862                                dev->name, dev->reg_state);
5863                         dump_stack();
5864                         continue;
5865                 }
5866
5867                 dev->reg_state = NETREG_UNREGISTERED;
5868
5869                 on_each_cpu(flush_backlog, dev, 1);
5870
5871                 netdev_wait_allrefs(dev);
5872
5873                 /* paranoia */
5874                 BUG_ON(netdev_refcnt_read(dev));
5875                 WARN_ON(rcu_access_pointer(dev->ip_ptr));
5876                 WARN_ON(rcu_access_pointer(dev->ip6_ptr));
5877                 WARN_ON(dev->dn_ptr);
5878
5879                 if (dev->destructor)
5880                         dev->destructor(dev);
5881
5882                 /* Free network device */
5883                 kobject_put(&dev->dev.kobj);
5884         }
5885 }
5886
5887 /* Convert net_device_stats to rtnl_link_stats64.  They have the same
5888  * fields in the same order, with only the type differing.
5889  */
5890 static void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
5891                                     const struct net_device_stats *netdev_stats)
5892 {
5893 #if BITS_PER_LONG == 64
5894         BUILD_BUG_ON(sizeof(*stats64) != sizeof(*netdev_stats));
5895         memcpy(stats64, netdev_stats, sizeof(*stats64));
5896 #else
5897         size_t i, n = sizeof(*stats64) / sizeof(u64);
5898         const unsigned long *src = (const unsigned long *)netdev_stats;
5899         u64 *dst = (u64 *)stats64;
5900
5901         BUILD_BUG_ON(sizeof(*netdev_stats) / sizeof(unsigned long) !=
5902                      sizeof(*stats64) / sizeof(u64));
5903         for (i = 0; i < n; i++)
5904                 dst[i] = src[i];
5905 #endif
5906 }
5907
5908 /**
5909  *      dev_get_stats   - get network device statistics
5910  *      @dev: device to get statistics from
5911  *      @storage: place to store stats
5912  *
5913  *      Get network statistics from device. Return @storage.
5914  *      The device driver may provide its own method by setting
5915  *      dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
5916  *      otherwise the internal statistics structure is used.
5917  */
5918 struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
5919                                         struct rtnl_link_stats64 *storage)
5920 {
5921         const struct net_device_ops *ops = dev->netdev_ops;
5922
5923         if (ops->ndo_get_stats64) {
5924                 memset(storage, 0, sizeof(*storage));
5925                 ops->ndo_get_stats64(dev, storage);
5926         } else if (ops->ndo_get_stats) {
5927                 netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
5928         } else {
5929                 netdev_stats_to_stats64(storage, &dev->stats);
5930         }
5931         storage->rx_dropped += atomic_long_read(&dev->rx_dropped);
5932         return storage;
5933 }
5934 EXPORT_SYMBOL(dev_get_stats);
5935
5936 struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
5937 {
5938         struct netdev_queue *queue = dev_ingress_queue(dev);
5939
5940 #ifdef CONFIG_NET_CLS_ACT
5941         if (queue)
5942                 return queue;
5943         queue = kzalloc(sizeof(*queue), GFP_KERNEL);
5944         if (!queue)
5945                 return NULL;
5946         netdev_init_one_queue(dev, queue, NULL);
5947         queue->qdisc = &noop_qdisc;
5948         queue->qdisc_sleeping = &noop_qdisc;
5949         rcu_assign_pointer(dev->ingress_queue, queue);
5950 #endif
5951         return queue;
5952 }
5953
5954 /**
5955  *      alloc_netdev_mqs - allocate network device
5956  *      @sizeof_priv:   size of private data to allocate space for
5957  *      @name:          device name format string
5958  *      @setup:         callback to initialize device
5959  *      @txqs:          the number of TX subqueues to allocate
5960  *      @rxqs:          the number of RX subqueues to allocate
5961  *
5962  *      Allocates a struct net_device with private data area for driver use
5963  *      and performs basic initialization.  Also allocates subquue structs
5964  *      for each queue on the device.
5965  */
5966 struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
5967                 void (*setup)(struct net_device *),
5968                 unsigned int txqs, unsigned int rxqs)
5969 {
5970         struct net_device *dev;
5971         size_t alloc_size;
5972         struct net_device *p;
5973
5974         BUG_ON(strlen(name) >= sizeof(dev->name));
5975
5976         if (txqs < 1) {
5977                 pr_err("alloc_netdev: Unable to allocate device "
5978                        "with zero queues.\n");
5979                 return NULL;
5980         }
5981
5982 #ifdef CONFIG_RPS
5983         if (rxqs < 1) {
5984                 pr_err("alloc_netdev: Unable to allocate device "
5985                        "with zero RX queues.\n");
5986                 return NULL;
5987         }
5988 #endif
5989
5990         alloc_size = sizeof(struct net_device);
5991         if (sizeof_priv) {
5992                 /* ensure 32-byte alignment of private area */
5993                 alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
5994                 alloc_size += sizeof_priv;
5995         }
5996         /* ensure 32-byte alignment of whole construct */
5997         alloc_size += NETDEV_ALIGN - 1;
5998
5999         p = kzalloc(alloc_size, GFP_KERNEL);
6000         if (!p) {
6001                 printk(KERN_ERR "alloc_netdev: Unable to allocate device.\n");
6002                 return NULL;
6003         }
6004
6005         dev = PTR_ALIGN(p, NETDEV_ALIGN);
6006         dev->padded = (char *)dev - (char *)p;
6007
6008         dev->pcpu_refcnt = alloc_percpu(int);
6009         if (!dev->pcpu_refcnt)
6010                 goto free_p;
6011
6012         if (dev_addr_init(dev))
6013                 goto free_pcpu;
6014
6015         dev_mc_init(dev);
6016         dev_uc_init(dev);
6017
6018         dev_net_set(dev, &init_net);
6019
6020         dev->gso_max_size = GSO_MAX_SIZE;
6021
6022         INIT_LIST_HEAD(&dev->napi_list);
6023         INIT_LIST_HEAD(&dev->unreg_list);
6024         INIT_LIST_HEAD(&dev->link_watch_list);
6025         dev->priv_flags = IFF_XMIT_DST_RELEASE;
6026         setup(dev);
6027
6028         dev->num_tx_queues = txqs;
6029         dev->real_num_tx_queues = txqs;
6030         if (netif_alloc_netdev_queues(dev))
6031                 goto free_all;
6032
6033 #ifdef CONFIG_RPS
6034         dev->num_rx_queues = rxqs;
6035         dev->real_num_rx_queues = rxqs;
6036         if (netif_alloc_rx_queues(dev))
6037                 goto free_all;
6038 #endif
6039
6040         strcpy(dev->name, name);
6041         dev->group = INIT_NETDEV_GROUP;
6042         return dev;
6043
6044 free_all:
6045         free_netdev(dev);
6046         return NULL;
6047
6048 free_pcpu:
6049         free_percpu(dev->pcpu_refcnt);
6050         kfree(dev->_tx);
6051 #ifdef CONFIG_RPS
6052         kfree(dev->_rx);
6053 #endif
6054
6055 free_p:
6056         kfree(p);
6057         return NULL;
6058 }
6059 EXPORT_SYMBOL(alloc_netdev_mqs);
6060
6061 /**
6062  *      free_netdev - free network device
6063  *      @dev: device
6064  *
6065  *      This function does the last stage of destroying an allocated device
6066  *      interface. The reference to the device object is released.
6067  *      If this is the last reference then it will be freed.
6068  */
6069 void free_netdev(struct net_device *dev)
6070 {
6071         struct napi_struct *p, *n;
6072
6073         release_net(dev_net(dev));
6074
6075         kfree(dev->_tx);
6076 #ifdef CONFIG_RPS
6077         kfree(dev->_rx);
6078 #endif
6079
6080         kfree(rcu_dereference_protected(dev->ingress_queue, 1));
6081
6082         /* Flush device addresses */
6083         dev_addr_flush(dev);
6084
6085         list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
6086                 netif_napi_del(p);
6087
6088         free_percpu(dev->pcpu_refcnt);
6089         dev->pcpu_refcnt = NULL;
6090
6091         /*  Compatibility with error handling in drivers */
6092         if (dev->reg_state == NETREG_UNINITIALIZED) {
6093                 kfree((char *)dev - dev->padded);
6094                 return;
6095         }
6096
6097         BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
6098         dev->reg_state = NETREG_RELEASED;
6099
6100         /* will free via device release */
6101         put_device(&dev->dev);
6102 }
6103 EXPORT_SYMBOL(free_netdev);
6104
6105 /**
6106  *      synchronize_net -  Synchronize with packet receive processing
6107  *
6108  *      Wait for packets currently being received to be done.
6109  *      Does not block later packets from starting.
6110  */
6111 void synchronize_net(void)
6112 {
6113         might_sleep();
6114         if (rtnl_is_locked())
6115                 synchronize_rcu_expedited();
6116         else
6117                 synchronize_rcu();
6118 }
6119 EXPORT_SYMBOL(synchronize_net);
6120
6121 /**
6122  *      unregister_netdevice_queue - remove device from the kernel
6123  *      @dev: device
6124  *      @head: list
6125  *
6126  *      This function shuts down a device interface and removes it
6127  *      from the kernel tables.
6128  *      If head not NULL, device is queued to be unregistered later.
6129  *
6130  *      Callers must hold the rtnl semaphore.  You may want
6131  *      unregister_netdev() instead of this.
6132  */
6133
6134 void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
6135 {
6136         ASSERT_RTNL();
6137
6138         if (head) {
6139                 list_move_tail(&dev->unreg_list, head);
6140         } else {
6141                 rollback_registered(dev);
6142                 /* Finish processing unregister after unlock */
6143                 net_set_todo(dev);
6144         }
6145 }
6146 EXPORT_SYMBOL(unregister_netdevice_queue);
6147
6148 /**
6149  *      unregister_netdevice_many - unregister many devices
6150  *      @head: list of devices
6151  */
6152 void unregister_netdevice_many(struct list_head *head)
6153 {
6154         struct net_device *dev;
6155
6156         if (!list_empty(head)) {
6157                 rollback_registered_many(head);
6158                 list_for_each_entry(dev, head, unreg_list)
6159                         net_set_todo(dev);
6160         }
6161 }
6162 EXPORT_SYMBOL(unregister_netdevice_many);
6163
6164 /**
6165  *      unregister_netdev - remove device from the kernel
6166  *      @dev: device
6167  *
6168  *      This function shuts down a device interface and removes it
6169  *      from the kernel tables.
6170  *
6171  *      This is just a wrapper for unregister_netdevice that takes
6172  *      the rtnl semaphore.  In general you want to use this and not
6173  *      unregister_netdevice.
6174  */
6175 void unregister_netdev(struct net_device *dev)
6176 {
6177         rtnl_lock();
6178         unregister_netdevice(dev);
6179         rtnl_unlock();
6180 }
6181 EXPORT_SYMBOL(unregister_netdev);
6182
6183 /**
6184  *      dev_change_net_namespace - move device to different nethost namespace
6185  *      @dev: device
6186  *      @net: network namespace
6187  *      @pat: If not NULL name pattern to try if the current device name
6188  *            is already taken in the destination network namespace.
6189  *
6190  *      This function shuts down a device interface and moves it
6191  *      to a new network namespace. On success 0 is returned, on
6192  *      a failure a netagive errno code is returned.
6193  *
6194  *      Callers must hold the rtnl semaphore.
6195  */
6196
6197 int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
6198 {
6199         int err;
6200
6201         ASSERT_RTNL();
6202
6203         /* Don't allow namespace local devices to be moved. */
6204         err = -EINVAL;
6205         if (dev->features & NETIF_F_NETNS_LOCAL)
6206                 goto out;
6207
6208         /* Ensure the device has been registrered */
6209         err = -EINVAL;
6210         if (dev->reg_state != NETREG_REGISTERED)
6211                 goto out;
6212
6213         /* Get out if there is nothing todo */
6214         err = 0;
6215         if (net_eq(dev_net(dev), net))
6216                 goto out;
6217
6218         /* Pick the destination device name, and ensure
6219          * we can use it in the destination network namespace.
6220          */
6221         err = -EEXIST;
6222         if (__dev_get_by_name(net, dev->name)) {
6223                 /* We get here if we can't use the current device name */
6224                 if (!pat)
6225                         goto out;
6226                 if (dev_get_valid_name(dev, pat) < 0)
6227                         goto out;
6228         }
6229
6230         /*
6231          * And now a mini version of register_netdevice unregister_netdevice.
6232          */
6233
6234         /* If device is running close it first. */
6235         dev_close(dev);
6236
6237         /* And unlink it from device chain */
6238         err = -ENODEV;
6239         unlist_netdevice(dev);
6240
6241         synchronize_net();
6242
6243         /* Shutdown queueing discipline. */
6244         dev_shutdown(dev);
6245
6246         /* Notify protocols, that we are about to destroy
6247            this device. They should clean all the things.
6248
6249            Note that dev->reg_state stays at NETREG_REGISTERED.
6250            This is wanted because this way 8021q and macvlan know
6251            the device is just moving and can keep their slaves up.
6252         */
6253         call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
6254         call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev);
6255         rtmsg_ifinfo(RTM_DELLINK, dev, ~0U);
6256
6257         /*
6258          *      Flush the unicast and multicast chains
6259          */
6260         dev_uc_flush(dev);
6261         dev_mc_flush(dev);
6262
6263         /* Actually switch the network namespace */
6264         dev_net_set(dev, net);
6265
6266         /* If there is an ifindex conflict assign a new one */
6267         if (__dev_get_by_index(net, dev->ifindex)) {
6268                 int iflink = (dev->iflink == dev->ifindex);
6269                 dev->ifindex = dev_new_index(net);
6270                 if (iflink)
6271                         dev->iflink = dev->ifindex;
6272         }
6273
6274         /* Fixup kobjects */
6275         err = device_rename(&dev->dev, dev->name);
6276         WARN_ON(err);
6277
6278         /* Add the device back in the hashes */
6279         list_netdevice(dev);
6280
6281         /* Notify protocols, that a new device appeared. */
6282         call_netdevice_notifiers(NETDEV_REGISTER, dev);
6283
6284         /*
6285          *      Prevent userspace races by waiting until the network
6286          *      device is fully setup before sending notifications.
6287          */
6288         rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
6289
6290         synchronize_net();
6291         err = 0;
6292 out:
6293         return err;
6294 }
6295 EXPORT_SYMBOL_GPL(dev_change_net_namespace);
6296
6297 static int dev_cpu_callback(struct notifier_block *nfb,
6298                             unsigned long action,
6299                             void *ocpu)
6300 {
6301         struct sk_buff **list_skb;
6302         struct sk_buff *skb;
6303         unsigned int cpu, oldcpu = (unsigned long)ocpu;
6304         struct softnet_data *sd, *oldsd;
6305
6306         if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
6307                 return NOTIFY_OK;
6308
6309         local_irq_disable();
6310         cpu = smp_processor_id();
6311         sd = &per_cpu(softnet_data, cpu);
6312         oldsd = &per_cpu(softnet_data, oldcpu);
6313
6314         /* Find end of our completion_queue. */
6315         list_skb = &sd->completion_queue;
6316         while (*list_skb)
6317                 list_skb = &(*list_skb)->next;
6318         /* Append completion queue from offline CPU. */
6319         *list_skb = oldsd->completion_queue;
6320         oldsd->completion_queue = NULL;
6321
6322         /* Append output queue from offline CPU. */
6323         if (oldsd->output_queue) {
6324                 *sd->output_queue_tailp = oldsd->output_queue;
6325                 sd->output_queue_tailp = oldsd->output_queue_tailp;
6326                 oldsd->output_queue = NULL;
6327                 oldsd->output_queue_tailp = &oldsd->output_queue;
6328         }
6329         /* Append NAPI poll list from offline CPU. */
6330         if (!list_empty(&oldsd->poll_list)) {
6331                 list_splice_init(&oldsd->poll_list, &sd->poll_list);
6332                 raise_softirq_irqoff(NET_RX_SOFTIRQ);
6333         }
6334
6335         raise_softirq_irqoff(NET_TX_SOFTIRQ);
6336         local_irq_enable();
6337
6338         /* Process offline CPU's input_pkt_queue */
6339         while ((skb = __skb_dequeue(&oldsd->process_queue))) {
6340                 netif_rx(skb);
6341                 input_queue_head_incr(oldsd);
6342         }
6343         while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) {
6344                 netif_rx(skb);
6345                 input_queue_head_incr(oldsd);
6346         }
6347
6348         return NOTIFY_OK;
6349 }
6350
6351
6352 /**
6353  *      netdev_increment_features - increment feature set by one
6354  *      @all: current feature set
6355  *      @one: new feature set
6356  *      @mask: mask feature set
6357  *
6358  *      Computes a new feature set after adding a device with feature set
6359  *      @one to the master device with current feature set @all.  Will not
6360  *      enable anything that is off in @mask. Returns the new feature set.
6361  */
6362 netdev_features_t netdev_increment_features(netdev_features_t all,
6363         netdev_features_t one, netdev_features_t mask)
6364 {
6365         if (mask & NETIF_F_GEN_CSUM)
6366                 mask |= NETIF_F_ALL_CSUM;
6367         mask |= NETIF_F_VLAN_CHALLENGED;
6368
6369         all |= one & (NETIF_F_ONE_FOR_ALL|NETIF_F_ALL_CSUM) & mask;
6370         all &= one | ~NETIF_F_ALL_FOR_ALL;
6371
6372         /* If one device supports hw checksumming, set for all. */
6373         if (all & NETIF_F_GEN_CSUM)
6374                 all &= ~(NETIF_F_ALL_CSUM & ~NETIF_F_GEN_CSUM);
6375
6376         return all;
6377 }
6378 EXPORT_SYMBOL(netdev_increment_features);
6379
6380 static struct hlist_head *netdev_create_hash(void)
6381 {
6382         int i;
6383         struct hlist_head *hash;
6384
6385         hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
6386         if (hash != NULL)
6387                 for (i = 0; i < NETDEV_HASHENTRIES; i++)
6388                         INIT_HLIST_HEAD(&hash[i]);
6389
6390         return hash;
6391 }
6392
6393 /* Initialize per network namespace state */
6394 static int __net_init netdev_init(struct net *net)
6395 {
6396         INIT_LIST_HEAD(&net->dev_base_head);
6397
6398         net->dev_name_head = netdev_create_hash();
6399         if (net->dev_name_head == NULL)
6400                 goto err_name;
6401
6402         net->dev_index_head = netdev_create_hash();
6403         if (net->dev_index_head == NULL)
6404                 goto err_idx;
6405
6406         return 0;
6407
6408 err_idx:
6409         kfree(net->dev_name_head);
6410 err_name:
6411         return -ENOMEM;
6412 }
6413
6414 /**
6415  *      netdev_drivername - network driver for the device
6416  *      @dev: network device
6417  *
6418  *      Determine network driver for device.
6419  */
6420 const char *netdev_drivername(const struct net_device *dev)
6421 {
6422         const struct device_driver *driver;
6423         const struct device *parent;
6424         const char *empty = "";
6425
6426         parent = dev->dev.parent;
6427         if (!parent)
6428                 return empty;
6429
6430         driver = parent->driver;
6431         if (driver && driver->name)
6432                 return driver->name;
6433         return empty;
6434 }
6435
6436 int __netdev_printk(const char *level, const struct net_device *dev,
6437                            struct va_format *vaf)
6438 {
6439         int r;
6440
6441         if (dev && dev->dev.parent)
6442                 r = dev_printk(level, dev->dev.parent, "%s: %pV",
6443                                netdev_name(dev), vaf);
6444         else if (dev)
6445                 r = printk("%s%s: %pV", level, netdev_name(dev), vaf);
6446         else
6447                 r = printk("%s(NULL net_device): %pV", level, vaf);
6448
6449         return r;
6450 }
6451 EXPORT_SYMBOL(__netdev_printk);
6452
6453 int netdev_printk(const char *level, const struct net_device *dev,
6454                   const char *format, ...)
6455 {
6456         struct va_format vaf;
6457         va_list args;
6458         int r;
6459
6460         va_start(args, format);
6461
6462         vaf.fmt = format;
6463         vaf.va = &args;
6464
6465         r = __netdev_printk(level, dev, &vaf);
6466         va_end(args);
6467
6468         return r;
6469 }
6470 EXPORT_SYMBOL(netdev_printk);
6471
6472 #define define_netdev_printk_level(func, level)                 \
6473 int func(const struct net_device *dev, const char *fmt, ...)    \
6474 {                                                               \
6475         int r;                                                  \
6476         struct va_format vaf;                                   \
6477         va_list args;                                           \
6478                                                                 \
6479         va_start(args, fmt);                                    \
6480                                                                 \
6481         vaf.fmt = fmt;                                          \
6482         vaf.va = &args;                                         \
6483                                                                 \
6484         r = __netdev_printk(level, dev, &vaf);                  \
6485         va_end(args);                                           \
6486                                                                 \
6487         return r;                                               \
6488 }                                                               \
6489 EXPORT_SYMBOL(func);
6490
6491 define_netdev_printk_level(netdev_emerg, KERN_EMERG);
6492 define_netdev_printk_level(netdev_alert, KERN_ALERT);
6493 define_netdev_printk_level(netdev_crit, KERN_CRIT);
6494 define_netdev_printk_level(netdev_err, KERN_ERR);
6495 define_netdev_printk_level(netdev_warn, KERN_WARNING);
6496 define_netdev_printk_level(netdev_notice, KERN_NOTICE);
6497 define_netdev_printk_level(netdev_info, KERN_INFO);
6498
6499 static void __net_exit netdev_exit(struct net *net)
6500 {
6501         kfree(net->dev_name_head);
6502         kfree(net->dev_index_head);
6503 }
6504
6505 static struct pernet_operations __net_initdata netdev_net_ops = {
6506         .init = netdev_init,
6507         .exit = netdev_exit,
6508 };
6509
6510 static void __net_exit default_device_exit(struct net *net)
6511 {
6512         struct net_device *dev, *aux;
6513         /*
6514          * Push all migratable network devices back to the
6515          * initial network namespace
6516          */
6517         rtnl_lock();
6518         for_each_netdev_safe(net, dev, aux) {
6519                 int err;
6520                 char fb_name[IFNAMSIZ];
6521
6522                 /* Ignore unmoveable devices (i.e. loopback) */
6523                 if (dev->features & NETIF_F_NETNS_LOCAL)
6524                         continue;
6525
6526                 /* Leave virtual devices for the generic cleanup */
6527                 if (dev->rtnl_link_ops)
6528                         continue;
6529
6530                 /* Push remaining network devices to init_net */
6531                 snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
6532                 err = dev_change_net_namespace(dev, &init_net, fb_name);
6533                 if (err) {
6534                         printk(KERN_EMERG "%s: failed to move %s to init_net: %d\n",
6535                                 __func__, dev->name, err);
6536                         BUG();
6537                 }
6538         }
6539         rtnl_unlock();
6540 }
6541
6542 static void __net_exit default_device_exit_batch(struct list_head *net_list)
6543 {
6544         /* At exit all network devices most be removed from a network
6545          * namespace.  Do this in the reverse order of registration.
6546          * Do this across as many network namespaces as possible to
6547          * improve batching efficiency.
6548          */
6549         struct net_device *dev;
6550         struct net *net;
6551         LIST_HEAD(dev_kill_list);
6552
6553         rtnl_lock();
6554         list_for_each_entry(net, net_list, exit_list) {
6555                 for_each_netdev_reverse(net, dev) {
6556                         if (dev->rtnl_link_ops)
6557                                 dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
6558                         else
6559                                 unregister_netdevice_queue(dev, &dev_kill_list);
6560                 }
6561         }
6562         unregister_netdevice_many(&dev_kill_list);
6563         list_del(&dev_kill_list);
6564         rtnl_unlock();
6565 }
6566
6567 static struct pernet_operations __net_initdata default_device_ops = {
6568         .exit = default_device_exit,
6569         .exit_batch = default_device_exit_batch,
6570 };
6571
6572 /*
6573  *      Initialize the DEV module. At boot time this walks the device list and
6574  *      unhooks any devices that fail to initialise (normally hardware not
6575  *      present) and leaves us with a valid list of present and active devices.
6576  *
6577  */
6578
6579 /*
6580  *       This is called single threaded during boot, so no need
6581  *       to take the rtnl semaphore.
6582  */
6583 static int __init net_dev_init(void)
6584 {
6585         int i, rc = -ENOMEM;
6586
6587         BUG_ON(!dev_boot_phase);
6588
6589         if (dev_proc_init())
6590                 goto out;
6591
6592         if (netdev_kobject_init())
6593                 goto out;
6594
6595         INIT_LIST_HEAD(&ptype_all);
6596         for (i = 0; i < PTYPE_HASH_SIZE; i++)
6597                 INIT_LIST_HEAD(&ptype_base[i]);
6598
6599         if (register_pernet_subsys(&netdev_net_ops))
6600                 goto out;
6601
6602         /*
6603          *      Initialise the packet receive queues.
6604          */
6605
6606         for_each_possible_cpu(i) {
6607                 struct softnet_data *sd = &per_cpu(softnet_data, i);
6608
6609                 memset(sd, 0, sizeof(*sd));
6610                 skb_queue_head_init(&sd->input_pkt_queue);
6611                 skb_queue_head_init(&sd->process_queue);
6612                 sd->completion_queue = NULL;
6613                 INIT_LIST_HEAD(&sd->poll_list);
6614                 sd->output_queue = NULL;
6615                 sd->output_queue_tailp = &sd->output_queue;
6616 #ifdef CONFIG_RPS
6617                 sd->csd.func = rps_trigger_softirq;
6618                 sd->csd.info = sd;
6619                 sd->csd.flags = 0;
6620                 sd->cpu = i;
6621 #endif
6622
6623                 sd->backlog.poll = process_backlog;
6624                 sd->backlog.weight = weight_p;
6625                 sd->backlog.gro_list = NULL;
6626                 sd->backlog.gro_count = 0;
6627         }
6628
6629         dev_boot_phase = 0;
6630
6631         /* The loopback device is special if any other network devices
6632          * is present in a network namespace the loopback device must
6633          * be present. Since we now dynamically allocate and free the
6634          * loopback device ensure this invariant is maintained by
6635          * keeping the loopback device as the first device on the
6636          * list of network devices.  Ensuring the loopback devices
6637          * is the first device that appears and the last network device
6638          * that disappears.
6639          */
6640         if (register_pernet_device(&loopback_net_ops))
6641                 goto out;
6642
6643         if (register_pernet_device(&default_device_ops))
6644                 goto out;
6645
6646         open_softirq(NET_TX_SOFTIRQ, net_tx_action);
6647         open_softirq(NET_RX_SOFTIRQ, net_rx_action);
6648
6649         hotcpu_notifier(dev_cpu_callback, 0);
6650         dst_init();
6651         dev_mcast_init();
6652         rc = 0;
6653 out:
6654         return rc;
6655 }
6656
6657 subsys_initcall(net_dev_init);
6658
6659 static int __init initialize_hashrnd(void)
6660 {
6661         get_random_bytes(&hashrnd, sizeof(hashrnd));
6662         return 0;
6663 }
6664
6665 late_initcall_sync(initialize_hashrnd);
6666