[NET_SCHED]: ingress: switch back to using ingress_lock
[linux-2.6.git] / net / sched / sch_generic.c
1 /*
2  * net/sched/sch_generic.c      Generic packet scheduler routines.
3  *
4  *              This program is free software; you can redistribute it and/or
5  *              modify it under the terms of the GNU General Public License
6  *              as published by the Free Software Foundation; either version
7  *              2 of the License, or (at your option) any later version.
8  *
9  * Authors:     Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
10  *              Jamal Hadi Salim, <hadi@cyberus.ca> 990601
11  *              - Ingress support
12  */
13
14 #include <asm/uaccess.h>
15 #include <asm/system.h>
16 #include <linux/bitops.h>
17 #include <linux/module.h>
18 #include <linux/types.h>
19 #include <linux/kernel.h>
20 #include <linux/sched.h>
21 #include <linux/string.h>
22 #include <linux/mm.h>
23 #include <linux/socket.h>
24 #include <linux/sockios.h>
25 #include <linux/in.h>
26 #include <linux/errno.h>
27 #include <linux/interrupt.h>
28 #include <linux/netdevice.h>
29 #include <linux/skbuff.h>
30 #include <linux/rtnetlink.h>
31 #include <linux/init.h>
32 #include <linux/rcupdate.h>
33 #include <linux/list.h>
34 #include <net/sock.h>
35 #include <net/pkt_sched.h>
36
37 /* Main transmission queue. */
38
39 /* Modifications to data participating in scheduling must be protected with
40  * dev->queue_lock spinlock.
41  *
42  * The idea is the following:
43  * - enqueue, dequeue are serialized via top level device
44  *   spinlock dev->queue_lock.
45  * - ingress filtering is serialized via top level device
46  *   spinlock dev->ingress_lock.
47  * - updates to tree and tree walking are only done under the rtnl mutex.
48  */
49
50 void qdisc_lock_tree(struct net_device *dev)
51 {
52         spin_lock_bh(&dev->queue_lock);
53         spin_lock(&dev->ingress_lock);
54 }
55
56 void qdisc_unlock_tree(struct net_device *dev)
57 {
58         spin_unlock(&dev->ingress_lock);
59         spin_unlock_bh(&dev->queue_lock);
60 }
61
62 /*
63    dev->queue_lock serializes queue accesses for this device
64    AND the dev->qdisc pointer itself.
65
66    netif_tx_lock serializes accesses to the device driver.
67
68    dev->queue_lock and netif_tx_lock are mutually exclusive:
69    if one is grabbed, the other must be free.
70  */
71
72
73 /* Kick device: try to hand one packet to the driver.
74    Takes dev->gso_skb if one is pending, otherwise dequeues from dev->qdisc.
75    dev->queue_lock is dropped around dev_hard_start_xmit() and then retaken.
76
77    Returns:  0  - nothing was dequeued and q->q.qlen is zero.
78             >0  - nothing was dequeued but q->q.qlen is non-zero, or the
79                   packet was put back and netif_schedule() was called.
80             <0  - the driver took the packet, or it was dropped (dead loop).
81    NOTE: Called under dev->queue_lock with locally disabled BH.
82 */
83
84 static inline int qdisc_restart(struct net_device *dev)
85 {
86         struct Qdisc *q = dev->qdisc;
87         struct sk_buff *skb;
88
89         /* Take the skb parked in dev->gso_skb first, else dequeue one */
90         if (((skb = dev->gso_skb)) || ((skb = q->dequeue(q)))) {
91                 unsigned nolock = (dev->features & NETIF_F_LLTX);
92
93                 dev->gso_skb = NULL;
94
95                 /*
96                  * When the driver has LLTX set it does its own locking
97                  * in start_xmit. No need to add additional overhead by
98                  * locking again. These checks are worth it because
99                  * even uncongested locks can be quite expensive.
100                  * The driver can do trylock like here too, in case
101                  * of lock congestion it should return -1 and the packet
102                  * will be requeued.
103                  */
104                 if (!nolock) {
105                         if (!netif_tx_trylock(dev)) {
106                         collision:
107                                 /* So, someone grabbed the driver. */
108
109                                 /* It may be transient configuration error,
110                                    when hard_start_xmit() recurses. We detect
111                                    it by checking xmit owner and drop the
112                                    packet when deadloop is detected.
113                                 */
114                                 if (dev->xmit_lock_owner == smp_processor_id()) {
115                                         kfree_skb(skb);
116                                         if (net_ratelimit())
117                                                 printk(KERN_DEBUG "Dead loop on netdevice %s, fix it urgently!\n", dev->name);
118                                         return -1;
119                                 }
120                                 __get_cpu_var(netdev_rx_stat).cpu_collision++;
121                                 goto requeue;
122                         }
123                 }
124
125                 {
126                         /* And release queue */
127                         spin_unlock(&dev->queue_lock);
128
129                         if (!netif_queue_stopped(dev)) {
130                                 int ret;
131
132                                 ret = dev_hard_start_xmit(skb, dev);
133                                 if (ret == NETDEV_TX_OK) {
134                                         if (!nolock) {
135                                                 netif_tx_unlock(dev);
136                                         }
137                                         spin_lock(&dev->queue_lock);
138                                         return -1; /* sent; negative lets __qdisc_run() loop again */
139                                 }
140                                 if (ret == NETDEV_TX_LOCKED && nolock) {
141                                         spin_lock(&dev->queue_lock);
142                                         goto collision;
143                                 }
144                         }
145
146                         /* NETDEV_TX_BUSY - we need to requeue */
147                         /* Release the driver */
148                         if (!nolock) {
149                                 netif_tx_unlock(dev);
150                         }
151                         spin_lock(&dev->queue_lock);
152                         q = dev->qdisc; /* re-read: queue_lock was released above */
153                 }
154
155                 /* Device kicked us out :(
156                    This is possible in four cases:
157
158                    0. driver is locked
159                    1. fastroute is enabled
160                    2. device cannot determine busy state
161                       before start of transmission (f.e. dialout)
162                    3. device is buggy (ppp)
163                  */
164
165 requeue:
166                 if (skb->next) /* skb has a successor: park it in dev->gso_skb */
167                         dev->gso_skb = skb;
168                 else
169                         q->ops->requeue(skb, q);
170                 netif_schedule(dev);
171                 return 1;
172         }
173         BUG_ON((int) q->q.qlen < 0);
174         return q->q.qlen;
175 }
176
177 void __qdisc_run(struct net_device *dev)
178 {
179         if (unlikely(dev->qdisc == &noop_qdisc))
180                 goto out;
181
182         while (qdisc_restart(dev) < 0 && !netif_queue_stopped(dev))
183                 /* NOTHING */;
184
185 out:
186         clear_bit(__LINK_STATE_QDISC_RUNNING, &dev->state);
187 }
188
189 static void dev_watchdog(unsigned long arg)
190 {
191         struct net_device *dev = (struct net_device *)arg;
192
193         netif_tx_lock(dev);
194         if (dev->qdisc != &noop_qdisc) {
195                 if (netif_device_present(dev) &&
196                     netif_running(dev) &&
197                     netif_carrier_ok(dev)) {
198                         if (netif_queue_stopped(dev) &&
199                             time_after(jiffies, dev->trans_start + dev->watchdog_timeo)) {
200
201                                 printk(KERN_INFO "NETDEV WATCHDOG: %s: transmit timed out\n",
202                                        dev->name);
203                                 dev->tx_timeout(dev);
204                         }
205                         if (!mod_timer(&dev->watchdog_timer, round_jiffies(jiffies + dev->watchdog_timeo)))
206                                 dev_hold(dev);
207                 }
208         }
209         netif_tx_unlock(dev);
210
211         dev_put(dev);
212 }
213
214 static void dev_watchdog_init(struct net_device *dev)
215 {
216         init_timer(&dev->watchdog_timer);
217         dev->watchdog_timer.data = (unsigned long)dev;
218         dev->watchdog_timer.function = dev_watchdog;
219 }
220
221 void __netdev_watchdog_up(struct net_device *dev)
222 {
223         if (dev->tx_timeout) {
224                 if (dev->watchdog_timeo <= 0)
225                         dev->watchdog_timeo = 5*HZ;
226                 if (!mod_timer(&dev->watchdog_timer, jiffies + dev->watchdog_timeo))
227                         dev_hold(dev);
228         }
229 }
230
/* File-local entry point: start the watchdog via __netdev_watchdog_up(). */
static void dev_watchdog_up(struct net_device *dev)
{
        __netdev_watchdog_up(dev);
}
235
236 static void dev_watchdog_down(struct net_device *dev)
237 {
238         netif_tx_lock_bh(dev);
239         if (del_timer(&dev->watchdog_timer))
240                 dev_put(dev);
241         netif_tx_unlock_bh(dev);
242 }
243
244 void netif_carrier_on(struct net_device *dev)
245 {
246         if (test_and_clear_bit(__LINK_STATE_NOCARRIER, &dev->state))
247                 linkwatch_fire_event(dev);
248         if (netif_running(dev))
249                 __netdev_watchdog_up(dev);
250 }
251
252 void netif_carrier_off(struct net_device *dev)
253 {
254         if (!test_and_set_bit(__LINK_STATE_NOCARRIER, &dev->state))
255                 linkwatch_fire_event(dev);
256 }
257
258 /* "NOOP" scheduler: the best scheduler, recommended for all interfaces
259    under all circumstances. It is difficult to invent anything faster or
260    cheaper.
261  */
262
263 static int noop_enqueue(struct sk_buff *skb, struct Qdisc * qdisc)
264 {
265         kfree_skb(skb);
266         return NET_XMIT_CN;
267 }
268
269 static struct sk_buff *noop_dequeue(struct Qdisc * qdisc)
270 {
271         return NULL;
272 }
273
274 static int noop_requeue(struct sk_buff *skb, struct Qdisc* qdisc)
275 {
276         if (net_ratelimit())
277                 printk(KERN_DEBUG "%s deferred output. It is buggy.\n",
278                        skb->dev->name);
279         kfree_skb(skb);
280         return NET_XMIT_CN;
281 }
282
283 struct Qdisc_ops noop_qdisc_ops = {
284         .id             =       "noop",
285         .priv_size      =       0,
286         .enqueue        =       noop_enqueue,
287         .dequeue        =       noop_dequeue,
288         .requeue        =       noop_requeue,
289         .owner          =       THIS_MODULE,
290 };
291
292 struct Qdisc noop_qdisc = {
293         .enqueue        =       noop_enqueue,
294         .dequeue        =       noop_dequeue,
295         .flags          =       TCQ_F_BUILTIN,
296         .ops            =       &noop_qdisc_ops,
297         .list           =       LIST_HEAD_INIT(noop_qdisc.list),
298 };
299
300 static struct Qdisc_ops noqueue_qdisc_ops = {
301         .id             =       "noqueue",
302         .priv_size      =       0,
303         .enqueue        =       noop_enqueue,
304         .dequeue        =       noop_dequeue,
305         .requeue        =       noop_requeue,
306         .owner          =       THIS_MODULE,
307 };
308
309 static struct Qdisc noqueue_qdisc = {
310         .enqueue        =       NULL,
311         .dequeue        =       noop_dequeue,
312         .flags          =       TCQ_F_BUILTIN,
313         .ops            =       &noqueue_qdisc_ops,
314         .list           =       LIST_HEAD_INIT(noqueue_qdisc.list),
315 };
316
317
318 static const u8 prio2band[TC_PRIO_MAX+1] =
319         { 1, 2, 2, 2, 1, 2, 0, 0 , 1, 1, 1, 1, 1, 1, 1, 1 };
320
321 /* 3-band FIFO queue: old style, but should be a bit faster than
322    generic prio+fifo combination.
323  */
324
325 #define PFIFO_FAST_BANDS 3
326
327 static inline struct sk_buff_head *prio2list(struct sk_buff *skb,
328                                              struct Qdisc *qdisc)
329 {
330         struct sk_buff_head *list = qdisc_priv(qdisc);
331         return list + prio2band[skb->priority & TC_PRIO_MAX];
332 }
333
334 static int pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc* qdisc)
335 {
336         struct sk_buff_head *list = prio2list(skb, qdisc);
337
338         if (skb_queue_len(list) < qdisc->dev->tx_queue_len) {
339                 qdisc->q.qlen++;
340                 return __qdisc_enqueue_tail(skb, qdisc, list);
341         }
342
343         return qdisc_drop(skb, qdisc);
344 }
345
346 static struct sk_buff *pfifo_fast_dequeue(struct Qdisc* qdisc)
347 {
348         int prio;
349         struct sk_buff_head *list = qdisc_priv(qdisc);
350
351         for (prio = 0; prio < PFIFO_FAST_BANDS; prio++) {
352                 if (!skb_queue_empty(list + prio)) {
353                         qdisc->q.qlen--;
354                         return __qdisc_dequeue_head(qdisc, list + prio);
355                 }
356         }
357
358         return NULL;
359 }
360
361 static int pfifo_fast_requeue(struct sk_buff *skb, struct Qdisc* qdisc)
362 {
363         qdisc->q.qlen++;
364         return __qdisc_requeue(skb, qdisc, prio2list(skb, qdisc));
365 }
366
367 static void pfifo_fast_reset(struct Qdisc* qdisc)
368 {
369         int prio;
370         struct sk_buff_head *list = qdisc_priv(qdisc);
371
372         for (prio = 0; prio < PFIFO_FAST_BANDS; prio++)
373                 __qdisc_reset_queue(qdisc, list + prio);
374
375         qdisc->qstats.backlog = 0;
376         qdisc->q.qlen = 0;
377 }
378
379 static int pfifo_fast_dump(struct Qdisc *qdisc, struct sk_buff *skb)
380 {
381         struct tc_prio_qopt opt = { .bands = PFIFO_FAST_BANDS };
382
383         memcpy(&opt.priomap, prio2band, TC_PRIO_MAX+1);
384         RTA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt);
385         return skb->len;
386
387 rtattr_failure:
388         return -1;
389 }
390
391 static int pfifo_fast_init(struct Qdisc *qdisc, struct rtattr *opt)
392 {
393         int prio;
394         struct sk_buff_head *list = qdisc_priv(qdisc);
395
396         for (prio = 0; prio < PFIFO_FAST_BANDS; prio++)
397                 skb_queue_head_init(list + prio);
398
399         return 0;
400 }
401
402 static struct Qdisc_ops pfifo_fast_ops = {
403         .id             =       "pfifo_fast",
404         .priv_size      =       PFIFO_FAST_BANDS * sizeof(struct sk_buff_head),
405         .enqueue        =       pfifo_fast_enqueue,
406         .dequeue        =       pfifo_fast_dequeue,
407         .requeue        =       pfifo_fast_requeue,
408         .init           =       pfifo_fast_init,
409         .reset          =       pfifo_fast_reset,
410         .dump           =       pfifo_fast_dump,
411         .owner          =       THIS_MODULE,
412 };
413
414 struct Qdisc *qdisc_alloc(struct net_device *dev, struct Qdisc_ops *ops)
415 {
416         void *p;
417         struct Qdisc *sch;
418         unsigned int size;
419         int err = -ENOBUFS;
420
421         /* ensure that the Qdisc and the private data are 32-byte aligned */
422         size = QDISC_ALIGN(sizeof(*sch));
423         size += ops->priv_size + (QDISC_ALIGNTO - 1);
424
425         p = kzalloc(size, GFP_KERNEL);
426         if (!p)
427                 goto errout;
428         sch = (struct Qdisc *) QDISC_ALIGN((unsigned long) p);
429         sch->padded = (char *) sch - (char *) p;
430
431         INIT_LIST_HEAD(&sch->list);
432         skb_queue_head_init(&sch->q);
433         sch->ops = ops;
434         sch->enqueue = ops->enqueue;
435         sch->dequeue = ops->dequeue;
436         sch->dev = dev;
437         dev_hold(dev);
438         atomic_set(&sch->refcnt, 1);
439
440         return sch;
441 errout:
442         return ERR_PTR(-err);
443 }
444
445 struct Qdisc * qdisc_create_dflt(struct net_device *dev, struct Qdisc_ops *ops,
446                                  unsigned int parentid)
447 {
448         struct Qdisc *sch;
449
450         sch = qdisc_alloc(dev, ops);
451         if (IS_ERR(sch))
452                 goto errout;
453         sch->stats_lock = &dev->queue_lock;
454         sch->parent = parentid;
455
456         if (!ops->init || ops->init(sch, NULL) == 0)
457                 return sch;
458
459         qdisc_destroy(sch);
460 errout:
461         return NULL;
462 }
463
464 /* Under dev->queue_lock and BH! */
465
466 void qdisc_reset(struct Qdisc *qdisc)
467 {
468         struct Qdisc_ops *ops = qdisc->ops;
469
470         if (ops->reset)
471                 ops->reset(qdisc);
472 }
473
474 /* this is the rcu callback function to clean up a qdisc when there
475  * are no further references to it */
476
477 static void __qdisc_destroy(struct rcu_head *head)
478 {
479         struct Qdisc *qdisc = container_of(head, struct Qdisc, q_rcu);
480         kfree((char *) qdisc - qdisc->padded);
481 }
482
483 /* Under dev->queue_lock and BH! */
484
485 void qdisc_destroy(struct Qdisc *qdisc)
486 {
487         struct Qdisc_ops  *ops = qdisc->ops;
488
489         if (qdisc->flags & TCQ_F_BUILTIN ||
490             !atomic_dec_and_test(&qdisc->refcnt))
491                 return;
492
493         list_del(&qdisc->list);
494 #ifdef CONFIG_NET_ESTIMATOR
495         gen_kill_estimator(&qdisc->bstats, &qdisc->rate_est);
496 #endif
497         if (ops->reset)
498                 ops->reset(qdisc);
499         if (ops->destroy)
500                 ops->destroy(qdisc);
501
502         module_put(ops->owner);
503         dev_put(qdisc->dev);
504         call_rcu(&qdisc->q_rcu, __qdisc_destroy);
505 }
506
507 void dev_activate(struct net_device *dev)
508 {
509         /* No queueing discipline is attached to device;
510            create default one i.e. pfifo_fast for devices,
511            which need queueing and noqueue_qdisc for
512            virtual interfaces
513          */
514
515         if (dev->qdisc_sleeping == &noop_qdisc) {
516                 struct Qdisc *qdisc;
517                 if (dev->tx_queue_len) {
518                         qdisc = qdisc_create_dflt(dev, &pfifo_fast_ops,
519                                                   TC_H_ROOT);
520                         if (qdisc == NULL) {
521                                 printk(KERN_INFO "%s: activation failed\n", dev->name);
522                                 return;
523                         }
524                         list_add_tail(&qdisc->list, &dev->qdisc_list);
525                 } else {
526                         qdisc =  &noqueue_qdisc;
527                 }
528                 dev->qdisc_sleeping = qdisc;
529         }
530
531         if (!netif_carrier_ok(dev))
532                 /* Delay activation until next carrier-on event */
533                 return;
534
535         spin_lock_bh(&dev->queue_lock);
536         rcu_assign_pointer(dev->qdisc, dev->qdisc_sleeping);
537         if (dev->qdisc != &noqueue_qdisc) {
538                 dev->trans_start = jiffies;
539                 dev_watchdog_up(dev);
540         }
541         spin_unlock_bh(&dev->queue_lock);
542 }
543
544 void dev_deactivate(struct net_device *dev)
545 {
546         struct Qdisc *qdisc;
547
548         spin_lock_bh(&dev->queue_lock);
549         qdisc = dev->qdisc;
550         dev->qdisc = &noop_qdisc;
551
552         qdisc_reset(qdisc);
553
554         spin_unlock_bh(&dev->queue_lock);
555
556         dev_watchdog_down(dev);
557
558         /* Wait for outstanding dev_queue_xmit calls. */
559         synchronize_rcu();
560
561         /* Wait for outstanding qdisc_run calls. */
562         while (test_bit(__LINK_STATE_QDISC_RUNNING, &dev->state))
563                 yield();
564
565         if (dev->gso_skb) {
566                 kfree_skb(dev->gso_skb);
567                 dev->gso_skb = NULL;
568         }
569 }
570
571 void dev_init_scheduler(struct net_device *dev)
572 {
573         qdisc_lock_tree(dev);
574         dev->qdisc = &noop_qdisc;
575         dev->qdisc_sleeping = &noop_qdisc;
576         INIT_LIST_HEAD(&dev->qdisc_list);
577         qdisc_unlock_tree(dev);
578
579         dev_watchdog_init(dev);
580 }
581
582 void dev_shutdown(struct net_device *dev)
583 {
584         struct Qdisc *qdisc;
585
586         qdisc_lock_tree(dev);
587         qdisc = dev->qdisc_sleeping;
588         dev->qdisc = &noop_qdisc;
589         dev->qdisc_sleeping = &noop_qdisc;
590         qdisc_destroy(qdisc);
591 #if defined(CONFIG_NET_SCH_INGRESS) || defined(CONFIG_NET_SCH_INGRESS_MODULE)
592         if ((qdisc = dev->qdisc_ingress) != NULL) {
593                 dev->qdisc_ingress = NULL;
594                 qdisc_destroy(qdisc);
595         }
596 #endif
597         BUG_TRAP(!timer_pending(&dev->watchdog_timer));
598         qdisc_unlock_tree(dev);
599 }
600
/* Symbols from this file made available through EXPORT_SYMBOL(). */

/* carrier state */
EXPORT_SYMBOL(netif_carrier_off);
EXPORT_SYMBOL(netif_carrier_on);

/* qdisc objects and lifecycle */
EXPORT_SYMBOL(noop_qdisc);
EXPORT_SYMBOL(qdisc_create_dflt);
EXPORT_SYMBOL(qdisc_reset);
EXPORT_SYMBOL(qdisc_destroy);

/* qdisc tree locking */
EXPORT_SYMBOL(qdisc_lock_tree);
EXPORT_SYMBOL(qdisc_unlock_tree);