[PKT_SCHED]: GRED: Use central VQ change procedure
/*
 * net/sched/sch_gred.c	Generic Random Early Detection queue.
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 * Authors:	J Hadi Salim (hadi@cyberus.ca) 1998-2002
 *
 *		991129: - Bug fix with grio mode
 *			- a better single AvgQ mode with Grio (WRED)
 *			- A finer grained VQ dequeue based on a suggestion
 *			  from Ren Liu
 *			- More error checks
 *
 * For all the glorious comments look at Alexey's sch_red.c
 */

#include <linux/config.h>
#include <linux/module.h>
#include <asm/uaccess.h>
#include <asm/system.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/in.h>
#include <linux/errno.h>
#include <linux/interrupt.h>
#include <linux/if_ether.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/notifier.h>
#include <net/ip.h>
#include <net/route.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <net/pkt_sched.h>

#if 1 /* control */
#define DPRINTK(format,args...) printk(KERN_DEBUG format,##args)
#else
#define DPRINTK(format,args...)
#endif

#if 0 /* data */
#define D2PRINTK(format,args...) printk(KERN_DEBUG format,##args)
#else
#define D2PRINTK(format,args...)
#endif

#define GRED_DEF_PRIO (MAX_DPs / 2)

struct gred_sched_data;
struct gred_sched;

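/*
 * Per-VQ (virtual queue) state. Each VQ runs its own RED instance on
 * the packets mapped to it via the low nibble of skb->tc_index.
 */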
struct gred_sched_data
{
/* Parameters */
	u32		limit;		/* HARD maximal queue length	*/
	u32		qth_min;	/* Min average length threshold (scaled) */
	u32		qth_max;	/* Max average length threshold (scaled) */
	u32		DP;		/* the drop parameters */
	char		Wlog;		/* log(W)		*/
	char		Plog;		/* random number bits	*/
	u32		Scell_max;
	u32		Rmask;
	u32		bytesin;	/* bytes seen on virtualQ so far */
	u32		packetsin;	/* packets seen on virtualQ so far */
	u32		backlog;	/* bytes on the virtualQ */
	u32		forced;		/* packets dropped for exceeding limits */
	u32		early;		/* packets dropped as a warning */
	u32		other;		/* packets dropped by invoking drop() */
	u32		pdrop;		/* packets dropped because we exceeded physical queue limits */
	char		Scell_log;
	u8		Stab[256];
	u8		prio;		/* the prio of this vq */

/* Variables */
	unsigned long	qave;		/* Average queue length (scaled) */
	int		qcount;		/* Packets since last random number generation */
	u32		qR;		/* Cached random number */

	psched_time_t	qidlestart;	/* Start of idle period */
};

enum {
	GRED_WRED_MODE = 1,
	GRED_RIO_MODE,
};

struct gred_sched
{
	struct gred_sched_data *tab[MAX_DPs];
	unsigned long	flags;
	u32		DPs;	/* number of configured DPs */
	u32		def;	/* default DP */
	u8		initd;	/* set once a VQ has been parameterized */
};

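/*
 * Operating modes, kept as bits in gred_sched.flags:
 *
 * RIO mode:  each VQ has its own prio; when judging a packet, the
 *	      averages of all VQs with a numerically lower prio are
 *	      added to this VQ's average (see gred_enqueue()).
 *
 * WRED mode: all VQs share a single average computed over the backlog
 *	      of the whole device queue and kept in the default VQ.
 */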
static inline int gred_wred_mode(struct gred_sched *table)
{
	return test_bit(GRED_WRED_MODE, &table->flags);
}

static inline void gred_enable_wred_mode(struct gred_sched *table)
{
	__set_bit(GRED_WRED_MODE, &table->flags);
}

static inline void gred_disable_wred_mode(struct gred_sched *table)
{
	__clear_bit(GRED_WRED_MODE, &table->flags);
}

static inline int gred_rio_mode(struct gred_sched *table)
{
	return test_bit(GRED_RIO_MODE, &table->flags);
}

static inline void gred_enable_rio_mode(struct gred_sched *table)
{
	__set_bit(GRED_RIO_MODE, &table->flags);
}

static inline void gred_disable_rio_mode(struct gred_sched *table)
{
	__clear_bit(GRED_RIO_MODE, &table->flags);
}

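/*
 * Returns 1 if at least two VQs share the same prio, in which case the
 * callers switch the scheduler into WRED mode (see gred_change() and
 * gred_change_table_def()).
 */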
static inline int gred_wred_mode_check(struct Qdisc *sch)
{
	struct gred_sched *table = qdisc_priv(sch);
	int i;

	/* Really ugly O(n^2) but shouldn't be needed too frequently. */
	for (i = 0; i < table->DPs; i++) {
		struct gred_sched_data *q = table->tab[i];
		int n;

		if (q == NULL)
			continue;

		for (n = 0; n < table->DPs; n++)
			if (table->tab[n] && table->tab[n] != q &&
			    table->tab[n]->prio == q->prio)
				return 1;
	}

	return 0;
}

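/*
 * Enqueue: the packet is mapped to a VQ via the low nibble of
 * skb->tc_index, and the scaled average queue length is updated as
 *
 *	qave += backlog - (qave >> Wlog)
 *
 * i.e. avg += W * (backlog - avg) with W = 2^-Wlog. The usual RED
 * decision is then applied: always enqueue below qth_min, always drop
 * above qth_max, and in between drop with a probability growing with
 * the average.
 */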
static int gred_enqueue(struct sk_buff *skb, struct Qdisc *sch)
{
	psched_time_t now;
	struct gred_sched_data *q = NULL;
	struct gred_sched *t = qdisc_priv(sch);
	unsigned long qave = 0;
	int i = 0;

	if (!t->initd && skb_queue_len(&sch->q) < (sch->dev->tx_queue_len ? : 1)) {
		D2PRINTK("No GRED queues set up yet! Enqueued anyway\n");
		goto do_enqueue;
	}

	if (((skb->tc_index & 0xf) > (t->DPs - 1)) ||
	    !(q = t->tab[skb->tc_index & 0xf])) {
		printk("GRED: setting to default (%d)\n", t->def);
		if (!(q = t->tab[t->def])) {
			DPRINTK("GRED: setting to default FAILED! dropping! "
				"(%d)\n", t->def);
			goto drop;
		}
		/* fix tc_index? --could be controversial but needed for
		   requeueing */
		skb->tc_index = (skb->tc_index & 0xfffffff0) | t->def;
	}

	D2PRINTK("gred_enqueue virtualQ 0x%x classid %x backlog %d "
		 "general backlog %d\n", skb->tc_index & 0xf, sch->handle,
		 q->backlog, sch->qstats.backlog);

	/* sum up all the qaves of prios below ours to get the new qave */
	if (!gred_wred_mode(t) && gred_rio_mode(t)) {
		for (i = 0; i < t->DPs; i++) {
			if ((!t->tab[i]) || (i == q->DP))
				continue;

			if ((t->tab[i]->prio < q->prio) &&
			    (PSCHED_IS_PASTPERFECT(t->tab[i]->qidlestart)))
				qave += t->tab[i]->qave;
		}
	}

	q->packetsin++;
	q->bytesin += skb->len;

	if (gred_wred_mode(t)) {
		qave = 0;
		q->qave = t->tab[t->def]->qave;
		q->qidlestart = t->tab[t->def]->qidlestart;
	}

	if (!PSCHED_IS_PASTPERFECT(q->qidlestart)) {
		long us_idle;

		PSCHED_GET_TIME(now);
		us_idle = PSCHED_TDIFF_SAFE(now, q->qidlestart, q->Scell_max);
		PSCHED_SET_PASTPERFECT(q->qidlestart);

		/* The queue was idle: decay the average by a factor
		   looked up in Stab based on the idle time. */
		q->qave >>= q->Stab[(us_idle >> q->Scell_log) & 0xFF];
	} else {
		if (gred_wred_mode(t))
			q->qave += sch->qstats.backlog - (q->qave >> q->Wlog);
		else
			q->qave += q->backlog - (q->qave >> q->Wlog);
	}

	if (gred_wred_mode(t))
		t->tab[t->def]->qave = q->qave;

	if ((q->qave + qave) < q->qth_min) {
		q->qcount = -1;
enqueue:
		if (q->backlog + skb->len <= q->limit) {
			q->backlog += skb->len;
do_enqueue:
			__skb_queue_tail(&sch->q, skb);
			sch->qstats.backlog += skb->len;
			sch->bstats.bytes += skb->len;
			sch->bstats.packets++;
			return 0;
		} else {
			q->pdrop++;
		}

drop:
		kfree_skb(skb);
		sch->qstats.drops++;
		return NET_XMIT_DROP;
	}
	if ((q->qave + qave) >= q->qth_max) {
		q->qcount = -1;
		sch->qstats.overlimits++;
		q->forced++;
		goto drop;
	}
	if (++q->qcount) {
		if ((((qave + q->qave) - q->qth_min) >> q->Wlog) * q->qcount < q->qR)
			goto enqueue;
		q->qcount = 0;
		q->qR = net_random() & q->Rmask;
		sch->qstats.overlimits++;
		q->early++;
		goto drop;
	}
	q->qR = net_random() & q->Rmask;
	goto enqueue;
}

static int gred_requeue(struct sk_buff *skb, struct Qdisc *sch)
{
	struct gred_sched_data *q;
	struct gred_sched *t = qdisc_priv(sch);

	q = t->tab[skb->tc_index & 0xf];
	/* error checking here -- probably unnecessary */
	PSCHED_SET_PASTPERFECT(q->qidlestart);

	__skb_queue_head(&sch->q, skb);
	sch->qstats.backlog += skb->len;
	sch->qstats.requeues++;
	q->backlog += skb->len;
	return 0;
}

static struct sk_buff *gred_dequeue(struct Qdisc *sch)
{
	struct sk_buff *skb;
	struct gred_sched_data *q;
	struct gred_sched *t = qdisc_priv(sch);

	skb = __skb_dequeue(&sch->q);
	if (skb) {
		sch->qstats.backlog -= skb->len;
		q = t->tab[skb->tc_index & 0xf];
		if (q) {
			q->backlog -= skb->len;
			if (!q->backlog && !gred_wred_mode(t))
				PSCHED_GET_TIME(q->qidlestart);
		} else {
			D2PRINTK("gred_dequeue: skb has bad tcindex %x\n",
				 skb->tc_index & 0xf);
		}
		return skb;
	}

	if (gred_wred_mode(t)) {
		q = t->tab[t->def];
		if (!q)
			D2PRINTK("no default VQ set: Results will be "
				 "screwed up\n");
		else
			PSCHED_GET_TIME(q->qidlestart);
	}

	return NULL;
}

static unsigned int gred_drop(struct Qdisc *sch)
{
	struct sk_buff *skb;
	struct gred_sched_data *q;
	struct gred_sched *t = qdisc_priv(sch);

	skb = __skb_dequeue_tail(&sch->q);
	if (skb) {
		unsigned int len = skb->len;

		sch->qstats.backlog -= len;
		sch->qstats.drops++;
		q = t->tab[skb->tc_index & 0xf];
		if (q) {
			q->backlog -= len;
			q->other++;
			if (!q->backlog && !gred_wred_mode(t))
				PSCHED_GET_TIME(q->qidlestart);
		} else {
			D2PRINTK("gred_drop: skb has bad tcindex %x\n",
				 skb->tc_index & 0xf);
		}

		kfree_skb(skb);
		return len;
	}

	q = t->tab[t->def];
	if (!q) {
		D2PRINTK("no default VQ set: Results might be screwed up\n");
		return 0;
	}

	PSCHED_GET_TIME(q->qidlestart);
	return 0;
}

static void gred_reset(struct Qdisc *sch)
{
	int i;
	struct gred_sched_data *q;
	struct gred_sched *t = qdisc_priv(sch);

	__skb_queue_purge(&sch->q);

	sch->qstats.backlog = 0;

	for (i = 0; i < t->DPs; i++) {
		q = t->tab[i];
		if (!q)
			continue;
		PSCHED_SET_PASTPERFECT(q->qidlestart);
		q->qave = 0;
		q->qcount = -1;
		q->backlog = 0;
		q->other = 0;
		q->forced = 0;
		q->pdrop = 0;
		q->early = 0;
	}
}

static inline void gred_destroy_vq(struct gred_sched_data *q)
{
	kfree(q);
}

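/*
 * Apply a table definition change carried in a tc_gred_sopt (the
 * TCA_GRED_DPS attribute): update the number of DPs and the default DP,
 * select the operating mode, and destroy VQs that became unreachable.
 */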
static inline int gred_change_table_def(struct Qdisc *sch, struct rtattr *dps)
{
	struct gred_sched *table = qdisc_priv(sch);
	struct tc_gred_sopt *sopt;
	int i;

	if (dps == NULL || RTA_PAYLOAD(dps) < sizeof(*sopt))
		return -EINVAL;

	sopt = RTA_DATA(dps);

	if (sopt->DPs > MAX_DPs || sopt->DPs == 0 || sopt->def_DP >= sopt->DPs)
		return -EINVAL;

	sch_tree_lock(sch);
	table->DPs = sopt->DPs;
	table->def = sopt->def_DP;

	/*
	 * Every entry point to GRED is synchronized with the above code
	 * and the DP is checked against DPs, i.e. shadowed VQs can no
	 * longer be found so we can unlock right here.
	 */
	sch_tree_unlock(sch);

	if (sopt->grio) {
		gred_enable_rio_mode(table);
		gred_disable_wred_mode(table);
		if (gred_wred_mode_check(sch))
			gred_enable_wred_mode(table);
	} else {
		gred_disable_rio_mode(table);
		gred_disable_wred_mode(table);
	}

	for (i = table->DPs; i < MAX_DPs; i++) {
		if (table->tab[i]) {
			printk(KERN_WARNING "GRED: Warning: Destroying "
			       "shadowed VQ 0x%x\n", i);
			gred_destroy_vq(table->tab[i]);
			table->tab[i] = NULL;
		}
	}

	table->initd = 0;

	return 0;
}

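/*
 * Create or update a single VQ from a tc_gred_qopt. The thresholds
 * arrive unscaled from user space and are stored scaled by 2^Wlog;
 * the averaging state and drop counters are reset.
 */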
static inline int gred_change_vq(struct Qdisc *sch, int dp,
				 struct tc_gred_qopt *ctl, int prio, u8 *stab)
{
	struct gred_sched *table = qdisc_priv(sch);
	struct gred_sched_data *q;

	if (table->tab[dp] == NULL) {
		table->tab[dp] = kmalloc(sizeof(*q), GFP_KERNEL);
		if (table->tab[dp] == NULL)
			return -ENOMEM;
		memset(table->tab[dp], 0, sizeof(*q));
	}

	q = table->tab[dp];
	q->DP = dp;
	q->prio = prio;

	q->Wlog = ctl->Wlog;
	q->Plog = ctl->Plog;
	q->limit = ctl->limit;
	q->Scell_log = ctl->Scell_log;
	q->Rmask = ctl->Plog < 32 ? ((1 << ctl->Plog) - 1) : ~0UL;
	q->Scell_max = (255 << q->Scell_log);
	q->qth_min = ctl->qth_min << ctl->Wlog;
	q->qth_max = ctl->qth_max << ctl->Wlog;
	q->qave = 0;
	q->backlog = 0;
	q->qcount = -1;
	q->other = 0;
	q->forced = 0;
	q->pdrop = 0;
	q->early = 0;

	PSCHED_SET_PASTPERFECT(q->qidlestart);
	memcpy(q->Stab, stab, 256);

	return 0;
}

static int gred_change(struct Qdisc *sch, struct rtattr *opt)
{
	struct gred_sched *table = qdisc_priv(sch);
	struct tc_gred_qopt *ctl;
	struct rtattr *tb[TCA_GRED_MAX];
	int err = -EINVAL, prio = GRED_DEF_PRIO;
	u8 *stab;

	if (opt == NULL || rtattr_parse_nested(tb, TCA_GRED_MAX, opt))
		return -EINVAL;

	/* gred_change_table_def() expects the TCA_GRED_DPS attribute,
	 * not TCA_OPTIONS itself. */
	if (tb[TCA_GRED_PARMS-1] == NULL && tb[TCA_GRED_STAB-1] == NULL)
		return gred_change_table_def(sch, tb[TCA_GRED_DPS-1]);

	if (tb[TCA_GRED_PARMS-1] == NULL ||
	    RTA_PAYLOAD(tb[TCA_GRED_PARMS-1]) < sizeof(*ctl) ||
	    tb[TCA_GRED_STAB-1] == NULL ||
	    RTA_PAYLOAD(tb[TCA_GRED_STAB-1]) < 256)
		return -EINVAL;

	ctl = RTA_DATA(tb[TCA_GRED_PARMS-1]);
	stab = RTA_DATA(tb[TCA_GRED_STAB-1]);

	if (ctl->DP >= table->DPs)
		goto errout;

	if (gred_rio_mode(table)) {
		if (ctl->prio == 0) {
			int def_prio = GRED_DEF_PRIO;

			if (table->tab[table->def])
				def_prio = table->tab[table->def]->prio;

			printk(KERN_DEBUG "GRED: DP %u does not have a prio, "
			       "setting default to %d\n", ctl->DP, def_prio);

			prio = def_prio;
		} else
			prio = ctl->prio;
	}

	sch_tree_lock(sch);

	err = gred_change_vq(sch, ctl->DP, ctl, prio, stab);
	if (err < 0)
		goto errout_locked;

	if (table->tab[table->def] == NULL) {
		if (gred_rio_mode(table))
			prio = table->tab[ctl->DP]->prio;

		err = gred_change_vq(sch, table->def, ctl, prio, stab);
		if (err < 0)
			goto errout_locked;
	}

	table->initd = 1;

	if (gred_rio_mode(table)) {
		gred_disable_wred_mode(table);
		if (gred_wred_mode_check(sch))
			gred_enable_wred_mode(table);
	}

	err = 0;

errout_locked:
	sch_tree_unlock(sch);
errout:
	return err;
}

static int gred_init(struct Qdisc *sch, struct rtattr *opt)
{
	struct rtattr *tb[TCA_GRED_MAX];

	if (opt == NULL || rtattr_parse_nested(tb, TCA_GRED_MAX, opt))
		return -EINVAL;

	if (tb[TCA_GRED_PARMS-1] || tb[TCA_GRED_STAB-1])
		return -EINVAL;

	return gred_change_table_def(sch, tb[TCA_GRED_DPS-1]);
}

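/*
 * Dump format: a TCA_GRED_DPS attribute carrying the table definition,
 * then a TCA_GRED_PARMS attribute with MAX_DPs tc_gred_qopt entries
 * appended back to back; a missing VQ is flagged by DP >= MAX_DPs.
 */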
static int gred_dump(struct Qdisc *sch, struct sk_buff *skb)
{
	struct gred_sched *table = qdisc_priv(sch);
	struct rtattr *parms, *opts = NULL;
	int i;
	struct tc_gred_sopt sopt = {
		.DPs	= table->DPs,
		.def_DP	= table->def,
		.grio	= gred_rio_mode(table),
	};

	opts = RTA_NEST(skb, TCA_OPTIONS);
	RTA_PUT(skb, TCA_GRED_DPS, sizeof(sopt), &sopt);
	parms = RTA_NEST(skb, TCA_GRED_PARMS);

	for (i = 0; i < MAX_DPs; i++) {
		struct gred_sched_data *q = table->tab[i];
		struct tc_gred_qopt opt;

		memset(&opt, 0, sizeof(opt));

		if (!q) {
			/* hack -- fix at some point with proper message
			   This is how we indicate to tc that there is no VQ
			   at this DP */

			opt.DP = MAX_DPs + i;
			goto append_opt;
		}

		opt.limit	= q->limit;
		opt.DP		= q->DP;
		opt.backlog	= q->backlog;
		opt.prio	= q->prio;
		opt.qth_min	= q->qth_min >> q->Wlog;
		opt.qth_max	= q->qth_max >> q->Wlog;
		opt.Wlog	= q->Wlog;
		opt.Plog	= q->Plog;
		opt.Scell_log	= q->Scell_log;
		opt.other	= q->other;
		opt.early	= q->early;
		opt.forced	= q->forced;
		opt.pdrop	= q->pdrop;
		opt.packets	= q->packetsin;
		opt.bytesin	= q->bytesin;

		if (q->qave) {
			if (gred_wred_mode(table)) {
				q->qidlestart = table->tab[table->def]->qidlestart;
				q->qave = table->tab[table->def]->qave;
			}
			if (!PSCHED_IS_PASTPERFECT(q->qidlestart)) {
				long idle;
				unsigned long qave;
				psched_time_t now;

				PSCHED_GET_TIME(now);
				idle = PSCHED_TDIFF_SAFE(now, q->qidlestart,
							 q->Scell_max);
				qave = q->qave >> q->Stab[(idle >> q->Scell_log) & 0xFF];
				opt.qave = qave >> q->Wlog;
			} else {
				opt.qave = q->qave >> q->Wlog;
			}
		}

append_opt:
		RTA_APPEND(skb, sizeof(opt), &opt);
	}

	RTA_NEST_END(skb, parms);

	return RTA_NEST_END(skb, opts);

rtattr_failure:
	return RTA_NEST_CANCEL(skb, opts);
}

static void gred_destroy(struct Qdisc *sch)
{
	struct gred_sched *table = qdisc_priv(sch);
	int i;

	for (i = 0; i < table->DPs; i++) {
		if (table->tab[i])
			gred_destroy_vq(table->tab[i]);
	}
}

static struct Qdisc_ops gred_qdisc_ops = {
	.next		=	NULL,
	.cl_ops		=	NULL,
	.id		=	"gred",
	.priv_size	=	sizeof(struct gred_sched),
	.enqueue	=	gred_enqueue,
	.dequeue	=	gred_dequeue,
	.requeue	=	gred_requeue,
	.drop		=	gred_drop,
	.init		=	gred_init,
	.reset		=	gred_reset,
	.destroy	=	gred_destroy,
	.change		=	gred_change,
	.dump		=	gred_dump,
	.owner		=	THIS_MODULE,
};

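/*
 * For illustration, a typical configuration from user space with
 * iproute2's tc (a sketch only; exact option syntax depends on the
 * tc version):
 *
 *	# create the table: 4 VQs, DP 0 as default, priorities enabled
 *	tc qdisc add dev eth0 root gred setup DPs 4 default 0 grio
 *
 *	# parameterize one VQ
 *	tc qdisc change dev eth0 root gred DP 1 prio 2 limit 60KB \
 *		min 15KB max 45KB burst 20 avpkt 1000 \
 *		bandwidth 10Mbit probability 0.02
 */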
static int __init gred_module_init(void)
{
	return register_qdisc(&gred_qdisc_ops);
}

static void __exit gred_module_exit(void)
{
	unregister_qdisc(&gred_qdisc_ops);
}

module_init(gred_module_init)
module_exit(gred_module_exit)

MODULE_LICENSE("GPL");