break trunk temporary - upgrade to 2.6.21.1 and iptables 1.3.7
[openwrt/staging/chunkeey.git] / target / linux / generic-2.6 / patches / 200-sched_esfq.patch
index 6830b833ad786d8384fb5006cb0faf91bd8ca02e..dba20cff28f9dcb0d284376da21801bf8ef57f06 100644 (file)
@@ -1,7 +1,7 @@
-diff -urN linux-2.6.19.old/include/linux/pkt_sched.h linux-2.6.19.dev/include/linux/pkt_sched.h
---- linux-2.6.19.old/include/linux/pkt_sched.h 2006-11-29 22:57:37.000000000 +0100
-+++ linux-2.6.19.dev/include/linux/pkt_sched.h 2006-12-14 03:13:51.000000000 +0100
-@@ -146,8 +146,35 @@
+diff -Naur linux-2.6.20.orig/include/linux/pkt_sched.h linux-2.6.20/include/linux/pkt_sched.h
+--- linux-2.6.20.orig/include/linux/pkt_sched.h        2007-02-04 10:44:54.000000000 -0800
++++ linux-2.6.20/include/linux/pkt_sched.h     2007-02-14 23:58:41.000000000 -0800
+@@ -146,8 +146,40 @@
   *
   *    The only reason for this is efficiency, it is possible
   *    to change these parameters in compile time.
@@ -22,6 +22,11 @@ diff -urN linux-2.6.19.old/include/linux/pkt_sched.h linux-2.6.19.dev/include/li
 +      TCA_SFQ_HASH_DSTDIR,
 +      TCA_SFQ_HASH_SRCDIR,
 +      TCA_SFQ_HASH_FWMARKDIR,
++      /* conntrack */
++      TCA_SFQ_HASH_CTORIGDST,
++      TCA_SFQ_HASH_CTORIGSRC,
++      TCA_SFQ_HASH_CTREPLDST,
++      TCA_SFQ_HASH_CTREPLSRC,
 +};
 +
 +struct tc_esfq_qopt
@@ -37,31 +42,29 @@ diff -urN linux-2.6.19.old/include/linux/pkt_sched.h linux-2.6.19.dev/include/li
  /* RED section */
  
  enum
-diff -urN linux-2.6.19.old/net/sched/Kconfig linux-2.6.19.dev/net/sched/Kconfig
---- linux-2.6.19.old/net/sched/Kconfig 2006-11-29 22:57:37.000000000 +0100
-+++ linux-2.6.19.dev/net/sched/Kconfig 2006-12-14 03:13:51.000000000 +0100
-@@ -185,6 +185,28 @@
+diff -Naur linux-2.6.20.orig/net/sched/Kconfig linux-2.6.20/net/sched/Kconfig
+--- linux-2.6.20.orig/net/sched/Kconfig        2007-02-04 10:44:54.000000000 -0800
++++ linux-2.6.20/net/sched/Kconfig     2007-02-14 23:58:41.000000000 -0800
+@@ -189,6 +189,26 @@
          To compile this code as a module, choose M here: the
          module will be called sch_sfq.
  
 +config NET_SCH_ESFQ
-+      tristate "ESFQ queue"
-+      depends on NET_SCHED
++      tristate "Enhanced Stochastic Fairness Queueing (ESFQ)"
 +      ---help---
 +        Say Y here if you want to use the Enhanced Stochastic Fairness
 +        Queueing (ESFQ) packet scheduling algorithm for some of your network
 +        devices or as a leaf discipline for a classful qdisc such as HTB or
 +        CBQ (see the top of <file:net/sched/sch_esfq.c> for details and
 +        references to the SFQ algorithm).
-+        
++
 +        This is an enchanced SFQ version which allows you to control some
-+        hardcoded values in the SFQ scheduler: queue depth, hash table size,
-+        and queues limit.
-+        
-+        ESFQ also adds control to the hash function used to identify packet
-+        flows. The original SFQ hashes by individual flow (TCP session or UDP
-+        stream); ESFQ can hash by src or dst IP as well, which can be more
-+        fair to users in some networking situations.
++        hardcoded values in the SFQ scheduler.
++
++        ESFQ also adds control of the hash function used to identify packet
++        flows. The original SFQ discipline hashes by connection; ESFQ adds
++        several other hashing methods, such as by src IP or by dst IP, which
++        can be more fair to users in some networking situations.
 +        
 +        To compile this code as a module, choose M here: the
 +        module will be called sch_esfq.
@@ -69,10 +72,10 @@ diff -urN linux-2.6.19.old/net/sched/Kconfig linux-2.6.19.dev/net/sched/Kconfig
  config NET_SCH_TEQL
        tristate "True Link Equalizer (TEQL)"
        ---help---
-diff -urN linux-2.6.19.old/net/sched/Makefile linux-2.6.19.dev/net/sched/Makefile
---- linux-2.6.19.old/net/sched/Makefile        2006-11-29 22:57:37.000000000 +0100
-+++ linux-2.6.19.dev/net/sched/Makefile        2006-12-14 03:13:51.000000000 +0100
-@@ -23,6 +23,7 @@
+diff -Naur linux-2.6.20.orig/net/sched/Makefile linux-2.6.20/net/sched/Makefile
+--- linux-2.6.20.orig/net/sched/Makefile       2007-02-04 10:44:54.000000000 -0800
++++ linux-2.6.20/net/sched/Makefile    2007-02-14 23:58:41.000000000 -0800
+@@ -24,6 +24,7 @@
  obj-$(CONFIG_NET_SCH_INGRESS) += sch_ingress.o 
  obj-$(CONFIG_NET_SCH_DSMARK)  += sch_dsmark.o
  obj-$(CONFIG_NET_SCH_SFQ)     += sch_sfq.o
@@ -80,10 +83,10 @@ diff -urN linux-2.6.19.old/net/sched/Makefile linux-2.6.19.dev/net/sched/Makefil
  obj-$(CONFIG_NET_SCH_TBF)     += sch_tbf.o
  obj-$(CONFIG_NET_SCH_TEQL)    += sch_teql.o
  obj-$(CONFIG_NET_SCH_PRIO)    += sch_prio.o
-diff -urN linux-2.6.19.old/net/sched/sch_esfq.c linux-2.6.19.dev/net/sched/sch_esfq.c
---- linux-2.6.19.old/net/sched/sch_esfq.c      1970-01-01 01:00:00.000000000 +0100
-+++ linux-2.6.19.dev/net/sched/sch_esfq.c      2006-12-14 03:13:51.000000000 +0100
-@@ -0,0 +1,644 @@
+diff -Naur linux-2.6.20.orig/net/sched/sch_esfq.c linux-2.6.20/net/sched/sch_esfq.c
+--- linux-2.6.20.orig/net/sched/sch_esfq.c     1969-12-31 16:00:00.000000000 -0800
++++ linux-2.6.20/net/sched/sch_esfq.c  2007-02-15 00:19:56.000000000 -0800
+@@ -0,0 +1,704 @@
 +/*
 + * net/sched/sch_esfq.c       Extended Stochastic Fairness Queueing discipline.
 + *
@@ -103,12 +106,12 @@ diff -urN linux-2.6.19.old/net/sched/sch_esfq.c linux-2.6.19.dev/net/sched/sch_e
 + *
 + *            Corey Hickey, <bugfood-c@fatooh.org>
 + *            Maintenance of the Linux 2.6 port.
-+ *            Added fwmark hash (thanks to Robert Kurjata)
++ *            Added fwmark hash (thanks to Robert Kurjata).
 + *            Added direct hashing for src, dst, and fwmark.
++ *            Added usage of jhash.
 + *            
 + */
 +
-+#include <linux/autoconf.h>
 +#include <linux/module.h>
 +#include <asm/uaccess.h>
 +#include <asm/system.h>
@@ -135,12 +138,16 @@ diff -urN linux-2.6.19.old/net/sched/sch_esfq.c linux-2.6.19.dev/net/sched/sch_e
 +#include <linux/skbuff.h>
 +#include <net/sock.h>
 +#include <net/pkt_sched.h>
++#include <linux/jhash.h>
 +
++#ifdef CONFIG_NF_CONNTRACK_ENABLED
++#include <net/netfilter/nf_conntrack.h>
++#endif
 +
 +/*    Stochastic Fairness Queuing algorithm.
 +      For more comments look at sch_sfq.c.
 +      The difference is that you can change limit, depth,
-+      hash table size and choose 7 hash types.
++      hash table size and choose alternate hash types.
 +      
 +      classic:        same as in sch_sfq.c
 +      dst:            destination IP address
@@ -149,9 +156,11 @@ diff -urN linux-2.6.19.old/net/sched/sch_esfq.c linux-2.6.19.dev/net/sched/sch_e
 +      dst_direct:
 +      src_direct:
 +      fwmark_direct:  direct hashing of the above sources
++      ctorigdst:      original destination IP address
++      ctorigsrc:      original source IP address
++      ctrepldst:      reply destination IP address
++      ctreplsrc:      reply source IP address
 +      
-+      TODO: 
-+              make sfq_change work.
 +*/
 +
 +
@@ -190,20 +199,24 @@ diff -urN linux-2.6.19.old/net/sched/sch_esfq.c linux-2.6.19.dev/net/sched/sch_e
 +      unsigned        dyn_range;      /*                                 saved range */
 +};
 +
-+static __inline__ unsigned esfq_hash_u32(struct esfq_sched_data *q,u32 h)
++/* This contains the info we will hash. */
++struct esfq_packet_info
 +{
-+      int pert = q->perturbation;
-+
-+      if (pert)
-+              h = (h<<pert) ^ (h>>(0x1F - pert));
-+
-+      h = ntohl(h) * 2654435761UL;
-+      return h & (q->hash_divisor-1);
-+}
++      u32     proto;          /* protocol or port */
++      u32     src;            /* source from packet header */
++      u32     dst;            /* destination from packet header */
++      u32     ctorigsrc;      /* original source from conntrack */
++      u32     ctorigdst;      /* original destination from conntrack */
++      u32     ctreplsrc;      /* reply source from conntrack */
++      u32     ctrepldst;      /* reply destination from conntrack */
++      u32     mark;           /* netfilter mark (fwmark) */
++};
 +
 +/* Hash input values directly into the "nearest" slot, taking into account the
 + * range of input values seen. This is most useful when the hash table is at
-+ * least as large as the range of possible values. */
++ * least as large as the range of possible values.
++ * Note: this functionality was added before the change to using jhash, and may
++ * no longer be useful. */
 +static __inline__ unsigned esfq_hash_direct(struct esfq_sched_data *q, u32 h)
 +{
 +      /* adjust minimum and maximum */
@@ -224,83 +237,128 @@ diff -urN linux-2.6.19.old/net/sched/sch_esfq.c linux-2.6.19.dev/net/sched/sch_e
 +              return (h - q->dyn_min) * (q->hash_divisor - 1) / q->dyn_range;
 +}
 +
-+static __inline__ unsigned esfq_fold_hash_classic(struct esfq_sched_data *q, u32 h, u32 h1)
++static __inline__ unsigned esfq_jhash_1word(struct esfq_sched_data *q,u32 a)
 +{
-+      int pert = q->perturbation;
++      return jhash_1word(a, q->perturbation) & (q->hash_divisor-1);
++}
 +
-+      /* Have we any rotation primitives? If not, WHY? */
-+      h ^= (h1<<pert) ^ (h1>>(0x1F - pert));
-+      h ^= h>>10;
-+      return h & (q->hash_divisor-1);
++static __inline__ unsigned esfq_jhash_2words(struct esfq_sched_data *q, u32 a, u32 b)
++{
++      return jhash_2words(a, b, q->perturbation) & (q->hash_divisor-1);
 +}
 +
-+static unsigned esfq_hash(struct esfq_sched_data *q, struct sk_buff *skb)
++static __inline__ unsigned esfq_jhash_3words(struct esfq_sched_data *q, u32 a, u32 b, u32 c)
 +{
-+      u32 h, h2;
-+      u32 hs;
-+      u32 nfm;
++      return jhash_3words(a, b, c, q->perturbation) & (q->hash_divisor-1);
++}
 +
++
++static unsigned esfq_hash(struct esfq_sched_data *q, struct sk_buff *skb)
++{
++      struct esfq_packet_info info;
++#ifdef CONFIG_NF_CONNTRACK_ENABLED
++      enum ip_conntrack_info ctinfo;
++      struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
++#endif
++      
 +      switch (skb->protocol) {
 +      case __constant_htons(ETH_P_IP):
 +      {
 +              struct iphdr *iph = skb->nh.iph;
-+              h = iph->daddr;
-+              hs = iph->saddr;
-+              nfm = skb->nfmark;
-+              h2 = hs^iph->protocol;
++              info.dst = iph->daddr;
++              info.src = iph->saddr;
 +              if (!(iph->frag_off&htons(IP_MF|IP_OFFSET)) &&
 +                  (iph->protocol == IPPROTO_TCP ||
 +                   iph->protocol == IPPROTO_UDP ||
 +                   iph->protocol == IPPROTO_SCTP ||
 +                   iph->protocol == IPPROTO_DCCP ||
 +                   iph->protocol == IPPROTO_ESP))
-+                      h2 ^= *(((u32*)iph) + iph->ihl);
++                      info.proto = *(((u32*)iph) + iph->ihl);
++              else
++                      info.proto = iph->protocol;
 +              break;
 +      }
 +      case __constant_htons(ETH_P_IPV6):
 +      {
 +              struct ipv6hdr *iph = skb->nh.ipv6h;
-+              h = iph->daddr.s6_addr32[3];
-+              hs = iph->saddr.s6_addr32[3];
-+              nfm = skb->nfmark;
-+              h2 = hs^iph->nexthdr;
++              /* Hash ipv6 addresses into a u32. This isn't ideal,
++               * but the code is simple. */
++              info.dst = jhash2(iph->daddr.s6_addr32, 4, q->perturbation);
++              info.src = jhash2(iph->saddr.s6_addr32, 4, q->perturbation);
 +              if (iph->nexthdr == IPPROTO_TCP ||
 +                  iph->nexthdr == IPPROTO_UDP ||
 +                  iph->nexthdr == IPPROTO_SCTP ||
 +                  iph->nexthdr == IPPROTO_DCCP ||
 +                  iph->nexthdr == IPPROTO_ESP)
-+                      h2 ^= *(u32*)&iph[1];
++                      info.proto = *(u32*)&iph[1];
++              else
++                      info.proto = iph->nexthdr;
 +              break;
 +      }
 +      default:
-+              h = (u32)(unsigned long)skb->dst;
-+              hs = (u32)(unsigned long)skb->sk;
-+              nfm = skb->nfmark;
-+              h2 = hs^skb->protocol;
++              info.dst   = (u32)(unsigned long)skb->dst;
++              info.src   = (u32)(unsigned long)skb->sk;
++              info.proto = skb->protocol;
++      }
++
++      info.mark = skb->mark;
++
++#ifdef CONFIG_NF_CONNTRACK_ENABLED
++      /* defaults if there is no conntrack info */
++      info.ctorigsrc = info.src;
++      info.ctorigdst = info.dst;
++      info.ctreplsrc = info.dst;
++      info.ctrepldst = info.src;
++      /* collect conntrack info */
++      if (ct && ct != &nf_conntrack_untracked) {
++              if (skb->protocol == __constant_htons(ETH_P_IP)) {
++                      info.ctorigsrc = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.ip;
++                      info.ctorigdst = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u3.ip;
++                      info.ctreplsrc = ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u3.ip;
++                      info.ctrepldst = ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3.ip;
++              }
++              else if (skb->protocol == __constant_htons(ETH_P_IPV6)) {
++                      /* Again, hash ipv6 addresses into a single u32. */
++                      info.ctorigsrc = jhash2(ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.ip6, 4, q->perturbation);
++                      info.ctorigdst = jhash2(ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u3.ip6, 4, q->perturbation);
++                      info.ctreplsrc = jhash2(ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u3.ip6, 4, q->perturbation);
++                      info.ctrepldst = jhash2(ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3.ip6, 4, q->perturbation);
++              }
++
 +      }
++#endif
++
 +      switch(q->hash_kind)
 +      {
 +      case TCA_SFQ_HASH_CLASSIC:
-+              return esfq_fold_hash_classic(q, h, h2);
++              return esfq_jhash_3words(q, info.dst, info.src, info.proto);
 +      case TCA_SFQ_HASH_DST:
-+              return esfq_hash_u32(q,h);
++              return esfq_jhash_1word(q, info.dst);
 +      case TCA_SFQ_HASH_DSTDIR:
-+              return esfq_hash_direct(q, ntohl(h));
++              return esfq_hash_direct(q, ntohl(info.dst));
 +      case TCA_SFQ_HASH_SRC:
-+              return esfq_hash_u32(q,hs);
++              return esfq_jhash_1word(q, info.src);
 +      case TCA_SFQ_HASH_SRCDIR:
-+              return esfq_hash_direct(q, ntohl(hs));
-+#ifdef CONFIG_NETFILTER
++              return esfq_hash_direct(q, ntohl(info.src));
 +      case TCA_SFQ_HASH_FWMARK:
-+              return esfq_hash_u32(q,nfm);
++              return esfq_jhash_1word(q, info.mark);
 +      case TCA_SFQ_HASH_FWMARKDIR:
-+              return esfq_hash_direct(q,nfm);
++              return esfq_hash_direct(q, info.mark);
++#ifdef CONFIG_NF_CONNTRACK_ENABLED
++      case TCA_SFQ_HASH_CTORIGDST:
++              return esfq_jhash_1word(q, info.ctorigdst);
++      case TCA_SFQ_HASH_CTORIGSRC:
++              return esfq_jhash_1word(q, info.ctorigsrc);
++      case TCA_SFQ_HASH_CTREPLDST:
++              return esfq_jhash_1word(q, info.ctrepldst);
++      case TCA_SFQ_HASH_CTREPLSRC:
++              return esfq_jhash_1word(q, info.ctreplsrc);
 +#endif
 +      default:
 +              if (net_ratelimit())
 +                      printk(KERN_WARNING "ESFQ: Unknown hash method. Falling back to classic.\n");
 +      }
-+      return esfq_fold_hash_classic(q, h, h2);
++      return esfq_jhash_3words(q, info.dst, info.src, info.proto);
 +}
 +
 +static inline void esfq_link(struct esfq_sched_data *q, esfq_index x)
@@ -365,6 +423,7 @@ diff -urN linux-2.6.19.old/net/sched/sch_esfq.c linux-2.6.19.dev/net/sched/sch_e
 +              esfq_dec(q, x);
 +              sch->q.qlen--;
 +              sch->qstats.drops++;
++              sch->qstats.backlog -= len;
 +              return len;
 +      }
 +
@@ -381,6 +440,7 @@ diff -urN linux-2.6.19.old/net/sched/sch_esfq.c linux-2.6.19.dev/net/sched/sch_e
 +              sch->q.qlen--;
 +              q->ht[q->hash[d]] = q->depth;
 +              sch->qstats.drops++;
++              sch->qstats.backlog -= len;
 +              return len;
 +      }
 +
@@ -400,6 +460,7 @@ diff -urN linux-2.6.19.old/net/sched/sch_esfq.c linux-2.6.19.dev/net/sched/sch_e
 +              q->ht[hash] = x = q->dep[depth].next;
 +              q->hash[x] = hash;
 +      }
++      sch->qstats.backlog += skb->len;
 +      __skb_queue_tail(&q->qs[x], skb);
 +      esfq_inc(q, x);
 +      if (q->qs[x].qlen == 1) {               /* The flow is new */
@@ -436,6 +497,7 @@ diff -urN linux-2.6.19.old/net/sched/sch_esfq.c linux-2.6.19.dev/net/sched/sch_e
 +              q->ht[hash] = x = q->dep[depth].next;
 +              q->hash[x] = hash;
 +      }
++      sch->qstats.backlog += skb->len;
 +      __skb_queue_head(&q->qs[x], skb);
 +      esfq_inc(q, x);
 +      if (q->qs[x].qlen == 1) {               /* The flow is new */
@@ -480,6 +542,7 @@ diff -urN linux-2.6.19.old/net/sched/sch_esfq.c linux-2.6.19.dev/net/sched/sch_e
 +      skb = __skb_dequeue(&q->qs[a]);
 +      esfq_dec(q, a);
 +      sch->q.qlen--;
++      sch->qstats.backlog -= skb->len;
 +      
 +      /* Is the slot empty? */
 +      if (q->qs[a].qlen == 0) {
@@ -542,7 +605,7 @@ diff -urN linux-2.6.19.old/net/sched/sch_esfq.c linux-2.6.19.dev/net/sched/sch_e
 +      
 +      if (ctl->hash_kind) {
 +              q->hash_kind = ctl->hash_kind;
-+              if (q->hash_kind !=  TCA_SFQ_HASH_CLASSIC)
++              if (q->hash_kind != TCA_SFQ_HASH_CLASSIC)
 +                      q->perturb_period = 0;
 +      }
 +      
@@ -566,7 +629,7 @@ diff -urN linux-2.6.19.old/net/sched/sch_esfq.c linux-2.6.19.dev/net/sched/sch_e
 +{
 +      struct esfq_sched_data *q = qdisc_priv(sch);
 +      struct tc_esfq_qopt *ctl;
-+      esfq_index p = ~0UL/2;
++      esfq_index p = ~0U/2;
 +      int i;
 +      
 +      if (opt && opt->rta_len < RTA_LENGTH(sizeof(*ctl)))