Merge master.kernel.org:/pub/scm/linux/kernel/git/davem/net-2.6
author Linus Torvalds <torvalds@g5.osdl.org>
Fri, 9 Sep 2005 21:25:22 +0000 (14:25 -0700)
committer Linus Torvalds <torvalds@g5.osdl.org>
Fri, 9 Sep 2005 21:25:22 +0000 (14:25 -0700)
include/linux/dccp.h
net/dccp/ccids/ccid3.c
net/dccp/ccids/ccid3.h
net/dccp/ccids/lib/packet_history.h
net/dccp/dccp.h
net/dccp/input.c
net/dccp/ipv4.c
net/dccp/minisocks.c
net/dccp/options.c
net/ipv4/af_inet.c
net/ipv4/fib_trie.c

index 007c290..8bf4bac 100644 (file)
@@ -432,7 +432,10 @@ struct dccp_sock {
        struct ccid                     *dccps_hc_rx_ccid;
        struct ccid                     *dccps_hc_tx_ccid;
        struct dccp_options_received    dccps_options_received;
+       struct timeval                  dccps_epoch;
        enum dccp_role                  dccps_role:2;
+       __u8                            dccps_hc_rx_insert_options:1;
+       __u8                            dccps_hc_tx_insert_options:1;
 };
  
 static inline struct dccp_sock *dccp_sk(const struct sock *sk)
index 7bf3b3a..ea30012 100644 (file)
 #include "ccid3.h"
 
 /*
- * Reason for maths with 10 here is to avoid 32 bit overflow when a is big.
+ * Reason for maths here is to avoid 32 bit overflow when a is big.
+ * With this we get close to the limit.
  */
 static inline u32 usecs_div(const u32 a, const u32 b)
 {
-       const u32 tmp = a * (USEC_PER_SEC / 10);
-       return b > 20 ? tmp / (b / 10) : tmp;
+       const u32 div = a < (UINT_MAX / (USEC_PER_SEC /    10)) ?    10 :
+                       a < (UINT_MAX / (USEC_PER_SEC /    50)) ?    50 :
+                       a < (UINT_MAX / (USEC_PER_SEC /   100)) ?   100 :
+                       a < (UINT_MAX / (USEC_PER_SEC /   500)) ?   500 :
+                       a < (UINT_MAX / (USEC_PER_SEC /  1000)) ?  1000 :
+                       a < (UINT_MAX / (USEC_PER_SEC /  5000)) ?  5000 :
+                       a < (UINT_MAX / (USEC_PER_SEC / 10000)) ? 10000 :
+                       a < (UINT_MAX / (USEC_PER_SEC / 50000)) ? 50000 :
+                                                                100000;
+       const u32 tmp = a * (USEC_PER_SEC / div);
+       return (b >= 2 * div) ? tmp / (b / div) : tmp;
 }
 
 static int ccid3_debug;
@@ -102,8 +112,7 @@ static const char *ccid3_tx_state_name(enum ccid3_hc_tx_states state)
 static inline void ccid3_hc_tx_set_state(struct sock *sk,
                                         enum ccid3_hc_tx_states state)
 {
-       struct dccp_sock *dp = dccp_sk(sk);
-       struct ccid3_hc_tx_sock *hctx = dp->dccps_hc_tx_ccid_private;
+       struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk);
        enum ccid3_hc_tx_states oldstate = hctx->ccid3hctx_state;
 
        ccid3_pr_debug("%s(%p) %-8.8s -> %s\n",
@@ -144,8 +153,7 @@ static inline void ccid3_calc_new_delta(struct ccid3_hc_tx_sock *hctx)
  */ 
 static void ccid3_hc_tx_update_x(struct sock *sk)
 {
-       struct dccp_sock *dp = dccp_sk(sk);
-       struct ccid3_hc_tx_sock *hctx = dp->dccps_hc_tx_ccid_private;
+       struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk);
 
        /* To avoid large error in calcX */
        if (hctx->ccid3hctx_p >= TFRC_SMALLEST_P) {
@@ -159,7 +167,7 @@ static void ccid3_hc_tx_update_x(struct sock *sk)
        } else {
                struct timeval now;
 
-               do_gettimeofday(&now);
+               dccp_timestamp(sk, &now);
                if (timeval_delta(&now, &hctx->ccid3hctx_t_ld) >=
                    hctx->ccid3hctx_rtt) {
                        hctx->ccid3hctx_x = max_t(u32, min_t(u32, hctx->ccid3hctx_x_recv,
@@ -174,9 +182,8 @@ static void ccid3_hc_tx_update_x(struct sock *sk)
 static void ccid3_hc_tx_no_feedback_timer(unsigned long data)
 {
        struct sock *sk = (struct sock *)data;
-       struct dccp_sock *dp = dccp_sk(sk);
        unsigned long next_tmout = 0;
-       struct ccid3_hc_tx_sock *hctx = dp->dccps_hc_tx_ccid_private;
+       struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk);
 
        bh_lock_sock(sk);
        if (sock_owned_by_user(sk)) {
@@ -274,7 +281,7 @@ static int ccid3_hc_tx_send_packet(struct sock *sk,
                                   struct sk_buff *skb, int len)
 {
        struct dccp_sock *dp = dccp_sk(sk);
-       struct ccid3_hc_tx_sock *hctx = dp->dccps_hc_tx_ccid_private;
+       struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk);
        struct dccp_tx_hist_entry *new_packet;
        struct timeval now;
        long delay;
@@ -307,7 +314,7 @@ static int ccid3_hc_tx_send_packet(struct sock *sk,
                dccp_tx_hist_add_entry(&hctx->ccid3hctx_hist, new_packet);
        }
 
-       do_gettimeofday(&now);
+       dccp_timestamp(sk, &now);
 
        switch (hctx->ccid3hctx_state) {
        case TFRC_SSTATE_NO_SENT:
@@ -348,18 +355,20 @@ static int ccid3_hc_tx_send_packet(struct sock *sk,
        }
 
        /* Can we send? if so add options and add to packet history */
-       if (rc == 0)
+       if (rc == 0) {
+               dp->dccps_hc_tx_insert_options = 1;
                new_packet->dccphtx_ccval =
                        DCCP_SKB_CB(skb)->dccpd_ccval =
                                hctx->ccid3hctx_last_win_count;
+       }
 out:
        return rc;
 }
 
 static void ccid3_hc_tx_packet_sent(struct sock *sk, int more, int len)
 {
-       struct dccp_sock *dp = dccp_sk(sk);
-       struct ccid3_hc_tx_sock *hctx = dp->dccps_hc_tx_ccid_private;
+       const struct dccp_sock *dp = dccp_sk(sk);
+       struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk);
        struct timeval now;
 
        BUG_ON(hctx == NULL);
@@ -370,7 +379,7 @@ static void ccid3_hc_tx_packet_sent(struct sock *sk, int more, int len)
                return;
        }
 
-       do_gettimeofday(&now);
+       dccp_timestamp(sk, &now);
 
        /* check if we have sent a data packet */
        if (len > 0) {
@@ -445,10 +454,11 @@ static void ccid3_hc_tx_packet_sent(struct sock *sk, int more, int len)
 
 static void ccid3_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
 {
-       struct dccp_sock *dp = dccp_sk(sk);
-       struct ccid3_hc_tx_sock *hctx = dp->dccps_hc_tx_ccid_private;
+       const struct dccp_sock *dp = dccp_sk(sk);
+       struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk);
        struct ccid3_options_received *opt_recv;
        struct dccp_tx_hist_entry *packet;
+       struct timeval now;
        unsigned long next_tmout; 
        u32 t_elapsed;
        u32 pinv;
@@ -471,7 +481,7 @@ static void ccid3_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
 
        opt_recv = &hctx->ccid3hctx_options_received;
 
-       t_elapsed = dp->dccps_options_received.dccpor_elapsed_time;
+       t_elapsed = dp->dccps_options_received.dccpor_elapsed_time * 10;
        x_recv = opt_recv->ccid3or_receive_rate;
        pinv = opt_recv->ccid3or_loss_event_rate;
 
@@ -496,9 +506,14 @@ static void ccid3_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
                }
 
                /* Update RTT */
-               r_sample = timeval_now_delta(&packet->dccphtx_tstamp);
-               /* FIXME: */
-               // r_sample -= usecs_to_jiffies(t_elapsed * 10);
+               dccp_timestamp(sk, &now);
+               r_sample = timeval_delta(&now, &packet->dccphtx_tstamp);
+               if (unlikely(r_sample <= t_elapsed))
+                       LIMIT_NETDEBUG(KERN_WARNING
+                                      "%s: r_sample=%uus, t_elapsed=%uus\n",
+                                      __FUNCTION__, r_sample, t_elapsed);
+               else
+                       r_sample -= t_elapsed;
 
                /* Update RTT estimate by 
                 * If (No feedback recv)
@@ -591,8 +606,7 @@ static void ccid3_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
 
 static void ccid3_hc_tx_insert_options(struct sock *sk, struct sk_buff *skb)
 {
-       const struct dccp_sock *dp = dccp_sk(sk);
-       struct ccid3_hc_tx_sock *hctx = dp->dccps_hc_tx_ccid_private;
+       struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk);
 
        if (hctx == NULL || !(sk->sk_state == DCCP_OPEN ||
                              sk->sk_state == DCCP_PARTOPEN))
@@ -606,8 +620,8 @@ static int ccid3_hc_tx_parse_options(struct sock *sk, unsigned char option,
                                     unsigned char *value)
 {
        int rc = 0;
-       struct dccp_sock *dp = dccp_sk(sk);
-       struct ccid3_hc_tx_sock *hctx = dp->dccps_hc_tx_ccid_private;
+       const struct dccp_sock *dp = dccp_sk(sk);
+       struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk);
        struct ccid3_options_received *opt_recv;
 
        if (hctx == NULL)
@@ -670,11 +684,11 @@ static int ccid3_hc_tx_init(struct sock *sk)
 
        ccid3_pr_debug("%s, sk=%p\n", dccp_role(sk), sk);
 
-       hctx = dp->dccps_hc_tx_ccid_private = kmalloc(sizeof(*hctx),
-                                                     gfp_any());
-       if (hctx == NULL)
+       dp->dccps_hc_tx_ccid_private = kmalloc(sizeof(*hctx), gfp_any());
+       if (dp->dccps_hc_tx_ccid_private == NULL)
                return -ENOMEM;
 
+       hctx = ccid3_hc_tx_sk(sk);
        memset(hctx, 0, sizeof(*hctx));
 
        if (dp->dccps_packet_size >= TFRC_MIN_PACKET_SIZE &&
@@ -696,7 +710,7 @@ static int ccid3_hc_tx_init(struct sock *sk)
 static void ccid3_hc_tx_exit(struct sock *sk)
 {
        struct dccp_sock *dp = dccp_sk(sk);
-       struct ccid3_hc_tx_sock *hctx = dp->dccps_hc_tx_ccid_private;
+       struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk);
 
        ccid3_pr_debug("%s, sk=%p\n", dccp_role(sk), sk);
        BUG_ON(hctx == NULL);
@@ -738,8 +752,7 @@ static const char *ccid3_rx_state_name(enum ccid3_hc_rx_states state)
 static inline void ccid3_hc_rx_set_state(struct sock *sk,
                                         enum ccid3_hc_rx_states state)
 {
-       struct dccp_sock *dp = dccp_sk(sk);
-       struct ccid3_hc_rx_sock *hcrx = dp->dccps_hc_rx_ccid_private;
+       struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk);
        enum ccid3_hc_rx_states oldstate = hcrx->ccid3hcrx_state;
 
        ccid3_pr_debug("%s(%p) %-8.8s -> %s\n",
@@ -751,14 +764,14 @@ static inline void ccid3_hc_rx_set_state(struct sock *sk,
 
 static void ccid3_hc_rx_send_feedback(struct sock *sk)
 {
+       struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk);
        struct dccp_sock *dp = dccp_sk(sk);
-       struct ccid3_hc_rx_sock *hcrx = dp->dccps_hc_rx_ccid_private;
        struct dccp_rx_hist_entry *packet;
        struct timeval now;
 
        ccid3_pr_debug("%s, sk=%p\n", dccp_role(sk), sk);
 
-       do_gettimeofday(&now);
+       dccp_timestamp(sk, &now);
 
        switch (hcrx->ccid3hcrx_state) {
        case TFRC_RSTATE_NO_DATA:
@@ -767,11 +780,8 @@ static void ccid3_hc_rx_send_feedback(struct sock *sk)
        case TFRC_RSTATE_DATA: {
                const u32 delta = timeval_delta(&now,
                                        &hcrx->ccid3hcrx_tstamp_last_feedback);
-
-               hcrx->ccid3hcrx_x_recv = (hcrx->ccid3hcrx_bytes_recv *
-                                         USEC_PER_SEC);
-               if (likely(delta > 1))
-                       hcrx->ccid3hcrx_x_recv /= delta;
+               hcrx->ccid3hcrx_x_recv = usecs_div(hcrx->ccid3hcrx_bytes_recv,
+                                                  delta);
        }
                break;
        default:
@@ -801,14 +811,14 @@ static void ccid3_hc_rx_send_feedback(struct sock *sk)
                hcrx->ccid3hcrx_pinv = ~0;
        else
                hcrx->ccid3hcrx_pinv = 1000000 / hcrx->ccid3hcrx_p;
+       dp->dccps_hc_rx_insert_options = 1;
        dccp_send_ack(sk);
 }
 
 static void ccid3_hc_rx_insert_options(struct sock *sk, struct sk_buff *skb)
 {
-       const struct dccp_sock *dp = dccp_sk(sk);
+       struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk);
        u32 x_recv, pinv;
-       struct ccid3_hc_rx_sock *hcrx = dp->dccps_hc_rx_ccid_private;
 
        if (hcrx == NULL || !(sk->sk_state == DCCP_OPEN ||
                              sk->sk_state == DCCP_PARTOPEN))
@@ -837,8 +847,7 @@ static void ccid3_hc_rx_insert_options(struct sock *sk, struct sk_buff *skb)
 
 static u32 ccid3_hc_rx_calc_first_li(struct sock *sk)
 {
-       struct dccp_sock *dp = dccp_sk(sk);
-       struct ccid3_hc_rx_sock *hcrx = dp->dccps_hc_rx_ccid_private;
+       struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk);
        struct dccp_rx_hist_entry *entry, *next, *tail = NULL;
        u32 rtt, delta, x_recv, fval, p, tmp2;
        struct timeval tstamp = { 0, };
@@ -889,10 +898,9 @@ found:
        if (rtt == 0)
                rtt = 1;
 
-       delta = timeval_now_delta(&hcrx->ccid3hcrx_tstamp_last_feedback);
-       x_recv = hcrx->ccid3hcrx_bytes_recv * USEC_PER_SEC;
-       if (likely(delta > 1))
-               x_recv /= delta;
+       dccp_timestamp(sk, &tstamp);
+       delta = timeval_delta(&tstamp, &hcrx->ccid3hcrx_tstamp_last_feedback);
+       x_recv = usecs_div(hcrx->ccid3hcrx_bytes_recv, delta);
 
        tmp1 = (u64)x_recv * (u64)rtt;
        do_div(tmp1,10000000);
@@ -911,8 +919,7 @@ found:
 
 static void ccid3_hc_rx_update_li(struct sock *sk, u64 seq_loss, u8 win_loss)
 {
-       struct dccp_sock *dp = dccp_sk(sk);
-       struct ccid3_hc_rx_sock *hcrx = dp->dccps_hc_rx_ccid_private;
+       struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk);
 
        if (seq_loss != DCCP_MAX_SEQNO + 1 &&
            list_empty(&hcrx->ccid3hcrx_li_hist)) {
@@ -930,8 +937,7 @@ static void ccid3_hc_rx_update_li(struct sock *sk, u64 seq_loss, u8 win_loss)
 
 static void ccid3_hc_rx_detect_loss(struct sock *sk)
 {
-       struct dccp_sock *dp = dccp_sk(sk);
-       struct ccid3_hc_rx_sock *hcrx = dp->dccps_hc_rx_ccid_private;
+       struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk);
        u8 win_loss;
        const u64 seq_loss = dccp_rx_hist_detect_loss(&hcrx->ccid3hcrx_hist,
                                                      &hcrx->ccid3hcrx_li_hist,
@@ -942,13 +948,12 @@ static void ccid3_hc_rx_detect_loss(struct sock *sk)
 
 static void ccid3_hc_rx_packet_recv(struct sock *sk, struct sk_buff *skb)
 {
-       struct dccp_sock *dp = dccp_sk(sk);
-       struct ccid3_hc_rx_sock *hcrx = dp->dccps_hc_rx_ccid_private;
+       struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk);
        const struct dccp_options_received *opt_recv;
        struct dccp_rx_hist_entry *packet;
        struct timeval now;
        u8 win_count;
-       u32 p_prev;
+       u32 p_prev, r_sample, t_elapsed;
        int ins;
 
        if (hcrx == NULL)
@@ -957,7 +962,7 @@ static void ccid3_hc_rx_packet_recv(struct sock *sk, struct sk_buff *skb)
        BUG_ON(!(hcrx->ccid3hcrx_state == TFRC_RSTATE_NO_DATA ||
                 hcrx->ccid3hcrx_state == TFRC_RSTATE_DATA));
 
-       opt_recv = &dp->dccps_options_received;
+       opt_recv = &dccp_sk(sk)->dccps_options_received;
 
        switch (DCCP_SKB_CB(skb)->dccpd_type) {
        case DCCP_PKT_ACK:
@@ -967,10 +972,24 @@ static void ccid3_hc_rx_packet_recv(struct sock *sk, struct sk_buff *skb)
                if (opt_recv->dccpor_timestamp_echo == 0)
                        break;
                p_prev = hcrx->ccid3hcrx_rtt;
-               do_gettimeofday(&now);
-               hcrx->ccid3hcrx_rtt = timeval_usecs(&now) -
-                                    (opt_recv->dccpor_timestamp_echo -
-                                     opt_recv->dccpor_elapsed_time) * 10;
+               dccp_timestamp(sk, &now);
+               timeval_sub_usecs(&now, opt_recv->dccpor_timestamp_echo * 10);
+               r_sample = timeval_usecs(&now);
+               t_elapsed = opt_recv->dccpor_elapsed_time * 10;
+
+               if (unlikely(r_sample <= t_elapsed))
+                       LIMIT_NETDEBUG(KERN_WARNING
+                                      "%s: r_sample=%uus, t_elapsed=%uus\n",
+                                      __FUNCTION__, r_sample, t_elapsed);
+               else
+                       r_sample -= t_elapsed;
+
+               if (hcrx->ccid3hcrx_state == TFRC_RSTATE_NO_DATA)
+                       hcrx->ccid3hcrx_rtt = r_sample;
+               else
+                       hcrx->ccid3hcrx_rtt = (hcrx->ccid3hcrx_rtt * 9) / 10 +
+                                             r_sample / 10;
+
                if (p_prev != hcrx->ccid3hcrx_rtt)
                        ccid3_pr_debug("%s, New RTT=%luus, elapsed time=%u\n",
                                       dccp_role(sk), hcrx->ccid3hcrx_rtt,
@@ -985,7 +1004,7 @@ static void ccid3_hc_rx_packet_recv(struct sock *sk, struct sk_buff *skb)
                return;
        }
 
-       packet = dccp_rx_hist_entry_new(ccid3_rx_hist, opt_recv->dccpor_ndp,
+       packet = dccp_rx_hist_entry_new(ccid3_rx_hist, sk, opt_recv->dccpor_ndp,
                                        skb, SLAB_ATOMIC);
        if (packet == NULL) {
                ccid3_pr_debug("%s, sk=%p, Not enough mem to add rx packet "
@@ -1017,7 +1036,7 @@ static void ccid3_hc_rx_packet_recv(struct sock *sk, struct sk_buff *skb)
                if (ins != 0)
                        break;
 
-               do_gettimeofday(&now);
+               dccp_timestamp(sk, &now);
                if (timeval_delta(&now, &hcrx->ccid3hcrx_tstamp_last_ack) >=
                    hcrx->ccid3hcrx_rtt) {
                        hcrx->ccid3hcrx_tstamp_last_ack = now;
@@ -1056,11 +1075,11 @@ static int ccid3_hc_rx_init(struct sock *sk)
 
        ccid3_pr_debug("%s, sk=%p\n", dccp_role(sk), sk);
 
-       hcrx = dp->dccps_hc_rx_ccid_private = kmalloc(sizeof(*hcrx),
-                                                     gfp_any());
-       if (hcrx == NULL)
+       dp->dccps_hc_rx_ccid_private = kmalloc(sizeof(*hcrx), gfp_any());
+       if (dp->dccps_hc_rx_ccid_private == NULL)
                return -ENOMEM;
 
+       hcrx = ccid3_hc_rx_sk(sk);
        memset(hcrx, 0, sizeof(*hcrx));
 
        if (dp->dccps_packet_size >= TFRC_MIN_PACKET_SIZE &&
@@ -1072,18 +1091,16 @@ static int ccid3_hc_rx_init(struct sock *sk)
        hcrx->ccid3hcrx_state = TFRC_RSTATE_NO_DATA;
        INIT_LIST_HEAD(&hcrx->ccid3hcrx_hist);
        INIT_LIST_HEAD(&hcrx->ccid3hcrx_li_hist);
-       /*
-        * XXX this seems to be paranoid, need to think more about this, for
-        * now start with something different than zero. -acme
-        */
-       hcrx->ccid3hcrx_rtt = USEC_PER_SEC / 5;
+       dccp_timestamp(sk, &hcrx->ccid3hcrx_tstamp_last_ack);
+       hcrx->ccid3hcrx_tstamp_last_feedback = hcrx->ccid3hcrx_tstamp_last_ack;
+       hcrx->ccid3hcrx_rtt = 5000; /* XXX 5ms for now... */
        return 0;
 }
 
 static void ccid3_hc_rx_exit(struct sock *sk)
 {
+       struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk);
        struct dccp_sock *dp = dccp_sk(sk);
-       struct ccid3_hc_rx_sock *hcrx = dp->dccps_hc_rx_ccid_private;
 
        ccid3_pr_debug("%s, sk=%p\n", dccp_role(sk), sk);
 
@@ -1104,8 +1121,7 @@ static void ccid3_hc_rx_exit(struct sock *sk)
 
 static void ccid3_hc_rx_get_info(struct sock *sk, struct tcp_info *info)
 {
-       const struct dccp_sock *dp = dccp_sk(sk);
-       const struct ccid3_hc_rx_sock *hcrx = dp->dccps_hc_rx_ccid_private;
+       const struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk);
 
        if (hcrx == NULL)
                return;
@@ -1117,8 +1133,7 @@ static void ccid3_hc_rx_get_info(struct sock *sk, struct tcp_info *info)
 
 static void ccid3_hc_tx_get_info(struct sock *sk, struct tcp_info *info)
 {
-       const struct dccp_sock *dp = dccp_sk(sk);
-       const struct ccid3_hc_tx_sock *hctx = dp->dccps_hc_tx_ccid_private;
+       const struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk);
 
        if (hctx == NULL)
                return;
index ee8cbac..d16f00d 100644 (file)
@@ -115,7 +115,7 @@ struct ccid3_hc_rx_sock {
        u64                     ccid3hcrx_seqno_last_counter:48,
                                ccid3hcrx_state:8,
                                ccid3hcrx_last_counter:4;
-       unsigned long           ccid3hcrx_rtt;
+       u32                     ccid3hcrx_rtt;
        u32                     ccid3hcrx_p;
        u32                     ccid3hcrx_bytes_recv;
        struct timeval          ccid3hcrx_tstamp_last_feedback;
@@ -128,10 +128,14 @@ struct ccid3_hc_rx_sock {
        u32                     ccid3hcrx_x_recv;
 };
 
-#define ccid3_hc_tx_field(s,field) (s->dccps_hc_tx_ccid_private == NULL ? 0 : \
-    ((struct ccid3_hc_tx_sock *)s->dccps_hc_tx_ccid_private)->ccid3hctx_##field)
+static inline struct ccid3_hc_tx_sock *ccid3_hc_tx_sk(const struct sock *sk)
+{
+    return dccp_sk(sk)->dccps_hc_tx_ccid_private;
+}
 
-#define ccid3_hc_rx_field(s,field) (s->dccps_hc_rx_ccid_private == NULL ? 0 : \
-    ((struct ccid3_hc_rx_sock *)s->dccps_hc_rx_ccid_private)->ccid3hcrx_##field)
+static inline struct ccid3_hc_rx_sock *ccid3_hc_rx_sk(const struct sock *sk)
+{
+    return dccp_sk(sk)->dccps_hc_rx_ccid_private;
+}
 
 #endif /* _DCCP_CCID3_H_ */
index fb90a91..b375ebd 100644 (file)
@@ -134,6 +134,7 @@ static inline struct dccp_tx_hist_entry *
 
 static inline struct dccp_rx_hist_entry *
                     dccp_rx_hist_entry_new(struct dccp_rx_hist *hist,
+                                           const struct sock *sk, 
                                            const u32 ndp, 
                                            const struct sk_buff *skb,
                                            const unsigned int __nocast prio)
@@ -148,7 +149,7 @@ static inline struct dccp_rx_hist_entry *
                entry->dccphrx_ccval = dh->dccph_ccval;
                entry->dccphrx_type  = dh->dccph_type;
                entry->dccphrx_ndp   = ndp;
-               do_gettimeofday(&(entry->dccphrx_tstamp));
+               dccp_timestamp(sk, &entry->dccphrx_tstamp);
        }
 
        return entry;
index 33456c0..95c4630 100644 (file)
@@ -426,10 +426,13 @@ extern struct dccp_ackpkts *
                dccp_ackpkts_alloc(unsigned int len,
                                  const unsigned int __nocast priority);
 extern void dccp_ackpkts_free(struct dccp_ackpkts *ap);
-extern int dccp_ackpkts_add(struct dccp_ackpkts *ap, u64 ackno, u8 state);
+extern int dccp_ackpkts_add(struct dccp_ackpkts *ap, const struct sock *sk,
+                           u64 ackno, u8 state);
 extern void dccp_ackpkts_check_rcv_ackno(struct dccp_ackpkts *ap,
                                         struct sock *sk, u64 ackno);
 
+extern void dccp_timestamp(const struct sock *sk, struct timeval *tv);
+
 static inline suseconds_t timeval_usecs(const struct timeval *tv)
 {
        return tv->tv_sec * USEC_PER_SEC + tv->tv_usec;
@@ -468,17 +471,6 @@ static inline void timeval_sub_usecs(struct timeval *tv,
        }
 }
 
-/*
- * Returns the difference in usecs between timeval
- * passed in and current time
- */
-static inline suseconds_t timeval_now_delta(const struct timeval *tv)
-{
-       struct timeval now;
-       do_gettimeofday(&now);
-       return timeval_delta(&now, tv);
-}
-
 #ifdef CONFIG_IP_DCCP_DEBUG
 extern void dccp_ackvector_print(const u64 ackno,
                                 const unsigned char *vector, int len);
index ef29cef..c60bc34 100644 (file)
@@ -170,7 +170,7 @@ int dccp_rcv_established(struct sock *sk, struct sk_buff *skb,
        if (dp->dccps_options.dccpo_send_ack_vector) {
                struct dccp_ackpkts *ap = dp->dccps_hc_rx_ackpkts;
 
-               if (dccp_ackpkts_add(dp->dccps_hc_rx_ackpkts,
+               if (dccp_ackpkts_add(dp->dccps_hc_rx_ackpkts, sk,
                                     DCCP_SKB_CB(skb)->dccpd_seq,
                                     DCCP_ACKPKTS_STATE_RECEIVED)) {
                        LIMIT_NETDEBUG(KERN_WARNING "DCCP: acknowledgeable "
@@ -498,7 +498,7 @@ int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
                 * DCCP_ACKPKTS_STATE_ECN_MARKED
                 */
                if (dp->dccps_options.dccpo_send_ack_vector) {
-                       if (dccp_ackpkts_add(dp->dccps_hc_rx_ackpkts,
+                       if (dccp_ackpkts_add(dp->dccps_hc_rx_ackpkts, sk,
                                             DCCP_SKB_CB(skb)->dccpd_seq,
                                             DCCP_ACKPKTS_STATE_RECEIVED))
                                goto discard;
index 3fc75db..fee9a8c 100644 (file)
@@ -1243,6 +1243,7 @@ static int dccp_v4_init_sock(struct sock *sk)
        static int dccp_ctl_socket_init = 1;
 
        dccp_options_init(&dp->dccps_options);
+       do_gettimeofday(&dp->dccps_epoch);
 
        if (dp->dccps_options.dccpo_send_ack_vector) {
                dp->dccps_hc_rx_ackpkts =
index ce5dff4..18461bc 100644 (file)
@@ -96,6 +96,7 @@ struct sock *dccp_create_openreq_child(struct sock *sk,
                newdp->dccps_hc_rx_ackpkts = NULL;
                newdp->dccps_role = DCCP_ROLE_SERVER;
                newicsk->icsk_rto = DCCP_TIMEOUT_INIT;
+               do_gettimeofday(&newdp->dccps_epoch);
 
                if (newdp->dccps_options.dccpo_send_ack_vector) {
                        newdp->dccps_hc_rx_ackpkts =
index 382c589..d4c4242 100644 (file)
@@ -72,6 +72,7 @@ int dccp_parse_options(struct sock *sk, struct sk_buff *skb)
        struct dccp_options_received *opt_recv = &dp->dccps_options_received;
        unsigned char opt, len;
        unsigned char *value;
+       u32 elapsed_time;
 
        memset(opt_recv, 0, sizeof(*opt_recv));
 
@@ -139,7 +140,7 @@ int dccp_parse_options(struct sock *sk, struct sk_buff *skb)
                        opt_recv->dccpor_timestamp = ntohl(*(u32 *)value);
 
                        dp->dccps_timestamp_echo = opt_recv->dccpor_timestamp;
-                       do_gettimeofday(&dp->dccps_timestamp_time);
+                       dccp_timestamp(sk, &dp->dccps_timestamp_time);
 
                        dccp_pr_debug("%sTIMESTAMP=%u, ackno=%llu\n",
                                      debug_prefix, opt_recv->dccpor_timestamp,
@@ -159,18 +160,18 @@ int dccp_parse_options(struct sock *sk, struct sk_buff *skb)
                                      (unsigned long long)
                                      DCCP_SKB_CB(skb)->dccpd_ack_seq);
 
-                       if (len > 4) {
-                               if (len == 6)
-                                       opt_recv->dccpor_elapsed_time =
-                                                ntohs(*(u16 *)(value + 4));
-                               else
-                                       opt_recv->dccpor_elapsed_time =
-                                                ntohl(*(u32 *)(value + 4));
 
-                               dccp_pr_debug("%sTIMESTAMP_ECHO ELAPSED_TIME=%d\n",
-                                     debug_prefix,
-                                     opt_recv->dccpor_elapsed_time);
-                       }
+                       if (len == 4)
+                               break;
+
+                       if (len == 6)
+                               elapsed_time = ntohs(*(u16 *)(value + 4));
+                       else
+                               elapsed_time = ntohl(*(u32 *)(value + 4));
+
+                       /* Give precedence to the biggest ELAPSED_TIME */
+                       if (elapsed_time > opt_recv->dccpor_elapsed_time)
+                               opt_recv->dccpor_elapsed_time = elapsed_time;
                        break;
                case DCCPO_ELAPSED_TIME:
                        if (len != 2 && len != 4)
@@ -180,14 +181,15 @@ int dccp_parse_options(struct sock *sk, struct sk_buff *skb)
                                continue;
 
                        if (len == 2)
-                               opt_recv->dccpor_elapsed_time =
-                                                       ntohs(*(u16 *)value);
+                               elapsed_time = ntohs(*(u16 *)value);
                        else
-                               opt_recv->dccpor_elapsed_time =
-                                                       ntohl(*(u32 *)value);
+                               elapsed_time = ntohl(*(u32 *)value);
+
+                       if (elapsed_time > opt_recv->dccpor_elapsed_time)
+                               opt_recv->dccpor_elapsed_time = elapsed_time;
 
                        dccp_pr_debug("%sELAPSED_TIME=%d\n", debug_prefix,
-                                     opt_recv->dccpor_elapsed_time);
+                                     elapsed_time);
                        break;
                        /*
                         * From draft-ietf-dccp-spec-11.txt:
@@ -359,9 +361,13 @@ static void dccp_insert_option_ack_vector(struct sock *sk, struct sk_buff *skb)
 #endif
        struct dccp_ackpkts *ap = dp->dccps_hc_rx_ackpkts;
        int len = ap->dccpap_buf_vector_len + 2;
-       const u32 elapsed_time = timeval_now_delta(&ap->dccpap_time) / 10;
+       struct timeval now;
+       u32 elapsed_time;
        unsigned char *to, *from;
 
+       dccp_timestamp(sk, &now);
+       elapsed_time = timeval_delta(&now, &ap->dccpap_time) / 10;
+
        if (elapsed_time != 0)
                dccp_insert_option_elapsed_time(sk, skb, elapsed_time);
 
@@ -426,13 +432,29 @@ static void dccp_insert_option_ack_vector(struct sock *sk, struct sk_buff *skb)
                      (unsigned long long) ap->dccpap_ack_ackno);
 }
 
+void dccp_timestamp(const struct sock *sk, struct timeval *tv)
+{
+       const struct dccp_sock *dp = dccp_sk(sk);
+
+       do_gettimeofday(tv);
+       tv->tv_sec  -= dp->dccps_epoch.tv_sec;
+       tv->tv_usec -= dp->dccps_epoch.tv_usec;
+
+       while (tv->tv_usec < 0) {
+               tv->tv_sec--;
+               tv->tv_usec += USEC_PER_SEC;
+       }
+}
+
+EXPORT_SYMBOL_GPL(dccp_timestamp);
+
 void dccp_insert_option_timestamp(struct sock *sk, struct sk_buff *skb)
 {
        struct timeval tv;
        u32 now;
        
-       do_gettimeofday(&tv);
-       now = (tv.tv_sec * USEC_PER_SEC + tv.tv_usec) / 10;
+       dccp_timestamp(sk, &tv);
+       now = timeval_usecs(&tv) / 10;
        /* yes this will overflow but that is the point as we want a
         * 10 usec 32 bit timer which mean it wraps every 11.9 hours */
 
@@ -450,13 +472,17 @@ static void dccp_insert_option_timestamp_echo(struct sock *sk,
        const char *debug_prefix = dp->dccps_role == DCCP_ROLE_CLIENT ?
                                        "CLIENT TX opt: " : "server TX opt: ";
 #endif
+       struct timeval now;
        u32 tstamp_echo;
-       const u32 elapsed_time =
-                       timeval_now_delta(&dp->dccps_timestamp_time) / 10;
-       const int elapsed_time_len = dccp_elapsed_time_len(elapsed_time);
-       const int len = 6 + elapsed_time_len;
+       u32 elapsed_time;
+       int len, elapsed_time_len;
        unsigned char *to;
 
+       dccp_timestamp(sk, &now);
+       elapsed_time = timeval_delta(&now, &dp->dccps_timestamp_time) / 10;
+       elapsed_time_len = dccp_elapsed_time_len(elapsed_time);
+       len = 6 + elapsed_time_len;
+
        if (DCCP_SKB_CB(skb)->dccpd_opt_len + len > DCCP_MAX_OPT_LEN) {
                LIMIT_NETDEBUG(KERN_INFO "DCCP: packet too small to insert "
                                         "timestamp echo!\n");
@@ -505,13 +531,18 @@ void dccp_insert_options(struct sock *sk, struct sk_buff *skb)
                    (dp->dccps_hc_rx_ackpkts->dccpap_buf_ackno !=
                     DCCP_MAX_SEQNO + 1))
                        dccp_insert_option_ack_vector(sk, skb);
-
                if (dp->dccps_timestamp_echo != 0)
                        dccp_insert_option_timestamp_echo(sk, skb);
        }
 
-       ccid_hc_rx_insert_options(dp->dccps_hc_rx_ccid, sk, skb);
-       ccid_hc_tx_insert_options(dp->dccps_hc_tx_ccid, sk, skb);
+       if (dp->dccps_hc_rx_insert_options) {
+               ccid_hc_rx_insert_options(dp->dccps_hc_rx_ccid, sk, skb);
+               dp->dccps_hc_rx_insert_options = 0;
+       }
+       if (dp->dccps_hc_tx_insert_options) {
+               ccid_hc_tx_insert_options(dp->dccps_hc_tx_ccid, sk, skb);
+               dp->dccps_hc_tx_insert_options = 0;
+       }
 
        /* XXX: insert other options when appropriate */
 
@@ -616,7 +647,8 @@ static inline int dccp_ackpkts_set_buf_head_state(struct dccp_ackpkts *ap,
 /*
  * Implements the draft-ietf-dccp-spec-11.txt Appendix A
  */
-int dccp_ackpkts_add(struct dccp_ackpkts *ap, u64 ackno, u8 state)
+int dccp_ackpkts_add(struct dccp_ackpkts *ap, const struct sock *sk,
+                    u64 ackno, u8 state)
 {
        /*
         * Check at the right places if the buffer is full, if it is, tell the
@@ -697,7 +729,7 @@ int dccp_ackpkts_add(struct dccp_ackpkts *ap, u64 ackno, u8 state)
        }
 
        ap->dccpap_buf_ackno = ackno;
-       do_gettimeofday(&ap->dccpap_time);
+       dccp_timestamp(sk, &ap->dccpap_time);
 out:
        dccp_pr_debug("");
        dccp_ackpkts_print(ap);
index bf147f8..a9d84f9 100644 (file)
@@ -1248,11 +1248,6 @@ module_init(inet_init);
 /* ------------------------------------------------------------------------ */
 
 #ifdef CONFIG_PROC_FS
-#ifdef CONFIG_IP_FIB_TRIE
-extern int  fib_stat_proc_init(void);
-extern void fib_stat_proc_exit(void);
-#endif
-
 static int __init ipv4_proc_init(void)
 {
        int rc = 0;
@@ -1265,19 +1260,11 @@ static int __init ipv4_proc_init(void)
                goto out_udp;
        if (fib_proc_init())
                goto out_fib;
-#ifdef CONFIG_IP_FIB_TRIE
-         if (fib_stat_proc_init())
-                 goto out_fib_stat;
-#endif
        if (ip_misc_proc_init())
                goto out_misc;
 out:
        return rc;
 out_misc:
-#ifdef CONFIG_IP_FIB_TRIE
-       fib_stat_proc_exit();
-out_fib_stat:
-#endif
        fib_proc_exit();
 out_fib:
        udp4_proc_exit();
index b2dea4e..1b63b48 100644 (file)
@@ -43,7 +43,7 @@
  *             2 of the License, or (at your option) any later version.
  */
 
-#define VERSION "0.402"
+#define VERSION "0.403"
 
 #include <linux/config.h>
 #include <asm/uaccess.h>
@@ -164,7 +164,6 @@ static struct node *resize(struct trie *t, struct tnode *tn);
 static struct tnode *inflate(struct trie *t, struct tnode *tn);
 static struct tnode *halve(struct trie *t, struct tnode *tn);
 static void tnode_free(struct tnode *tn);
-static void trie_dump_seq(struct seq_file *seq, struct trie *t);
 
 static kmem_cache_t *fn_alias_kmem __read_mostly;
 static struct trie *trie_local = NULL, *trie_main = NULL;
@@ -1971,558 +1970,525 @@ struct fib_table * __init fib_hash_init(int id)
        return tb;
 }
 
-/* Trie dump functions */
+#ifdef CONFIG_PROC_FS
+/* Depth first Trie walk iterator */
+struct fib_trie_iter {
+       struct tnode *tnode;
+       struct trie *trie;
+       unsigned index;
+       unsigned depth;
+};
 
-static void putspace_seq(struct seq_file *seq, int n)
+static struct node *fib_trie_get_next(struct fib_trie_iter *iter)
 {
-       while (n--)
-               seq_printf(seq, " ");
-}
+       struct tnode *tn = iter->tnode;
+       unsigned cindex = iter->index;
+       struct tnode *p;
 
-static void printbin_seq(struct seq_file *seq, unsigned int v, int bits)
-{
-       while (bits--)
-               seq_printf(seq, "%s", (v & (1<<bits))?"1":"0");
-}
+       pr_debug("get_next iter={node=%p index=%d depth=%d}\n",
+                iter->tnode, iter->index, iter->depth);
+rescan:
+       while (cindex < (1<<tn->bits)) {
+               struct node *n = tnode_get_child(tn, cindex);
 
-static void printnode_seq(struct seq_file *seq, int indent, struct node *n,
-                  int pend, int cindex, int bits)
-{
-       putspace_seq(seq, indent);
-       if (IS_LEAF(n))
-               seq_printf(seq, "|");
-       else
-               seq_printf(seq, "+");
-       if (bits) {
-               seq_printf(seq, "%d/", cindex);
-               printbin_seq(seq, cindex, bits);
-               seq_printf(seq, ": ");
-       } else
-               seq_printf(seq, "<root>: ");
-       seq_printf(seq, "%s:%p ", IS_LEAF(n)?"Leaf":"Internal node", n);
+               if (n) {
+                       if (IS_LEAF(n)) {
+                               iter->tnode = tn;
+                               iter->index = cindex + 1;
+                       } else {
+                               /* push down one level */
+                               iter->tnode = (struct tnode *) n;
+                               iter->index = 0;
+                               ++iter->depth;
+                       }
+                       return n;
+               }
 
-       if (IS_LEAF(n)) {
-               struct leaf *l = (struct leaf *)n;
-               struct fib_alias *fa;
-               int i;
+               ++cindex;
+       }
 
-               seq_printf(seq, "key=%d.%d.%d.%d\n",
-                          n->key >> 24, (n->key >> 16) % 256, (n->key >> 8) % 256, n->key % 256);
-
-               for (i = 32; i >= 0; i--)
-                       if (find_leaf_info(&l->list, i)) {
-                               struct list_head *fa_head = get_fa_head(l, i);
-
-                               if (!fa_head)
-                                       continue;
-
-                               if (list_empty(fa_head))
-                                       continue;
-
-                               putspace_seq(seq, indent+2);
-                               seq_printf(seq, "{/%d...dumping}\n", i);
-
-                               list_for_each_entry_rcu(fa, fa_head, fa_list) {
-                                       putspace_seq(seq, indent+2);
-                                       if (fa->fa_info == NULL) {
-                                               seq_printf(seq, "Error fa_info=NULL\n");
-                                               continue;
-                                       }
-                                       if (fa->fa_info->fib_nh == NULL) {
-                                               seq_printf(seq, "Error _fib_nh=NULL\n");
-                                               continue;
-                                       }
-
-                                       seq_printf(seq, "{type=%d scope=%d TOS=%d}\n",
-                                             fa->fa_type,
-                                             fa->fa_scope,
-                                             fa->fa_tos);
-                               }
-                       }
-       } else {
-               struct tnode *tn = (struct tnode *)n;
-               int plen = ((struct tnode *)n)->pos;
-               t_key prf = MASK_PFX(n->key, plen);
-
-               seq_printf(seq, "key=%d.%d.%d.%d/%d\n",
-                          prf >> 24, (prf >> 16) % 256, (prf >> 8) % 256, prf % 256, plen);
-
-               putspace_seq(seq, indent); seq_printf(seq, "|    ");
-               seq_printf(seq, "{key prefix=%08x/", tn->key & TKEY_GET_MASK(0, tn->pos));
-               printbin_seq(seq, tkey_extract_bits(tn->key, 0, tn->pos), tn->pos);
-               seq_printf(seq, "}\n");
-               putspace_seq(seq, indent); seq_printf(seq, "|    ");
-               seq_printf(seq, "{pos=%d", tn->pos);
-               seq_printf(seq, " (skip=%d bits)", tn->pos - pend);
-               seq_printf(seq, " bits=%d (%u children)}\n", tn->bits, (1 << tn->bits));
-               putspace_seq(seq, indent); seq_printf(seq, "|    ");
-               seq_printf(seq, "{empty=%d full=%d}\n", tn->empty_children, tn->full_children);
+       /* Current node exhausted, pop back up */
+       p = NODE_PARENT(tn);
+       if (p) {
+               cindex = tkey_extract_bits(tn->key, p->pos, p->bits)+1;
+               tn = p;
+               --iter->depth;
+               goto rescan;
        }
+
+       /* got root? */
+       return NULL;
 }
 
-static void trie_dump_seq(struct seq_file *seq, struct trie *t)
+static struct node *fib_trie_get_first(struct fib_trie_iter *iter,
+                                      struct trie *t)
 {
-       struct node *n;
-       int cindex = 0;
-       int indent = 1;
-       int pend = 0;
-       int depth = 0;
-       struct tnode *tn;
-
-       rcu_read_lock();
-       n = rcu_dereference(t->trie);
-       seq_printf(seq, "------ trie_dump of t=%p ------\n", t);
+       struct node *n = rcu_dereference(t->trie);
 
-       if (!n) {
-               seq_printf(seq, "------ trie is empty\n");
-
-               rcu_read_unlock();
-               return;
+       if (n && IS_TNODE(n)) {
+               iter->tnode = (struct tnode *) n;
+               iter->trie = t;
+               iter->index = 0;
+               iter->depth = 0;
+               return n;
        }
+       return NULL;
+}
 
-       printnode_seq(seq, indent, n, pend, cindex, 0);
-
-       if (!IS_TNODE(n)) {
-               rcu_read_unlock();
-               return;
-       }
-
-       tn = (struct tnode *)n;
-       pend = tn->pos+tn->bits;
-       putspace_seq(seq, indent); seq_printf(seq, "\\--\n");
-       indent += 3;
-       depth++;
-
-       while (tn && cindex < (1 << tn->bits)) {
-               struct node *child = rcu_dereference(tn->child[cindex]);
-               if (!child)
-                       cindex++;
-               else {
-                       /* Got a child */
-                       printnode_seq(seq, indent, child, pend,
-                                     cindex, tn->bits);
-
-                       if (IS_LEAF(child))
-                               cindex++;
-
-                       else {
-                               /*
-                                * New tnode. Decend one level
-                                */
-
-                               depth++;
-                               n = child;
-                               tn = (struct tnode *)n;
-                               pend = tn->pos+tn->bits;
-                               putspace_seq(seq, indent);
-                               seq_printf(seq, "\\--\n");
-                               indent += 3;
-                               cindex = 0;
-                       }
-               }
-
-               /*
-                * Test if we are done
-                */
-
-               while (cindex >= (1 << tn->bits)) {
-                       /*
-                        * Move upwards and test for root
-                        * pop off all traversed  nodes
-                        */
+static void trie_collect_stats(struct trie *t, struct trie_stat *s)
+{
+       struct node *n;
+       struct fib_trie_iter iter;
 
-                       if (NODE_PARENT(tn) == NULL) {
-                               tn = NULL;
-                               break;
-                       }
+       memset(s, 0, sizeof(*s));
 
-                       cindex = tkey_extract_bits(tn->key, NODE_PARENT(tn)->pos, NODE_PARENT(tn)->bits);
-                       cindex++;
-                       tn = NODE_PARENT(tn);
-                       pend = tn->pos + tn->bits;
-                       indent -= 3;
-                       depth--;
+       rcu_read_lock();
+       for (n = fib_trie_get_first(&iter, t); n;
+            n = fib_trie_get_next(&iter)) {
+               if (IS_LEAF(n)) {
+                       s->leaves++;
+                       s->totdepth += iter.depth;
+                       if (iter.depth > s->maxdepth)
+                               s->maxdepth = iter.depth;
+               } else {
+                       const struct tnode *tn = (const struct tnode *) n;
+                       int i;
+
+                       s->tnodes++;
+                       s->nodesizes[tn->bits]++;
+                       for (i = 0; i < (1<<tn->bits); i++)
+                               if (!tn->child[i])
+                                       s->nullpointers++;
                }
        }
        rcu_read_unlock();
 }
 
-static struct trie_stat *trie_stat_new(void)
+/*
+ *     This outputs /proc/net/fib_triestats
+ */
+static void trie_show_stats(struct seq_file *seq, struct trie_stat *stat)
 {
-       struct trie_stat *s;
-       int i;
+       unsigned i, max, pointers, bytes, avdepth;
 
-       s = kmalloc(sizeof(struct trie_stat), GFP_KERNEL);
-       if (!s)
-               return NULL;
+       if (stat->leaves)
+               avdepth = stat->totdepth*100 / stat->leaves;
+       else
+               avdepth = 0;
 
-       s->totdepth = 0;
-       s->maxdepth = 0;
-       s->tnodes = 0;
-       s->leaves = 0;
-       s->nullpointers = 0;
+       seq_printf(seq, "\tAver depth:     %d.%02d\n", avdepth / 100, avdepth % 100 );
+       seq_printf(seq, "\tMax depth:      %u\n", stat->maxdepth);
 
-       for (i = 0; i < MAX_CHILDS; i++)
-               s->nodesizes[i] = 0;
+       seq_printf(seq, "\tLeaves:         %u\n", stat->leaves);
 
-       return s;
-}
+       bytes = sizeof(struct leaf) * stat->leaves;
+       seq_printf(seq, "\tInternal nodes: %d\n\t", stat->tnodes);
+       bytes += sizeof(struct tnode) * stat->tnodes;
 
-static struct trie_stat *trie_collect_stats(struct trie *t)
-{
-       struct node *n;
-       struct trie_stat *s = trie_stat_new();
-       int cindex = 0;
-       int pend = 0;
-       int depth = 0;
+       max = MAX_CHILDS-1;
+       while (max > 0 && stat->nodesizes[max] == 0) /* max is unsigned: ">= 0" would never stop */
+               max--;
 
-       if (!s)
-               return NULL;
+       pointers = 0;
+       for (i = 1; i <= max; i++)
+               if (stat->nodesizes[i] != 0) {
+                       seq_printf(seq, "  %d: %d",  i, stat->nodesizes[i]);
+                       pointers += (1<<i) * stat->nodesizes[i];
+               }
+       seq_putc(seq, '\n');
+       seq_printf(seq, "\tPointers: %d\n", pointers);
 
-       rcu_read_lock();
-       n = rcu_dereference(t->trie);
+       bytes += sizeof(struct node *) * pointers;
+       seq_printf(seq, "Null ptrs: %d\n", stat->nullpointers);
+       seq_printf(seq, "Total size: %d  kB\n", (bytes + 1023) / 1024);
 
-       if (!n)
-               return s;
+#ifdef CONFIG_IP_FIB_TRIE_STATS /* NOTE(review): no 't' is declared in this function - confirm this section builds */
+       seq_printf(seq, "Counters:\n---------\n");
+       seq_printf(seq,"gets = %d\n", t->stats.gets);
+       seq_printf(seq,"backtracks = %d\n", t->stats.backtrack);
+       seq_printf(seq,"semantic match passed = %d\n", t->stats.semantic_match_passed);
+       seq_printf(seq,"semantic match miss = %d\n", t->stats.semantic_match_miss);
+       seq_printf(seq,"null node hit= %d\n", t->stats.null_node_hit);
+       seq_printf(seq,"skipped node resize = %d\n", t->stats.resize_node_skipped);
+#ifdef CLEAR_STATS
+       memset(&(t->stats), 0, sizeof(t->stats));
+#endif
+#endif /*  CONFIG_IP_FIB_TRIE_STATS */
+}
 
-       if (IS_TNODE(n)) {
-               struct tnode *tn = (struct tnode *)n;
-               pend = tn->pos+tn->bits;
-               s->nodesizes[tn->bits]++;
-               depth++;
-
-               while (tn && cindex < (1 << tn->bits)) {
-                       struct node *ch = rcu_dereference(tn->child[cindex]);
-                       if (ch) {
-
-                               /* Got a child */
-
-                               if (IS_LEAF(tn->child[cindex])) {
-                                       cindex++;
-
-                                       /* stats */
-                                       if (depth > s->maxdepth)
-                                               s->maxdepth = depth;
-                                       s->totdepth += depth;
-                                       s->leaves++;
-                               } else {
-                                       /*
-                                        * New tnode. Decend one level
-                                        */
-
-                                       s->tnodes++;
-                                       s->nodesizes[tn->bits]++;
-                                       depth++;
-
-                                       n = ch;
-                                       tn = (struct tnode *)n;
-                                       pend = tn->pos+tn->bits;
-
-                                       cindex = 0;
-                               }
-                       } else {
-                               cindex++;
-                               s->nullpointers++;
-                       }
+static int fib_triestat_seq_show(struct seq_file *seq, void *v)
+{
+       struct trie_stat *stat;
 
-                       /*
-                        * Test if we are done
-                        */
+       stat = kmalloc(sizeof(*stat), GFP_KERNEL);
+       if (!stat)
+               return -ENOMEM;
 
-                       while (cindex >= (1 << tn->bits)) {
-                               /*
-                                * Move upwards and test for root
-                                * pop off all traversed  nodes
-                                */
+       seq_printf(seq, "Basic info: size of leaf: %Zd bytes, size of tnode: %Zd bytes.\n",
+                  sizeof(struct leaf), sizeof(struct tnode));
 
-                               if (NODE_PARENT(tn) == NULL) {
-                                       tn = NULL;
-                                       n = NULL;
-                                       break;
-                               }
+       if (trie_local) {
+               seq_printf(seq, "Local:\n");
+               trie_collect_stats(trie_local, stat);
+               trie_show_stats(seq, stat);
+       }
 
-                               cindex = tkey_extract_bits(tn->key, NODE_PARENT(tn)->pos, NODE_PARENT(tn)->bits);
-                               tn = NODE_PARENT(tn);
-                               cindex++;
-                               n = (struct node *)tn;
-                               pend = tn->pos+tn->bits;
-                               depth--;
-                       }
-               }
+       if (trie_main) {
+               seq_printf(seq, "Main:\n");
+               trie_collect_stats(trie_main, stat);
+               trie_show_stats(seq, stat);
        }
+       kfree(stat);
 
-       rcu_read_unlock();
-       return s;
+       return 0;
 }
 
-#ifdef CONFIG_PROC_FS
-
-static struct fib_alias *fib_triestat_get_first(struct seq_file *seq)
+static int fib_triestat_seq_open(struct inode *inode, struct file *file)
 {
-       return NULL;
+       return single_open(file, fib_triestat_seq_show, NULL);
 }
 
-static struct fib_alias *fib_triestat_get_next(struct seq_file *seq)
+static struct file_operations fib_triestat_fops = {
+       .owner  = THIS_MODULE,
+       .open   = fib_triestat_seq_open,
+       .read   = seq_read,
+       .llseek = seq_lseek,
+       .release = single_release,
+};
+
+static struct node *fib_trie_get_idx(struct fib_trie_iter *iter,
+                                     loff_t pos)
 {
+       loff_t idx = 0;
+       struct node *n;
+
+       for (n = fib_trie_get_first(iter, trie_local);
+            n; ++idx, n = fib_trie_get_next(iter)) {
+               if (pos == idx)
+                       return n;
+       }
+
+       for (n = fib_trie_get_first(iter, trie_main);
+            n; ++idx, n = fib_trie_get_next(iter)) {
+               if (pos == idx)
+                       return n;
+       }
        return NULL;
 }
 
-static void *fib_triestat_seq_start(struct seq_file *seq, loff_t *pos)
+static void *fib_trie_seq_start(struct seq_file *seq, loff_t *pos)
 {
-       if (!ip_fib_main_table)
-               return NULL;
-
-       if (*pos)
-               return fib_triestat_get_next(seq);
-       else
+       rcu_read_lock();
+       if (*pos == 0)
                return SEQ_START_TOKEN;
+       return fib_trie_get_idx(seq->private, *pos - 1);
 }
 
-static void *fib_triestat_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 {
+       struct fib_trie_iter *iter = seq->private;
+       void *l = v;
+
        ++*pos;
        if (v == SEQ_START_TOKEN)
-               return fib_triestat_get_first(seq);
-       else
-               return fib_triestat_get_next(seq);
-}
+               return fib_trie_get_idx(iter, 0);
 
-static void fib_triestat_seq_stop(struct seq_file *seq, void *v)
-{
+       v = fib_trie_get_next(iter);
+       BUG_ON(v == l);
+       if (v)
+               return v;
 
-}
+       /* continue scan in next trie */
+       if (iter->trie == trie_local)
+               return fib_trie_get_first(iter, trie_main);
 
-/*
- *     This outputs /proc/net/fib_triestats
- *
- *     It always works in backward compatibility mode.
- *     The format of the file is not supposed to be changed.
- */
+       return NULL;
+}
 
-static void collect_and_show(struct trie *t, struct seq_file *seq)
+static void fib_trie_seq_stop(struct seq_file *seq, void *v)
 {
-       int bytes = 0; /* How many bytes are used, a ref is 4 bytes */
-       int i, max, pointers;
-       struct trie_stat *stat;
-       int avdepth;
-
-       stat = trie_collect_stats(t);
-
-       bytes = 0;
-       seq_printf(seq, "trie=%p\n", t);
-
-       if (stat) {
-               if (stat->leaves)
-                       avdepth = stat->totdepth*100 / stat->leaves;
-               else
-                       avdepth = 0;
-               seq_printf(seq, "Aver depth: %d.%02d\n", avdepth / 100, avdepth % 100);
-               seq_printf(seq, "Max depth: %4d\n", stat->maxdepth);
+       rcu_read_unlock();
+}
 
-               seq_printf(seq, "Leaves: %d\n", stat->leaves);
-               bytes += sizeof(struct leaf) * stat->leaves;
-               seq_printf(seq, "Internal nodes: %d\n", stat->tnodes);
-               bytes += sizeof(struct tnode) * stat->tnodes;
+static void seq_indent(struct seq_file *seq, int n)
+{
+       while (n-- > 0) seq_puts(seq, "   ");
+}
 
-               max = MAX_CHILDS-1;
+static inline const char *rtn_scope(enum rt_scope_t s)
+{
+       static char buf[32];
 
-               while (max >= 0 && stat->nodesizes[max] == 0)
-                       max--;
-               pointers = 0;
+       switch(s) {
+       case RT_SCOPE_UNIVERSE: return "universe";
+       case RT_SCOPE_SITE:     return "site";
+       case RT_SCOPE_LINK:     return "link";
+       case RT_SCOPE_HOST:     return "host";
+       case RT_SCOPE_NOWHERE:  return "nowhere";
+       default:
+               snprintf(buf, sizeof(buf), "scope=%d", s);
+               return buf;
+       }
+}
 
-               for (i = 1; i <= max; i++)
-                       if (stat->nodesizes[i] != 0) {
-                               seq_printf(seq, "  %d: %d",  i, stat->nodesizes[i]);
-                               pointers += (1<<i) * stat->nodesizes[i];
-                       }
-               seq_printf(seq, "\n");
-               seq_printf(seq, "Pointers: %d\n", pointers);
-               bytes += sizeof(struct node *) * pointers;
-               seq_printf(seq, "Null ptrs: %d\n", stat->nullpointers);
-               seq_printf(seq, "Total size: %d  kB\n", bytes / 1024);
+static const char *rtn_type_names[__RTN_MAX] = {
+       [RTN_UNSPEC] = "UNSPEC",
+       [RTN_UNICAST] = "UNICAST",
+       [RTN_LOCAL] = "LOCAL",
+       [RTN_BROADCAST] = "BROADCAST",
+       [RTN_ANYCAST] = "ANYCAST",
+       [RTN_MULTICAST] = "MULTICAST",
+       [RTN_BLACKHOLE] = "BLACKHOLE",
+       [RTN_UNREACHABLE] = "UNREACHABLE",
+       [RTN_PROHIBIT] = "PROHIBIT",
+       [RTN_THROW] = "THROW",
+       [RTN_NAT] = "NAT",
+       [RTN_XRESOLVE] = "XRESOLVE",
+};
 
-               kfree(stat);
-       }
+static inline const char *rtn_type(unsigned t)
+{
+       static char buf[32];
 
-#ifdef CONFIG_IP_FIB_TRIE_STATS
-       seq_printf(seq, "Counters:\n---------\n");
-       seq_printf(seq,"gets = %d\n", t->stats.gets);
-       seq_printf(seq,"backtracks = %d\n", t->stats.backtrack);
-       seq_printf(seq,"semantic match passed = %d\n", t->stats.semantic_match_passed);
-       seq_printf(seq,"semantic match miss = %d\n", t->stats.semantic_match_miss);
-       seq_printf(seq,"null node hit= %d\n", t->stats.null_node_hit);
-       seq_printf(seq,"skipped node resize = %d\n", t->stats.resize_node_skipped);
-#ifdef CLEAR_STATS
-       memset(&(t->stats), 0, sizeof(t->stats));
-#endif
-#endif /*  CONFIG_IP_FIB_TRIE_STATS */
+       if (t < __RTN_MAX && rtn_type_names[t])
+               return rtn_type_names[t];
+       snprintf(buf, sizeof(buf), "type %d", t);
+       return buf;
 }
 
-static int fib_triestat_seq_show(struct seq_file *seq, void *v)
+/* Pretty print one trie node: "+--" line for a tnode, "|--" line for a leaf plus one line per alias */
+static int fib_trie_seq_show(struct seq_file *seq, void *v)
 {
-       char bf[128];
+       const struct fib_trie_iter *iter = seq->private;
+       struct node *n = v;
 
-       if (v == SEQ_START_TOKEN) {
-               seq_printf(seq, "Basic info: size of leaf: %Zd bytes, size of tnode: %Zd bytes.\n",
-                          sizeof(struct leaf), sizeof(struct tnode));
-               if (trie_local)
-                       collect_and_show(trie_local, seq);
+       if (v == SEQ_START_TOKEN)
+               return 0;
 
-               if (trie_main)
-                       collect_and_show(trie_main, seq);
-       } else {
-               snprintf(bf, sizeof(bf), "*\t%08X\t%08X", 200, 400);
+       if (IS_TNODE(n)) {
+               struct tnode *tn = (struct tnode *) n;
+               t_key prf = ntohl(MASK_PFX(tn->key, tn->pos));
 
-               seq_printf(seq, "%-127s\n", bf);
+               if (!NODE_PARENT(n)) {
+                       if (iter->trie == trie_local)
+                               seq_puts(seq, "<local>:\n");
+                       else
+                               seq_puts(seq, "<main>:\n");
+               } else {
+                       seq_indent(seq, iter->depth-1);
+                       seq_printf(seq, "  +-- %d.%d.%d.%d/%d\n",
+                                  NIPQUAD(prf), tn->pos);
+               }
+       } else {
+               struct leaf *l = (struct leaf *) n;
+               int i;
+               u32 val = ntohl(l->key);
+
+               seq_indent(seq, iter->depth);
+               seq_printf(seq, "  |-- %d.%d.%d.%d\n", NIPQUAD(val));
+               for (i = 32; i >= 0; i--) {
+                       struct leaf_info *li = find_leaf_info(&l->list, i);
+                       if (li) {
+                               struct fib_alias *fa;
+                               list_for_each_entry_rcu(fa, &li->falh, fa_list) {
+                                       seq_indent(seq, iter->depth+1);
+                                       seq_printf(seq, "  /%d %s %s", i,
+                                                  rtn_scope(fa->fa_scope),
+                                                  rtn_type(fa->fa_type));
+                                       if (fa->fa_tos)
+                                               seq_printf(seq, " tos=%d",
+                                                          fa->fa_tos);
+                                       seq_putc(seq, '\n');
+                               }
+                       }
+               }
        }
+
        return 0;
 }
 
-static struct seq_operations fib_triestat_seq_ops = {
-       .start = fib_triestat_seq_start,
-       .next  = fib_triestat_seq_next,
-       .stop  = fib_triestat_seq_stop,
-       .show  = fib_triestat_seq_show,
+static struct seq_operations fib_trie_seq_ops = {
+       .start  = fib_trie_seq_start,
+       .next   = fib_trie_seq_next,
+       .stop   = fib_trie_seq_stop,
+       .show   = fib_trie_seq_show,
 };
 
-static int fib_triestat_seq_open(struct inode *inode, struct file *file)
+static int fib_trie_seq_open(struct inode *inode, struct file *file)
 {
        struct seq_file *seq;
        int rc = -ENOMEM;
+       struct fib_trie_iter *s = kmalloc(sizeof(*s), GFP_KERNEL);
 
-       rc = seq_open(file, &fib_triestat_seq_ops);
+       if (!s)
+               goto out;
+
+       rc = seq_open(file, &fib_trie_seq_ops);
        if (rc)
                goto out_kfree;
 
-       seq = file->private_data;
+       seq          = file->private_data;
+       seq->private = s;
+       memset(s, 0, sizeof(*s));
 out:
        return rc;
 out_kfree:
+       kfree(s);
        goto out;
 }
 
-static struct file_operations fib_triestat_seq_fops = {
-       .owner  = THIS_MODULE,
-       .open   = fib_triestat_seq_open,
-       .read   = seq_read,
-       .llseek = seq_lseek,
+static struct file_operations fib_trie_fops = {
+       .owner  = THIS_MODULE,
+       .open   = fib_trie_seq_open,
+       .read   = seq_read,
+       .llseek = seq_lseek,
        .release = seq_release_private,
 };
 
-int __init fib_stat_proc_init(void)
-{
-       if (!proc_net_fops_create("fib_triestat", S_IRUGO, &fib_triestat_seq_fops))
-               return -ENOMEM;
-       return 0;
-}
-
-void __init fib_stat_proc_exit(void)
+static unsigned fib_flag_trans(int type, u32 mask, const struct fib_info *fi)
 {
-       proc_net_remove("fib_triestat");
-}
+       static unsigned type2flags[RTN_MAX + 1] = {
+               [7] = RTF_REJECT, [8] = RTF_REJECT,
+       };
+       unsigned flags = type2flags[type];
 
-static struct fib_alias *fib_trie_get_first(struct seq_file *seq)
-{
-       return NULL;
+       if (fi && fi->fib_nh->nh_gw)
+               flags |= RTF_GATEWAY;
+       if (mask == 0xFFFFFFFF)
+               flags |= RTF_HOST;
+       flags |= RTF_UP;
+       return flags;
 }
 
-static struct fib_alias *fib_trie_get_next(struct seq_file *seq)
+/*
+ *     This outputs /proc/net/route.
+ *     The format of the file is not supposed to be changed
+ *     and needs to be same as fib_hash output to avoid breaking
+ *     legacy utilities
+ */
+static int fib_route_seq_show(struct seq_file *seq, void *v)
 {
-       return NULL;
-}
+       struct leaf *l = v;
+       int i;
+       char bf[128];
 
-static void *fib_trie_seq_start(struct seq_file *seq, loff_t *pos)
-{
-       if (!ip_fib_main_table)
-               return NULL;
+       if (v == SEQ_START_TOKEN) {
+               seq_printf(seq, "%-127s\n", "Iface\tDestination\tGateway "
+                          "\tFlags\tRefCnt\tUse\tMetric\tMask\t\tMTU"
+                          "\tWindow\tIRTT");
+               return 0;
+       }
 
-       if (*pos)
-               return fib_trie_get_next(seq);
-       else
-               return SEQ_START_TOKEN;
-}
+       if (IS_TNODE(l))
+               return 0;
 
-static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos)
-{
-       ++*pos;
-       if (v == SEQ_START_TOKEN)
-               return fib_trie_get_first(seq);
-       else
-               return fib_trie_get_next(seq);
+       for (i=32; i>=0; i--) {
+               struct leaf_info *li = find_leaf_info(&l->list, i);
+               struct fib_alias *fa;
+               u32 mask, prefix;
 
-}
+               if (!li)
+                       continue;
 
-static void fib_trie_seq_stop(struct seq_file *seq, void *v)
-{
-}
+               mask = inet_make_mask(li->plen);
+               prefix = htonl(l->key);
 
-/*
- *     This outputs /proc/net/fib_trie.
- *
- *     It always works in backward compatibility mode.
- *     The format of the file is not supposed to be changed.
- */
+               list_for_each_entry_rcu(fa, &li->falh, fa_list) {
+                       const struct fib_info *fi = rcu_dereference(fa->fa_info);
+                       unsigned flags = fib_flag_trans(fa->fa_type, mask, fi);
 
-static int fib_trie_seq_show(struct seq_file *seq, void *v)
-{
-       char bf[128];
+                       if (fa->fa_type == RTN_BROADCAST
+                           || fa->fa_type == RTN_MULTICAST)
+                               continue;
 
-       if (v == SEQ_START_TOKEN) {
-               if (trie_local)
-                       trie_dump_seq(seq, trie_local);
+                       if (fi)
+                               snprintf(bf, sizeof(bf),
+                                        "%s\t%08X\t%08X\t%04X\t%d\t%u\t%d\t%08X\t%d\t%u\t%u",
+                                        fi->fib_dev ? fi->fib_dev->name : "*",
+                                        prefix,
+                                        fi->fib_nh->nh_gw, flags, 0, 0,
+                                        fi->fib_priority,
+                                        mask,
+                                        (fi->fib_advmss ? fi->fib_advmss + 40 : 0),
+                                        fi->fib_window,
+                                        fi->fib_rtt >> 3);
+                       else
+                               snprintf(bf, sizeof(bf),
+                                        "*\t%08X\t%08X\t%04X\t%d\t%u\t%d\t%08X\t%d\t%u\t%u",
+                                        prefix, 0, flags, 0, 0, 0,
+                                        mask, 0, 0, 0);
 
-               if (trie_main)
-                       trie_dump_seq(seq, trie_main);
-       } else {
-               snprintf(bf, sizeof(bf),
-                        "*\t%08X\t%08X", 200, 400);
-               seq_printf(seq, "%-127s\n", bf);
+                       seq_printf(seq, "%-127s\n", bf);
+               }
        }
 
        return 0;
 }
 
-static struct seq_operations fib_trie_seq_ops = {
-       .start = fib_trie_seq_start,
-       .next  = fib_trie_seq_next,
-       .stop  = fib_trie_seq_stop,
-       .show  = fib_trie_seq_show,
+static struct seq_operations fib_route_seq_ops = {
+       .start  = fib_trie_seq_start,
+       .next   = fib_trie_seq_next,
+       .stop   = fib_trie_seq_stop,
+       .show   = fib_route_seq_show,
 };
 
-static int fib_trie_seq_open(struct inode *inode, struct file *file)
+static int fib_route_seq_open(struct inode *inode, struct file *file)
 {
        struct seq_file *seq;
        int rc = -ENOMEM;
+       struct fib_trie_iter *s = kmalloc(sizeof(*s), GFP_KERNEL);
 
-       rc = seq_open(file, &fib_trie_seq_ops);
+       if (!s)
+               goto out;
+
+       rc = seq_open(file, &fib_route_seq_ops);
        if (rc)
                goto out_kfree;
 
-       seq = file->private_data;
+       seq          = file->private_data;
+       seq->private = s;
+       memset(s, 0, sizeof(*s));
 out:
        return rc;
 out_kfree:
+       kfree(s);
        goto out;
 }
 
-static struct file_operations fib_trie_seq_fops = {
-       .owner  = THIS_MODULE,
-       .open   = fib_trie_seq_open,
-       .read   = seq_read,
-       .llseek = seq_lseek,
-       .release= seq_release_private,
+static struct file_operations fib_route_fops = {
+       .owner  = THIS_MODULE,
+       .open   = fib_route_seq_open,
+       .read   = seq_read,
+       .llseek = seq_lseek,
+       .release = seq_release_private,
 };
 
 int __init fib_proc_init(void)
 {
-       if (!proc_net_fops_create("fib_trie", S_IRUGO, &fib_trie_seq_fops))
-               return -ENOMEM;
+       if (!proc_net_fops_create("fib_trie", S_IRUGO, &fib_trie_fops))
+               goto out1;
+
+       if (!proc_net_fops_create("fib_triestat", S_IRUGO, &fib_triestat_fops))
+               goto out2;
+
+       if (!proc_net_fops_create("route", S_IRUGO, &fib_route_fops))
+               goto out3;
+
        return 0;
+
+out3:
+       proc_net_remove("fib_triestat");
+out2:
+       proc_net_remove("fib_trie");
+out1:
+       return -ENOMEM;
 }
 
 void __init fib_proc_exit(void)
 {
        proc_net_remove("fib_trie");
+       proc_net_remove("fib_triestat");
+       proc_net_remove("route");
 }
 
 #endif /* CONFIG_PROC_FS */