Merge branch 'master' of master.kernel.org:/pub/scm/linux/kernel/git/davem/net-2.6
[pandora-kernel.git] / net / core / dev.c
index f112970..033d7ca 100644 (file)
 /* This should be increased if a protocol with a bigger head is added. */
 #define GRO_MAX_HEAD (MAX_HEADER + 128)
 
+enum {                 /* result codes produced by dev_gro_receive() */
+       GRO_MERGED,     /* same-flow match, CB "free" flag clear; *_finish helpers take no action */
+       GRO_MERGED_FREE, /* same-flow match, CB "free" flag set; skb is freed or reused by *_finish */
+       GRO_HELD,       /* skb was linked onto napi->gro_list */
+       GRO_NORMAL,     /* GRO not applied; *_finish helpers call netif_receive_skb() */
+       GRO_DROP,       /* pskb_may_pull() of the GRO offset failed; reported as NET_RX_DROP */
+};
+
 /*
  *     The list of packet types we will receive (as opposed to discard)
  *     and the routines to invoke.
@@ -1668,6 +1676,7 @@ int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
                        struct netdev_queue *txq)
 {
        const struct net_device_ops *ops = dev->netdev_ops;
+       int rc;
 
        prefetch(&dev->netdev_ops->ndo_start_xmit);
        if (likely(!skb->next)) {
@@ -1681,13 +1690,27 @@ int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
                                goto gso;
                }
 
-               return ops->ndo_start_xmit(skb, dev);
+               rc = ops->ndo_start_xmit(skb, dev);
+               /*
+                * TODO: if skb_orphan() was called by
+                * dev->hard_start_xmit() (for example, the unmodified
+                * igb driver does that; bnx2 doesn't), then
+                * skb_tx_software_timestamp() will be unable to send
+                * back the time stamp.
+                *
+                * How can this be prevented? Always create another
+                * reference to the socket before calling
+                * dev->hard_start_xmit()? Prevent that skb_orphan()
+                * does anything in dev->hard_start_xmit() by clearing
+                * the skb destructor before the call and restoring it
+                * afterwards, then doing the skb_orphan() ourselves?
+                */
+               return rc;
        }
 
 gso:
        do {
                struct sk_buff *nskb = skb->next;
-               int rc;
 
                skb->next = nskb->next;
                nskb->next = NULL;
@@ -1708,56 +1731,20 @@ out_kfree_skb:
        return 0;
 }
 
-static u32 simple_tx_hashrnd;
-static int simple_tx_hashrnd_initialized = 0;
+static u32 skb_tx_hashrnd;     /* seed passed to jhash_1word() in skb_tx_hash() */
 
-static u16 simple_tx_hash(struct net_device *dev, struct sk_buff *skb)
+static u16 skb_tx_hash(struct net_device *dev, struct sk_buff *skb) /* returns an index below dev->real_num_tx_queues */
 {
-       u32 addr1, addr2, ports;
-       u32 hash, ihl;
-       u8 ip_proto = 0;
-
-       if (unlikely(!simple_tx_hashrnd_initialized)) {
-               get_random_bytes(&simple_tx_hashrnd, 4);
-               simple_tx_hashrnd_initialized = 1;
-       }
-
-       switch (skb->protocol) {
-       case htons(ETH_P_IP):
-               if (!(ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)))
-                       ip_proto = ip_hdr(skb)->protocol;
-               addr1 = ip_hdr(skb)->saddr;
-               addr2 = ip_hdr(skb)->daddr;
-               ihl = ip_hdr(skb)->ihl;
-               break;
-       case htons(ETH_P_IPV6):
-               ip_proto = ipv6_hdr(skb)->nexthdr;
-               addr1 = ipv6_hdr(skb)->saddr.s6_addr32[3];
-               addr2 = ipv6_hdr(skb)->daddr.s6_addr32[3];
-               ihl = (40 >> 2);
-               break;
-       default:
-               return 0;
-       }
+       u32 hash;       /* input key first, then the mixed hash */
 
+       if (skb_rx_queue_recorded(skb)) {       /* 1st choice: value from skb_get_rx_queue() */
+               hash = skb_get_rx_queue(skb);
+       } else if (skb->sk && skb->sk->sk_hash) {       /* 2nd choice: non-zero socket hash */
+               hash = skb->sk->sk_hash;
+       } else  /* fallback: the skb's protocol field */
+               hash = skb->protocol;
 
-       switch (ip_proto) {
-       case IPPROTO_TCP:
-       case IPPROTO_UDP:
-       case IPPROTO_DCCP:
-       case IPPROTO_ESP:
-       case IPPROTO_AH:
-       case IPPROTO_SCTP:
-       case IPPROTO_UDPLITE:
-               ports = *((u32 *) (skb_network_header(skb) + (ihl * 4)));
-               break;
-
-       default:
-               ports = 0;
-               break;
-       }
-
-       hash = jhash_3words(addr1, addr2, ports, simple_tx_hashrnd);
+       hash = jhash_1word(hash, skb_tx_hashrnd);       /* mix the key with the seed */
 
        return (u16) (((u64) hash * dev->real_num_tx_queues) >> 32);
 }
@@ -1771,7 +1758,7 @@ static struct netdev_queue *dev_pick_tx(struct net_device *dev,
        if (ops->ndo_select_queue)
                queue_index = ops->ndo_select_queue(dev, skb);
        else if (dev->real_num_tx_queues > 1)
-               queue_index = simple_tx_hash(dev, skb);
+               queue_index = skb_tx_hash(dev, skb);
 
        skb_set_queue_mapping(skb, queue_index);
        return netdev_get_tx_queue(dev, queue_index);
@@ -2297,6 +2284,8 @@ ncls:
        if (!skb)
                goto out;
 
+       skb_orphan(skb);
+
        type = skb->protocol;
        list_for_each_entry_rcu(ptype,
                        &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
@@ -2366,7 +2355,6 @@ static int napi_gro_complete(struct sk_buff *skb)
 
 out:
        skb_shinfo(skb)->gso_size = 0;
-       __skb_push(skb, -skb_network_offset(skb));
        return netif_receive_skb(skb);
 }
 
@@ -2380,20 +2368,40 @@ void napi_gro_flush(struct napi_struct *napi)
                napi_gro_complete(skb);
        }
 
+       napi->gro_count = 0;
        napi->gro_list = NULL;
 }
 EXPORT_SYMBOL(napi_gro_flush);
 
+/*
+ * skb_gro_header - get a directly addressable pointer to a header located
+ * at the skb's current GRO offset.
+ * @skb:  packet being examined
+ * @hlen: number of header bytes the caller needs, counted from the GRO offset
+ *
+ * Returns a pointer to the header, or NULL when pskb_may_pull() could not
+ * make the requested bytes available.  The pointer refers either to
+ * skb->data + offset or to an address inside the page of frags[0].
+ */
+void *skb_gro_header(struct sk_buff *skb, unsigned int hlen)
+{
+       unsigned int offset = skb_gro_offset(skb);
+       unsigned int headlen = skb_headlen(skb);
+
+       /* From here on hlen is the end of the header relative to skb->data. */
+       hlen += offset;
+       if (hlen <= headlen)
+               return skb->data + offset;
+
+       /*
+        * Fall back to pskb_may_pull() when frags[0] cannot be used directly:
+        *  - the header starts below headlen but ends beyond it, so it is
+        *    not contiguous in frags[0] (offset - headlen would be negative
+        *    and the address computed below would precede the fragment),
+        *  - there is no fragment at all,
+        *  - frags[0].size is not larger than the end of the header within
+        *    the fragment, or
+        *  - PageHighMem() is true for the fragment's page.
+        */
+       if (unlikely(offset < headlen ||
+                    !skb_shinfo(skb)->nr_frags ||
+                    skb_shinfo(skb)->frags[0].size <= hlen - headlen ||
+                    PageHighMem(skb_shinfo(skb)->frags[0].page)))
+               return pskb_may_pull(skb, hlen) ? skb->data + offset : NULL;
+
+       /* offset >= headlen here, so offset - headlen indexes into frags[0]. */
+       return page_address(skb_shinfo(skb)->frags[0].page) +
+              skb_shinfo(skb)->frags[0].page_offset +
+              offset - headlen;
+}
+EXPORT_SYMBOL(skb_gro_header);
+
 int dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
 {
        struct sk_buff **pp = NULL;
        struct packet_type *ptype;
        __be16 type = skb->protocol;
        struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
-       int count = 0;
        int same_flow;
        int mac_len;
-       int free;
+       int ret;
 
        if (!(skb->dev->features & NETIF_F_GRO))
                goto normal;
@@ -2403,30 +2411,16 @@ int dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
 
        rcu_read_lock();
        list_for_each_entry_rcu(ptype, head, list) {
-               struct sk_buff *p;
-
                if (ptype->type != type || ptype->dev || !ptype->gro_receive)
                        continue;
 
-               skb_reset_network_header(skb);
+               skb_set_network_header(skb, skb_gro_offset(skb));
                mac_len = skb->network_header - skb->mac_header;
                skb->mac_len = mac_len;
                NAPI_GRO_CB(skb)->same_flow = 0;
                NAPI_GRO_CB(skb)->flush = 0;
                NAPI_GRO_CB(skb)->free = 0;
 
-               for (p = napi->gro_list; p; p = p->next) {
-                       count++;
-
-                       if (!NAPI_GRO_CB(p)->same_flow)
-                               continue;
-
-                       if (p->mac_len != mac_len ||
-                           memcmp(skb_mac_header(p), skb_mac_header(skb),
-                                  mac_len))
-                               NAPI_GRO_CB(p)->same_flow = 0;
-               }
-
                pp = ptype->gro_receive(&napi->gro_list, skb);
                break;
        }
@@ -2436,7 +2430,7 @@ int dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
                goto normal;
 
        same_flow = NAPI_GRO_CB(skb)->same_flow;
-       free = NAPI_GRO_CB(skb)->free;
+       ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
 
        if (pp) {
                struct sk_buff *nskb = *pp;
@@ -2444,27 +2438,35 @@ int dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
                *pp = nskb->next;
                nskb->next = NULL;
                napi_gro_complete(nskb);
-               count--;
+               napi->gro_count--;
        }
 
        if (same_flow)
                goto ok;
 
-       if (NAPI_GRO_CB(skb)->flush || count >= MAX_GRO_SKBS) {
-               __skb_push(skb, -skb_network_offset(skb));
+       if (NAPI_GRO_CB(skb)->flush || napi->gro_count >= MAX_GRO_SKBS)
                goto normal;
-       }
 
+       napi->gro_count++;
        NAPI_GRO_CB(skb)->count = 1;
-       skb_shinfo(skb)->gso_size = skb->len;
+       skb_shinfo(skb)->gso_size = skb_gro_len(skb);
        skb->next = napi->gro_list;
        napi->gro_list = skb;
+       ret = GRO_HELD;
+
+pull:
+       if (unlikely(!pskb_may_pull(skb, skb_gro_offset(skb)))) {
+               if (napi->gro_list == skb)
+                       napi->gro_list = skb->next;
+               ret = GRO_DROP;
+       }
 
 ok:
-       return free;
+       return ret;
 
 normal:
-       return -1;
+       ret = GRO_NORMAL;
+       goto pull;
 }
 EXPORT_SYMBOL(dev_gro_receive);
 
@@ -2473,28 +2475,43 @@ static int __napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
        struct sk_buff *p;
 
        for (p = napi->gro_list; p; p = p->next) {
-               NAPI_GRO_CB(p)->same_flow = 1;
+               NAPI_GRO_CB(p)->same_flow = !compare_ether_header(
+                       skb_mac_header(p), skb_gro_mac_header(skb));
                NAPI_GRO_CB(p)->flush = 0;
        }
 
        return dev_gro_receive(napi, skb);
 }
 
-int napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
+int napi_skb_finish(int ret, struct sk_buff *skb)      /* act on a GRO_* result code for skb */
 {
+       int err = NET_RX_SUCCESS;       /* returned for every code except GRO_NORMAL and GRO_DROP */
+
        if (netpoll_receive_skb(skb))
                return NET_RX_DROP;
 
-       switch (__napi_gro_receive(napi, skb)) {
-       case -1:
+       switch (ret) {  /* GRO_MERGED and GRO_HELD match no case: skb is left untouched */
+       case GRO_NORMAL:        /* hand the skb to netif_receive_skb() and return its result */
                return netif_receive_skb(skb);
 
-       case 1:
+       case GRO_DROP:
+               err = NET_RX_DROP;      /* report the drop, then free the skb below */
+               /* fall through */
+
+       case GRO_MERGED_FREE:
                kfree_skb(skb);
                break;
        }
 
-       return NET_RX_SUCCESS;
+       return err;
+}
+EXPORT_SYMBOL(napi_skb_finish);
+
+int napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)    /* returns the napi_skb_finish() result */
+{
+       skb_gro_reset_offset(skb);
+
+       return napi_skb_finish(__napi_gro_receive(napi, skb), skb);     /* NOTE(review): netpoll check now runs after GRO -- confirm intended */
 }
 EXPORT_SYMBOL(napi_gro_receive);
 
@@ -2512,6 +2529,9 @@ struct sk_buff *napi_fraginfo_skb(struct napi_struct *napi,
 {
        struct net_device *dev = napi->dev;
        struct sk_buff *skb = napi->skb;
+       struct ethhdr *eth;
+       skb_frag_t *frag;
+       int i;
 
        napi->skb = NULL;
 
@@ -2524,20 +2544,36 @@ struct sk_buff *napi_fraginfo_skb(struct napi_struct *napi,
        }
 
        BUG_ON(info->nr_frags > MAX_SKB_FRAGS);
+       frag = &info->frags[info->nr_frags - 1];
+
+       for (i = skb_shinfo(skb)->nr_frags; i < info->nr_frags; i++) {
+               skb_fill_page_desc(skb, i, frag->page, frag->page_offset,
+                                  frag->size);
+               frag++;
+       }
        skb_shinfo(skb)->nr_frags = info->nr_frags;
-       memcpy(skb_shinfo(skb)->frags, info->frags, sizeof(info->frags));
 
        skb->data_len = info->len;
        skb->len += info->len;
        skb->truesize += info->len;
 
-       if (!pskb_may_pull(skb, ETH_HLEN)) {
+       skb_reset_mac_header(skb);
+       skb_gro_reset_offset(skb);
+
+       eth = skb_gro_header(skb, sizeof(*eth));
+       if (!eth) {
                napi_reuse_skb(napi, skb);
                skb = NULL;
                goto out;
        }
 
-       skb->protocol = eth_type_trans(skb, dev);
+       skb_gro_pull(skb, sizeof(*eth));
+
+       /*
+        * This works because the only protocols we care about don't require
+        * special handling.  We'll fix it up properly at the end.
+        */
+       skb->protocol = eth->h_proto;
 
        skb->ip_summed = info->ip_summed;
        skb->csum = info->csum;
@@ -2547,32 +2583,46 @@ out:
 }
 EXPORT_SYMBOL(napi_fraginfo_skb);
 
-int napi_gro_frags(struct napi_struct *napi, struct napi_gro_fraginfo *info)
+int napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb, int ret)  /* act on a GRO_* result code for skb */
 {
-       struct sk_buff *skb = napi_fraginfo_skb(napi, info);
-       int err = NET_RX_DROP;
-
-       if (!skb)
-               goto out;
+       int err = NET_RX_SUCCESS;       /* returned unless GRO_NORMAL or GRO_DROP says otherwise */
 
        if (netpoll_receive_skb(skb))
-               goto out;
+               return NET_RX_DROP;
 
-       err = NET_RX_SUCCESS;
+       switch (ret) {  /* GRO_MERGED matches no case: skb is left untouched */
+       case GRO_NORMAL:
+       case GRO_HELD:
+               skb->protocol = eth_type_trans(skb, napi->dev);        /* replaces the raw eth->h_proto set in napi_fraginfo_skb() */
 
-       switch (__napi_gro_receive(napi, skb)) {
-       case -1:
-               return netif_receive_skb(skb);
+               if (ret == GRO_NORMAL)
+                       return netif_receive_skb(skb);
 
-       case 0:
-               goto out;
-       }
+               skb_gro_pull(skb, -ETH_HLEN);   /* GRO_HELD only; presumably rewinds the GRO offset by ETH_HLEN -- confirm */
+               break;
 
-       napi_reuse_skb(napi, skb);
+       case GRO_DROP:
+               err = NET_RX_DROP;      /* report the drop, then recycle the skb below */
+               /* fall through */
+
+       case GRO_MERGED_FREE:
+               napi_reuse_skb(napi, skb);      /* skb goes to napi_reuse_skb() rather than kfree_skb() */
+               break;
+       }
 
-out:
        return err;
 }
+EXPORT_SYMBOL(napi_frags_finish);
+
+int napi_gro_frags(struct napi_struct *napi, struct napi_gro_fraginfo *info)   /* build skb from info, run GRO, finish */
+{
+       struct sk_buff *skb = napi_fraginfo_skb(napi, info);
+
+       if (!skb)       /* napi_fraginfo_skb() produced no skb */
+               return NET_RX_DROP;
+
+       return napi_frags_finish(napi, skb, __napi_gro_receive(napi, skb));
+}
 EXPORT_SYMBOL(napi_gro_frags);
 
 static int process_backlog(struct napi_struct *napi, int quota)
@@ -2652,6 +2702,7 @@ void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
                    int (*poll)(struct napi_struct *, int), int weight)
 {
        INIT_LIST_HEAD(&napi->poll_list);
+       napi->gro_count = 0;
        napi->gro_list = NULL;
        napi->skb = NULL;
        napi->poll = poll;
@@ -2680,6 +2731,7 @@ void netif_napi_del(struct napi_struct *napi)
        }
 
        napi->gro_list = NULL;
+       napi->gro_count = 0;
 }
 EXPORT_SYMBOL(netif_napi_del);
 
@@ -3948,6 +4000,7 @@ static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd)
                            cmd == SIOCSMIIREG ||
                            cmd == SIOCBRADDIF ||
                            cmd == SIOCBRDELIF ||
+                           cmd == SIOCSHWTSTAMP ||
                            cmd == SIOCWANDEV) {
                                err = -EOPNOTSUPP;
                                if (ops->ndo_do_ioctl) {
@@ -4102,6 +4155,7 @@ int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg)
                case SIOCBONDCHANGEACTIVE:
                case SIOCBRADDIF:
                case SIOCBRDELIF:
+               case SIOCSHWTSTAMP:
                        if (!capable(CAP_NET_ADMIN))
                                return -EPERM;
                        /* fall through */
@@ -5198,6 +5252,7 @@ static int __init net_dev_init(void)
                queue->backlog.poll = process_backlog;
                queue->backlog.weight = weight_p;
                queue->backlog.gro_list = NULL;
+               queue->backlog.gro_count = 0;
        }
 
        dev_boot_phase = 0;
@@ -5230,6 +5285,14 @@ out:
 
 subsys_initcall(net_dev_init);
 
+static int __init initialize_hashrnd(void)     /* seeds skb_tx_hashrnd; always returns 0 */
+{
+       get_random_bytes(&skb_tx_hashrnd, sizeof(skb_tx_hashrnd));      /* NOTE(review): seed keeps its static zero value until this runs */
+       return 0;
+}
+
+late_initcall_sync(initialize_hashrnd);        /* registered through late_initcall_sync() */
+
 EXPORT_SYMBOL(__dev_get_by_index);
 EXPORT_SYMBOL(__dev_get_by_name);
 EXPORT_SYMBOL(__dev_remove_pack);