return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
}
-static inline void rps_lock(struct softnet_data *queue)
+static inline void rps_lock(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
- spin_lock(&queue->input_pkt_queue.lock);
+ spin_lock(&sd->input_pkt_queue.lock);
#endif
}
-static inline void rps_unlock(struct softnet_data *queue)
+static inline void rps_unlock(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
- spin_unlock(&queue->input_pkt_queue.lock);
+ spin_unlock(&sd->input_pkt_queue.lock);
#endif
}
int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
{
+ ASSERT_RTNL();
return raw_notifier_call_chain(&netdev_chain, val, dev);
}
*
* return values:
* NET_RX_SUCCESS (no congestion)
- * NET_RX_DROP (packet was dropped)
+ * NET_RX_DROP (packet was dropped, but freed)
*
* dev_forward_skb can be used for injecting an skb from the
* start_xmit function of one device into the receive queue
{
skb_orphan(skb);
- if (!(dev->flags & IFF_UP))
- return NET_RX_DROP;
-
- if (skb->len > (dev->mtu + dev->hard_header_len))
+ if (!(dev->flags & IFF_UP) ||
+ (skb->len > (dev->mtu + dev->hard_header_len))) {
+ kfree_skb(skb);
return NET_RX_DROP;
-
+ }
skb_set_dev(skb, dev);
skb->tstamp.tv64 = 0;
skb->pkt_type = PACKET_HOST;
local_irq_save(flags);
sd = &__get_cpu_var(softnet_data);
- q->next_sched = sd->output_queue;
- sd->output_queue = q;
+ q->next_sched = NULL;
+ *sd->output_queue_tailp = q;
+ sd->output_queue_tailp = &q->next_sched;
raise_softirq_irqoff(NET_TX_SOFTIRQ);
local_irq_restore(flags);
}
return 0;
}
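
A few lines above, sd->output_queue switches from head insertion to constant-time FIFO append by keeping a tail pointer (output_queue_tailp) that always addresses the last ->next_sched field. For illustration only, a minimal standalone sketch of that append pattern; struct item, struct fifo and the function names are hypothetical and not part of the patch:

struct item {
	struct item *next;
};

struct fifo {
	struct item *head;
	struct item **tailp;	/* &head while empty, else &last->next */
};

static void fifo_init(struct fifo *q)
{
	q->head = NULL;
	q->tailp = &q->head;
}

static void fifo_append(struct fifo *q, struct item *it)
{
	it->next = NULL;
	*q->tailp = it;		/* link behind the current last element (or set head) */
	q->tailp = &it->next;	/* the new element is now the tail */
}
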
+/*
+ * Try to orphan skb early, right before transmission by the device.
+ * We cannot orphan skb if tx timestamp is requested, since
+ * drivers need to call skb_tstamp_tx() to send the timestamp.
+ */
+static inline void skb_orphan_try(struct sk_buff *skb)
+{
+ if (!skb_tx(skb)->flags)
+ skb_orphan(skb);
+}
+
int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
struct netdev_queue *txq)
{
if (!list_empty(&ptype_all))
dev_queue_xmit_nit(skb, dev);
- if (netif_needs_gso(dev, skb)) {
- if (unlikely(dev_gso_segment(skb)))
- goto out_kfree_skb;
- if (skb->next)
- goto gso;
- }
-
/*
* If device doesn't need skb->dst, release it right now while
* it's hot in this cpu cache
if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
skb_dst_drop(skb);
+ skb_orphan_try(skb);
+
+ if (netif_needs_gso(dev, skb)) {
+ if (unlikely(dev_gso_segment(skb)))
+ goto out_kfree_skb;
+ if (skb->next)
+ goto gso;
+ }
+
rc = ops->ndo_start_xmit(skb, dev);
if (rc == NETDEV_TX_OK)
txq_trans_update(txq);
- /*
- * TODO: if skb_orphan() was called by
- * dev->hard_start_xmit() (for example, the unmodified
- * igb driver does that; bnx2 doesn't), then
- * skb_tx_software_timestamp() will be unable to send
- * back the time stamp.
- *
- * How can this be prevented? Always create another
- * reference to the socket before calling
- * dev->hard_start_xmit()? Prevent that skb_orphan()
- * does anything in dev->hard_start_xmit() by clearing
- * the skb destructor before the call and restoring it
- * afterwards, then doing the skb_orphan() ourselves?
- */
return rc;
}
if (skb->sk && skb->sk->sk_hash)
hash = skb->sk->sk_hash;
else
- hash = skb->protocol;
+ hash = (__force u16) skb->protocol;
hash = jhash_1word(hash, hashrnd);
if (dev->real_num_tx_queues > 1)
queue_index = skb_tx_hash(dev, skb);
- if (sk && rcu_dereference_check(sk->sk_dst_cache, 1))
- sk_tx_queue_set(sk, queue_index);
+ if (sk) {
+ struct dst_entry *dst = rcu_dereference_check(sk->sk_dst_cache, 1);
+
+ if (dst && skb_dst(skb) == dst)
+ sk_tx_queue_set(sk, queue_index);
+ }
}
}
int netdev_budget __read_mostly = 300;
int weight_p __read_mostly = 64; /* old backlog weight */
-DEFINE_PER_CPU(struct netif_rx_stats, netdev_rx_stat) = { 0, };
+/* Called with irq disabled */
+static inline void ____napi_schedule(struct softnet_data *sd,
+ struct napi_struct *napi)
+{
+ list_add_tail(&napi->poll_list, &sd->poll_list);
+ __raise_softirq_irqoff(NET_RX_SOFTIRQ);
+}
#ifdef CONFIG_RPS
int cpu = -1;
u8 ip_proto;
u16 tcpu;
- u32 addr1, addr2, ports, ihl;
+ u32 addr1, addr2, ihl;
+ union {
+ u32 v32;
+ u16 v16[2];
+ } ports;
if (skb_rx_queue_recorded(skb)) {
u16 index = skb_get_rx_queue(skb);
ip = (struct iphdr *) skb->data;
ip_proto = ip->protocol;
- addr1 = ip->saddr;
- addr2 = ip->daddr;
+ addr1 = (__force u32) ip->saddr;
+ addr2 = (__force u32) ip->daddr;
ihl = ip->ihl;
break;
case __constant_htons(ETH_P_IPV6):
ip6 = (struct ipv6hdr *) skb->data;
ip_proto = ip6->nexthdr;
- addr1 = ip6->saddr.s6_addr32[3];
- addr2 = ip6->daddr.s6_addr32[3];
+ addr1 = (__force u32) ip6->saddr.s6_addr32[3];
+ addr2 = (__force u32) ip6->daddr.s6_addr32[3];
ihl = (40 >> 2);
break;
default:
goto done;
}
- ports = 0;
switch (ip_proto) {
case IPPROTO_TCP:
case IPPROTO_UDP:
case IPPROTO_AH:
case IPPROTO_SCTP:
case IPPROTO_UDPLITE:
- if (pskb_may_pull(skb, (ihl * 4) + 4))
- ports = *((u32 *) (skb->data + (ihl * 4)));
- break;
-
+ if (pskb_may_pull(skb, (ihl * 4) + 4)) {
+ ports.v32 = * (__force u32 *) (skb->data + (ihl * 4));
+ if (ports.v16[1] < ports.v16[0])
+ swap(ports.v16[0], ports.v16[1]);
+ break;
+ }
default:
+ ports.v32 = 0;
break;
}
- skb->rxhash = jhash_3words(addr1, addr2, ports, hashrnd);
+ /* get a consistent hash (same value on both flow directions) */
+ if (addr2 < addr1)
+ swap(addr1, addr2);
+ skb->rxhash = jhash_3words(addr1, addr2, ports.v32, hashrnd);
if (!skb->rxhash)
skb->rxhash = 1;
return cpu;
}
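
get_rps_cpu() above orders the address pair and the port pair before hashing so that both directions of a flow produce the same rxhash. A minimal user-space sketch of the same idea follows; hash_mix() is a hypothetical stand-in for jhash_3words(), not the kernel's hash:

#include <stdint.h>

static uint32_t hash_mix(uint32_t a, uint32_t b, uint32_t c)
{
	/* toy mixer standing in for jhash_3words() */
	a ^= b * 0x9e3779b9u;
	a ^= c * 0x85ebca6bu;
	a ^= a >> 16;
	return a ? a : 1;	/* like the patch, never produce 0 */
}

static uint32_t flow_hash(uint32_t saddr, uint32_t daddr,
			  uint16_t sport, uint16_t dport)
{
	uint32_t t;
	uint16_t p;

	if (daddr < saddr) {	/* canonical address order */
		t = saddr;
		saddr = daddr;
		daddr = t;
	}
	if (dport < sport) {	/* canonical port order */
		p = sport;
		sport = dport;
		dport = p;
	}
	return hash_mix(saddr, daddr, ((uint32_t)sport << 16) | dport);
}
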
-/*
- * This structure holds the per-CPU mask of CPUs for which IPIs are scheduled
- * to be sent to kick remote softirq processing. There are two masks since
- * the sending of IPIs must be done with interrupts enabled. The select field
- * indicates the current mask that enqueue_backlog uses to schedule IPIs.
- * select is flipped before net_rps_action is called while still under lock,
- * net_rps_action then uses the non-selected mask to send the IPIs and clears
- * it without conflicting with enqueue_backlog operation.
- */
-struct rps_remote_softirq_cpus {
- cpumask_t mask[2];
- int select;
-};
-static DEFINE_PER_CPU(struct rps_remote_softirq_cpus, rps_remote_softirq_cpus);
-
/* Called from hardirq (IPI) context */
-static void trigger_softirq(void *data)
+static void rps_trigger_softirq(void *data)
{
- struct softnet_data *queue = data;
- __napi_schedule(&queue->backlog);
- __get_cpu_var(netdev_rx_stat).received_rps++;
+ struct softnet_data *sd = data;
+
+ ____napi_schedule(sd, &sd->backlog);
+ sd->received_rps++;
}
+
+#endif /* CONFIG_RPS */
+
+/*
+ * Check whether this softnet_data structure belongs to another CPU.
+ * If so, queue it on our IPI list and return 1;
+ * otherwise return 0.
+ */
+static int rps_ipi_queued(struct softnet_data *sd)
+{
+#ifdef CONFIG_RPS
+ struct softnet_data *mysd = &__get_cpu_var(softnet_data);
+
+ if (sd != mysd) {
+ sd->rps_ipi_next = mysd->rps_ipi_list;
+ mysd->rps_ipi_list = sd;
+
+ __raise_softirq_irqoff(NET_RX_SOFTIRQ);
+ return 1;
+ }
#endif /* CONFIG_RPS */
+ return 0;
+}
/*
* enqueue_to_backlog is called to queue an skb to a per CPU backlog
static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
unsigned int *qtail)
{
- struct softnet_data *queue;
+ struct softnet_data *sd;
unsigned long flags;
- queue = &per_cpu(softnet_data, cpu);
+ sd = &per_cpu(softnet_data, cpu);
local_irq_save(flags);
- __get_cpu_var(netdev_rx_stat).total++;
- rps_lock(queue);
- if (queue->input_pkt_queue.qlen <= netdev_max_backlog) {
- if (queue->input_pkt_queue.qlen) {
+ rps_lock(sd);
+ if (skb_queue_len(&sd->input_pkt_queue) <= netdev_max_backlog) {
+ if (skb_queue_len(&sd->input_pkt_queue)) {
enqueue:
- __skb_queue_tail(&queue->input_pkt_queue, skb);
+ __skb_queue_tail(&sd->input_pkt_queue, skb);
#ifdef CONFIG_RPS
- *qtail = queue->input_queue_head +
- queue->input_pkt_queue.qlen;
+ *qtail = sd->input_queue_head +
+ skb_queue_len(&sd->input_pkt_queue);
#endif
- rps_unlock(queue);
+ rps_unlock(sd);
local_irq_restore(flags);
return NET_RX_SUCCESS;
}
/* Schedule NAPI for backlog device */
- if (napi_schedule_prep(&queue->backlog)) {
-#ifdef CONFIG_RPS
- if (cpu != smp_processor_id()) {
- struct rps_remote_softirq_cpus *rcpus =
- &__get_cpu_var(rps_remote_softirq_cpus);
-
- cpu_set(cpu, rcpus->mask[rcpus->select]);
- __raise_softirq_irqoff(NET_RX_SOFTIRQ);
- goto enqueue;
- }
-#endif
- __napi_schedule(&queue->backlog);
+ if (napi_schedule_prep(&sd->backlog)) {
+ if (!rps_ipi_queued(sd))
+ ____napi_schedule(sd, &sd->backlog);
}
goto enqueue;
}
- rps_unlock(queue);
+ sd->dropped++;
+ rps_unlock(sd);
- __get_cpu_var(netdev_rx_stat).dropped++;
local_irq_restore(flags);
kfree_skb(skb);
local_irq_disable();
head = sd->output_queue;
sd->output_queue = NULL;
+ sd->output_queue_tailp = &sd->output_queue;
local_irq_enable();
while (head) {
skb->dev = master;
}
- __get_cpu_var(netdev_rx_stat).total++;
+ __get_cpu_var(softnet_data).processed++;
skb_reset_network_header(skb);
skb_reset_transport_header(skb);
}
EXPORT_SYMBOL(netif_receive_skb);
-/* Network device is going away, flush any packets still pending */
+/* Network device is going away, flush any packets still pending.
+ * Called with irqs disabled.
+ */
static void flush_backlog(void *arg)
{
struct net_device *dev = arg;
- struct softnet_data *queue = &__get_cpu_var(softnet_data);
+ struct softnet_data *sd = &__get_cpu_var(softnet_data);
struct sk_buff *skb, *tmp;
- rps_lock(queue);
- skb_queue_walk_safe(&queue->input_pkt_queue, skb, tmp)
+ rps_lock(sd);
+ skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
+ if (skb->dev == dev) {
+ __skb_unlink(skb, &sd->input_pkt_queue);
+ kfree_skb(skb);
+ input_queue_head_add(sd, 1);
+ }
+ }
+ rps_unlock(sd);
+
+ skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
if (skb->dev == dev) {
- __skb_unlink(skb, &queue->input_pkt_queue);
+ __skb_unlink(skb, &sd->process_queue);
kfree_skb(skb);
- incr_input_queue_head(queue);
}
- rps_unlock(queue);
+ }
}
static int napi_gro_complete(struct sk_buff *skb)
}
EXPORT_SYMBOL(napi_gro_frags);
+/*
+ * net_rps_action_and_irq_enable() sends any pending IPIs for RPS.
+ * Note: called with local irq disabled, but exits with local irq enabled.
+ */
+static void net_rps_action_and_irq_enable(struct softnet_data *sd)
+{
+#ifdef CONFIG_RPS
+ struct softnet_data *remsd = sd->rps_ipi_list;
+
+ if (remsd) {
+ sd->rps_ipi_list = NULL;
+
+ local_irq_enable();
+
+ /* Send pending IPIs to kick RPS processing on remote CPUs. */
+ while (remsd) {
+ struct softnet_data *next = remsd->rps_ipi_next;
+
+ if (cpu_online(remsd->cpu))
+ __smp_call_function_single(remsd->cpu,
+ &remsd->csd, 0);
+ remsd = next;
+ }
+ } else
+#endif
+ local_irq_enable();
+}
+
static int process_backlog(struct napi_struct *napi, int quota)
{
int work = 0;
- struct softnet_data *queue = &__get_cpu_var(softnet_data);
+ struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
+#ifdef CONFIG_RPS
+ /* If IPIs are pending, it is better to send them now
+ * rather than wait for net_rx_action() to finish.
+ */
+ if (sd->rps_ipi_list) {
+ local_irq_disable();
+ net_rps_action_and_irq_enable(sd);
+ }
+#endif
napi->weight = weight_p;
- do {
+ local_irq_disable();
+ while (work < quota) {
struct sk_buff *skb;
+ unsigned int qlen;
- local_irq_disable();
- rps_lock(queue);
- skb = __skb_dequeue(&queue->input_pkt_queue);
- if (!skb) {
- __napi_complete(napi);
- rps_unlock(queue);
+ while ((skb = __skb_dequeue(&sd->process_queue))) {
local_irq_enable();
- break;
+ __netif_receive_skb(skb);
+ if (++work >= quota)
+ return work;
+ local_irq_disable();
}
- incr_input_queue_head(queue);
- rps_unlock(queue);
- local_irq_enable();
- __netif_receive_skb(skb);
- } while (++work < quota);
+ rps_lock(sd);
+ qlen = skb_queue_len(&sd->input_pkt_queue);
+ if (qlen) {
+ input_queue_head_add(sd, qlen);
+ skb_queue_splice_tail_init(&sd->input_pkt_queue,
+ &sd->process_queue);
+ }
+ if (qlen < quota - work) {
+ /*
+ * Inline a custom version of __napi_complete().
+ * Only the current cpu owns and manipulates this napi,
+ * and NAPI_STATE_SCHED is the only possible flag set on backlog.
+ * We can use a plain write instead of clear_bit(),
+ * and we don't need an smp_mb() memory barrier.
+ */
+ list_del(&napi->poll_list);
+ napi->state = 0;
+
+ quota = work + qlen;
+ }
+ rps_unlock(sd);
+ }
+ local_irq_enable();
return work;
}
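
process_backlog() now drains packets in two stages: under rps_lock() it splices input_pkt_queue onto the private process_queue, then delivers the spliced packets without holding that lock. For illustration only, a user-space sketch of the splice-then-process pattern; the names and the pthread mutex (standing in for the irq-safe queue lock) are hypothetical:

#include <pthread.h>
#include <stddef.h>

struct node {
	struct node *next;
};

struct backlog {
	pthread_mutex_t lock;	/* protects input (shared with producers) */
	struct node *input;	/* producers add packets here under lock */
	struct node *process;	/* private to the consumer */
};

static void drain(struct backlog *b, void (*deliver)(struct node *))
{
	struct node *n;

	pthread_mutex_lock(&b->lock);
	b->process = b->input;	/* splice the whole shared list ... */
	b->input = NULL;	/* ... leaving it empty for producers */
	pthread_mutex_unlock(&b->lock);

	while ((n = b->process) != NULL) {
		b->process = n->next;
		deliver(n);	/* no lock held while processing */
	}
}
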
unsigned long flags;
local_irq_save(flags);
- list_add_tail(&n->poll_list, &__get_cpu_var(softnet_data).poll_list);
- __raise_softirq_irqoff(NET_RX_SOFTIRQ);
+ ____napi_schedule(&__get_cpu_var(softnet_data), n);
local_irq_restore(flags);
}
EXPORT_SYMBOL(__napi_schedule);
}
EXPORT_SYMBOL(netif_napi_del);
-#ifdef CONFIG_RPS
-/*
- * net_rps_action sends any pending IPI's for rps. This is only called from
- * softirq and interrupts must be enabled.
- */
-static void net_rps_action(cpumask_t *mask)
-{
- int cpu;
-
- /* Send pending IPI's to kick RPS processing on remote cpus. */
- for_each_cpu_mask_nr(cpu, *mask) {
- struct softnet_data *queue = &per_cpu(softnet_data, cpu);
- if (cpu_online(cpu))
- __smp_call_function_single(cpu, &queue->csd, 0);
- }
- cpus_clear(*mask);
-}
-#endif
-
static void net_rx_action(struct softirq_action *h)
{
- struct list_head *list = &__get_cpu_var(softnet_data).poll_list;
+ struct softnet_data *sd = &__get_cpu_var(softnet_data);
unsigned long time_limit = jiffies + 2;
int budget = netdev_budget;
void *have;
-#ifdef CONFIG_RPS
- int select;
- struct rps_remote_softirq_cpus *rcpus;
-#endif
local_irq_disable();
- while (!list_empty(list)) {
+ while (!list_empty(&sd->poll_list)) {
struct napi_struct *n;
int work, weight;
* entries to the tail of this list, and only ->poll()
* calls can remove this head entry from the list.
*/
- n = list_first_entry(list, struct napi_struct, poll_list);
+ n = list_first_entry(&sd->poll_list, struct napi_struct, poll_list);
have = netpoll_poll_lock(n);
napi_complete(n);
local_irq_disable();
} else
- list_move_tail(&n->poll_list, list);
+ list_move_tail(&n->poll_list, &sd->poll_list);
}
netpoll_poll_unlock(have);
}
out:
-#ifdef CONFIG_RPS
- rcpus = &__get_cpu_var(rps_remote_softirq_cpus);
- select = rcpus->select;
- rcpus->select ^= 1;
-
- local_irq_enable();
-
- net_rps_action(&rcpus->mask[select]);
-#else
- local_irq_enable();
-#endif
+ net_rps_action_and_irq_enable(sd);
#ifdef CONFIG_NET_DMA
/*
return;
softnet_break:
- __get_cpu_var(netdev_rx_stat).time_squeeze++;
+ sd->time_squeeze++;
__raise_softirq_irqoff(NET_RX_SOFTIRQ);
goto out;
}
return 0;
}
-static struct netif_rx_stats *softnet_get_online(loff_t *pos)
+static struct softnet_data *softnet_get_online(loff_t *pos)
{
- struct netif_rx_stats *rc = NULL;
+ struct softnet_data *sd = NULL;
while (*pos < nr_cpu_ids)
if (cpu_online(*pos)) {
- rc = &per_cpu(netdev_rx_stat, *pos);
+ sd = &per_cpu(softnet_data, *pos);
break;
} else
++*pos;
- return rc;
+ return sd;
}
static void *softnet_seq_start(struct seq_file *seq, loff_t *pos)
static int softnet_seq_show(struct seq_file *seq, void *v)
{
- struct netif_rx_stats *s = v;
+ struct softnet_data *sd = v;
seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
- s->total, s->dropped, s->time_squeeze, 0,
+ sd->processed, sd->dropped, sd->time_squeeze, 0,
0, 0, 0, 0, /* was fastroute */
- s->cpu_collision, s->received_rps);
+ sd->cpu_collision, sd->received_rps);
return 0;
}
void *ocpu)
{
struct sk_buff **list_skb;
- struct Qdisc **list_net;
struct sk_buff *skb;
unsigned int cpu, oldcpu = (unsigned long)ocpu;
struct softnet_data *sd, *oldsd;
*list_skb = oldsd->completion_queue;
oldsd->completion_queue = NULL;
- /* Find end of our output_queue. */
- list_net = &sd->output_queue;
- while (*list_net)
- list_net = &(*list_net)->next_sched;
/* Append output queue from offline CPU. */
- *list_net = oldsd->output_queue;
- oldsd->output_queue = NULL;
+ if (oldsd->output_queue) {
+ *sd->output_queue_tailp = oldsd->output_queue;
+ sd->output_queue_tailp = oldsd->output_queue_tailp;
+ oldsd->output_queue = NULL;
+ oldsd->output_queue_tailp = &oldsd->output_queue;
+ }
raise_softirq_irqoff(NET_TX_SOFTIRQ);
local_irq_enable();
/* Process offline CPU's input_pkt_queue */
while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) {
netif_rx(skb);
- incr_input_queue_head(oldsd);
+ input_queue_head_add(oldsd, 1);
}
+ while ((skb = __skb_dequeue(&oldsd->process_queue)))
+ netif_rx(skb);
return NOTIFY_OK;
}
*/
for_each_possible_cpu(i) {
- struct softnet_data *queue;
-
- queue = &per_cpu(softnet_data, i);
- skb_queue_head_init(&queue->input_pkt_queue);
- queue->completion_queue = NULL;
- INIT_LIST_HEAD(&queue->poll_list);
+ struct softnet_data *sd = &per_cpu(softnet_data, i);
+ memset(sd, 0, sizeof(*sd));
+ skb_queue_head_init(&sd->input_pkt_queue);
+ skb_queue_head_init(&sd->process_queue);
+ sd->completion_queue = NULL;
+ INIT_LIST_HEAD(&sd->poll_list);
+ sd->output_queue = NULL;
+ sd->output_queue_tailp = &sd->output_queue;
#ifdef CONFIG_RPS
- queue->csd.func = trigger_softirq;
- queue->csd.info = queue;
- queue->csd.flags = 0;
+ sd->csd.func = rps_trigger_softirq;
+ sd->csd.info = sd;
+ sd->csd.flags = 0;
+ sd->cpu = i;
#endif
- queue->backlog.poll = process_backlog;
- queue->backlog.weight = weight_p;
- queue->backlog.gro_list = NULL;
- queue->backlog.gro_count = 0;
+ sd->backlog.poll = process_backlog;
+ sd->backlog.weight = weight_p;
+ sd->backlog.gro_list = NULL;
+ sd->backlog.gro_count = 0;
}
dev_boot_phase = 0;