Merge branch 'for-2.6.39/core' of git://git.kernel.dk/linux-2.6-block
kernel/sched.c (pandora-kernel.git)
index 42eab5a..ae659b9 100644
@@ -32,7 +32,6 @@
 #include <linux/init.h>
 #include <linux/uaccess.h>
 #include <linux/highmem.h>
-#include <linux/smp_lock.h>
 #include <asm/mmu_context.h>
 #include <linux/interrupt.h>
 #include <linux/capability.h>
@@ -324,7 +323,7 @@ struct cfs_rq {
         * 'curr' points to currently running entity on this cfs_rq.
         * It is set to NULL otherwise (i.e when none are currently running).
         */
-       struct sched_entity *curr, *next, *last;
+       struct sched_entity *curr, *next, *last, *skip;
 
        unsigned int nr_spread_over;
 
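The new skip pointer is a third buddy hint alongside next and last: the fair-class pick logic (in sched_fair.c, not part of this diff) can use it to avoid immediately re-picking an entity that has just yielded. A rough userspace sketch of the idea, with invented names rather than the kernel's pick_next_entity():

```c
/*
 * Simplified model of a "skip" buddy hint: prefer the entity with the
 * smallest vruntime unless it is the one marked as skipped, in which
 * case fall back to the runner-up. Hypothetical names throughout.
 */
#include <stddef.h>
#include <stdio.h>

struct entity {
        const char *name;
        unsigned long long vruntime;
};

struct runqueue {
        struct entity *candidates[4];
        size_t nr;
        struct entity *skip;    /* set by a yield, cleared once honoured */
};

static struct entity *pick_next(struct runqueue *rq)
{
        struct entity *best = NULL, *second = NULL;
        size_t i;

        for (i = 0; i < rq->nr; i++) {
                struct entity *e = rq->candidates[i];

                if (!best || e->vruntime < best->vruntime) {
                        second = best;
                        best = e;
                } else if (!second || e->vruntime < second->vruntime) {
                        second = e;
                }
        }
        /* Honour the skip hint only if someone else can run instead. */
        if (best && best == rq->skip && second)
                best = second;
        rq->skip = NULL;
        return best;
}

int main(void)
{
        struct entity a = { "a", 100 }, b = { "b", 150 };
        struct runqueue rq = { { &a, &b }, 2, &a };

        printf("picked %s\n", pick_next(&rq)->name);    /* "b": a is skipped  */
        printf("picked %s\n", pick_next(&rq)->name);    /* "a": hint consumed */
        return 0;
}
```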
@@ -606,9 +605,6 @@ static inline struct task_group *task_group(struct task_struct *p)
        struct task_group *tg;
        struct cgroup_subsys_state *css;
 
-       if (p->flags & PF_EXITING)
-               return &root_task_group;
-
        css = task_subsys_state_check(p, cpu_cgroup_subsys_id,
                        lockdep_is_held(&task_rq(p)->lock));
        tg = container_of(css, struct task_group, css);
@@ -664,10 +660,9 @@ static void update_rq_clock(struct rq *rq)
 #endif
 
 /**
- * runqueue_is_locked
+ * runqueue_is_locked - Returns true if the current cpu runqueue is locked
  * @cpu: the processor in question.
  *
- * Returns true if the current cpu runqueue is locked.
  * This interface allows printk to be called with the runqueue lock
  * held and know whether or not it is OK to wake up the klogd.
  */
@@ -1686,6 +1681,39 @@ static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
                __release(rq2->lock);
 }
 
+#else /* CONFIG_SMP */
+
+/*
+ * double_rq_lock - safely lock two runqueues
+ *
+ * Note this does not disable interrupts like task_rq_lock,
+ * you need to do so manually before calling.
+ */
+static void double_rq_lock(struct rq *rq1, struct rq *rq2)
+       __acquires(rq1->lock)
+       __acquires(rq2->lock)
+{
+       BUG_ON(!irqs_disabled());
+       BUG_ON(rq1 != rq2);
+       raw_spin_lock(&rq1->lock);
+       __acquire(rq2->lock);   /* Fake it out ;) */
+}
+
+/*
+ * double_rq_unlock - safely unlock two runqueues
+ *
+ * Note this does not restore interrupts like task_rq_unlock,
+ * you need to do so manually after calling.
+ */
+static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
+       __releases(rq1->lock)
+       __releases(rq2->lock)
+{
+       BUG_ON(rq1 != rq2);
+       raw_spin_unlock(&rq1->lock);
+       __release(rq2->lock);
+}
+
 #endif
 
 static void calc_load_account_idle(struct rq *this_rq);
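The !CONFIG_SMP variants above exploit the fact that a uniprocessor build has exactly one runqueue, so "locking both" means taking the single lock once and satisfying sparse with __acquire()/__release() on the other. The hazard the SMP versions guard against is ordering between two distinct locks; a minimal pthread analogue of that pattern (illustrative only, not kernel code):

```c
/*
 * Illustrative pthread analogue of double_rq_lock(): take two locks in
 * a stable (address) order so no pair of callers can deadlock, and take
 * a shared lock only once when both "queues" are the same object.
 */
#include <pthread.h>

void double_lock(pthread_mutex_t *a, pthread_mutex_t *b)
{
        if (a == b) {
                pthread_mutex_lock(a);          /* UP-like case: one lock */
        } else if (a < b) {
                pthread_mutex_lock(a);
                pthread_mutex_lock(b);
        } else {
                pthread_mutex_lock(b);
                pthread_mutex_lock(a);
        }
}

void double_unlock(pthread_mutex_t *a, pthread_mutex_t *b)
{
        pthread_mutex_unlock(a);
        if (a != b)
                pthread_mutex_unlock(b);
}

int main(void)
{
        pthread_mutex_t m1 = PTHREAD_MUTEX_INITIALIZER;
        pthread_mutex_t m2 = PTHREAD_MUTEX_INITIALIZER;

        double_lock(&m1, &m2);
        double_unlock(&m1, &m2);

        double_lock(&m1, &m1);                  /* same object, taken once */
        double_unlock(&m1, &m1);
        return 0;
}
```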
@@ -1880,7 +1908,7 @@ void account_system_vtime(struct task_struct *curr)
         */
        if (hardirq_count())
                __this_cpu_add(cpu_hardirq_time, delta);
-       else if (in_serving_softirq() && !(curr->flags & PF_KSOFTIRQD))
+       else if (in_serving_softirq() && curr != this_cpu_ksoftirqd())
                __this_cpu_add(cpu_softirq_time, delta);
 
        irq_time_write_end();
@@ -1920,8 +1948,40 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
                sched_rt_avg_update(rq, irq_delta);
 }
 
+static int irqtime_account_hi_update(void)
+{
+       struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
+       unsigned long flags;
+       u64 latest_ns;
+       int ret = 0;
+
+       local_irq_save(flags);
+       latest_ns = this_cpu_read(cpu_hardirq_time);
+       if (cputime64_gt(nsecs_to_cputime64(latest_ns), cpustat->irq))
+               ret = 1;
+       local_irq_restore(flags);
+       return ret;
+}
+
+static int irqtime_account_si_update(void)
+{
+       struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
+       unsigned long flags;
+       u64 latest_ns;
+       int ret = 0;
+
+       local_irq_save(flags);
+       latest_ns = this_cpu_read(cpu_softirq_time);
+       if (cputime64_gt(nsecs_to_cputime64(latest_ns), cpustat->softirq))
+               ret = 1;
+       local_irq_restore(flags);
+       return ret;
+}
+
 #else /* CONFIG_IRQ_TIME_ACCOUNTING */
 
+#define sched_clock_irqtime    (0)
+
 static void update_rq_clock_task(struct rq *rq, s64 delta)
 {
        rq->clock_task += delta;
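irqtime_account_hi_update() and irqtime_account_si_update() compare the per-cpu nanosecond accumulators against what cpustat already reports, so a tick is charged to irq or softirq time only when the fine-grained accumulator has pulled ahead. A simplified userspace model of that threshold test (the HZ value and field names below are assumptions):

```c
/*
 * Simplified model of the hi/si update test: the fine-grained
 * accumulator is in nanoseconds, the reported counter in ticks;
 * report one more tick only when the accumulator is ahead.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define NSEC_PER_TICK 10000000ULL       /* assume HZ=100 for the model */

struct irq_account {
        uint64_t accumulated_ns;        /* analogous to cpu_hardirq_time */
        uint64_t reported_ticks;        /* analogous to cpustat->irq     */
};

static bool should_account_tick(const struct irq_account *ia)
{
        return ia->accumulated_ns / NSEC_PER_TICK > ia->reported_ticks;
}

int main(void)
{
        struct irq_account ia = { .accumulated_ns = 25000000, .reported_ticks = 1 };

        if (should_account_tick(&ia)) {
                ia.reported_ticks++;    /* charge this tick to irq time */
                printf("tick charged to irq: %llu ticks reported\n",
                       (unsigned long long)ia.reported_ticks);
        }
        return 0;
}
```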
@@ -2025,14 +2085,14 @@ inline int task_curr(const struct task_struct *p)
 
 static inline void check_class_changed(struct rq *rq, struct task_struct *p,
                                       const struct sched_class *prev_class,
-                                      int oldprio, int running)
+                                      int oldprio)
 {
        if (prev_class != p->sched_class) {
                if (prev_class->switched_from)
-                       prev_class->switched_from(rq, p, running);
-               p->sched_class->switched_to(rq, p, running);
-       } else
-               p->sched_class->prio_changed(rq, p, oldprio, running);
+                       prev_class->switched_from(rq, p);
+               p->sched_class->switched_to(rq, p);
+       } else if (oldprio != p->prio)
+               p->sched_class->prio_changed(rq, p, oldprio);
 }
 
 static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
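check_class_changed() now takes only the old priority: the switched_from/switched_to hooks fire when the scheduling class really changed, and prio_changed fires only when the priority really changed. A compile-only skeleton of that call pattern, with hypothetical callback names:

```c
/*
 * Skeleton of the notification pattern: a scheduling "class" is a table
 * of callbacks, and only the hook that actually applies is invoked
 * after a change. Names are hypothetical, not the kernel's structures.
 */
struct task;

struct sched_class_ops {
        void (*switched_from)(struct task *t);  /* may be NULL */
        void (*switched_to)(struct task *t);
        void (*prio_changed)(struct task *t, int oldprio);
};

struct task {
        const struct sched_class_ops *class;
        int prio;
};

void check_class_changed(struct task *t,
                         const struct sched_class_ops *prev_class,
                         int oldprio)
{
        if (prev_class != t->class) {
                if (prev_class->switched_from)
                        prev_class->switched_from(t);
                t->class->switched_to(t);
        } else if (oldprio != t->prio) {
                t->class->prio_changed(t, oldprio);
        }
}
```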
@@ -2224,7 +2284,10 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
                 * yield - it could be a while.
                 */
                if (unlikely(on_rq)) {
-                       schedule_timeout_uninterruptible(1);
+                       ktime_t to = ktime_set(0, NSEC_PER_SEC/HZ);
+
+                       set_current_state(TASK_UNINTERRUPTIBLE);
+                       schedule_hrtimeout(&to, HRTIMER_MODE_REL);
                        continue;
                }
 
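wait_task_inactive() now sleeps for roughly one tick via a relative hrtimer-based timeout instead of schedule_timeout_uninterruptible(1), which can round the wait up by an extra jiffy. A userspace analogue using a relative high-resolution sleep (MODEL_HZ is an assumption made for the example):

```c
/*
 * Userspace analogue of the one-tick sleep: a relative high-resolution
 * sleep of 1/HZ seconds rather than a coarse "at least one jiffy"
 * timeout.
 */
#define _POSIX_C_SOURCE 200809L
#include <time.h>

#define MODEL_HZ 250

static void sleep_one_tick(void)
{
        struct timespec ts = {
                .tv_sec  = 0,
                .tv_nsec = 1000000000L / MODEL_HZ,
        };

        /* No TIMER_ABSTIME flag, so the request is relative. */
        clock_nanosleep(CLOCK_MONOTONIC, 0, &ts, NULL);
}

int main(void)
{
        sleep_one_tick();
        return 0;
}
```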
@@ -2265,27 +2328,6 @@ void kick_process(struct task_struct *p)
 EXPORT_SYMBOL_GPL(kick_process);
 #endif /* CONFIG_SMP */
 
-/**
- * task_oncpu_function_call - call a function on the cpu on which a task runs
- * @p:         the task to evaluate
- * @func:      the function to be called
- * @info:      the function call argument
- *
- * Calls the function @func when the task is currently running. This might
- * be on the current CPU, which just calls the function directly
- */
-void task_oncpu_function_call(struct task_struct *p,
-                             void (*func) (void *info), void *info)
-{
-       int cpu;
-
-       preempt_disable();
-       cpu = task_cpu(p);
-       if (task_curr(p))
-               smp_call_function_single(cpu, func, info, 1);
-       preempt_enable();
-}
-
 #ifdef CONFIG_SMP
 /*
  * ->cpus_allowed is protected by either TASK_WAKING or rq->lock held.
@@ -2566,6 +2608,7 @@ static void __sched_fork(struct task_struct *p)
        p->se.sum_exec_runtime          = 0;
        p->se.prev_sum_exec_runtime     = 0;
        p->se.nr_migrations             = 0;
+       p->se.vruntime                  = 0;
 
 #ifdef CONFIG_SCHEDSTATS
        memset(&p->se.statistics, 0, sizeof(p->se.statistics));
@@ -2776,9 +2819,12 @@ static inline void
 prepare_task_switch(struct rq *rq, struct task_struct *prev,
                    struct task_struct *next)
 {
+       sched_info_switch(prev, next);
+       perf_event_task_sched_out(prev, next);
        fire_sched_out_preempt_notifiers(prev, next);
        prepare_lock_switch(rq, next);
        prepare_arch_switch(next);
+       trace_sched_switch(prev, next);
 }
 
 /**
@@ -2911,7 +2957,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
        struct mm_struct *mm, *oldmm;
 
        prepare_task_switch(rq, prev, next);
-       trace_sched_switch(prev, next);
+
        mm = next->mm;
        oldmm = prev->active_mm;
        /*
@@ -3567,6 +3613,32 @@ static void account_guest_time(struct task_struct *p, cputime_t cputime,
        }
 }
 
+/*
+ * Account system cpu time to a process and desired cpustat field
+ * @p: the process that the cpu time gets accounted to
+ * @cputime: the cpu time spent in kernel space since the last update
+ * @cputime_scaled: cputime scaled by cpu frequency
+ * @target_cputime64: pointer to cpustat field that has to be updated
+ */
+static inline
+void __account_system_time(struct task_struct *p, cputime_t cputime,
+                       cputime_t cputime_scaled, cputime64_t *target_cputime64)
+{
+       cputime64_t tmp = cputime_to_cputime64(cputime);
+
+       /* Add system time to process. */
+       p->stime = cputime_add(p->stime, cputime);
+       p->stimescaled = cputime_add(p->stimescaled, cputime_scaled);
+       account_group_system_time(p, cputime);
+
+       /* Add system time to cpustat. */
+       *target_cputime64 = cputime64_add(*target_cputime64, tmp);
+       cpuacct_update_stats(p, CPUACCT_STAT_SYSTEM, cputime);
+
+       /* Account for system time used */
+       acct_update_integrals(p);
+}
+
 /*
  * Account system cpu time to a process.
  * @p: the process that the cpu time gets accounted to
@@ -3578,36 +3650,26 @@ void account_system_time(struct task_struct *p, int hardirq_offset,
                         cputime_t cputime, cputime_t cputime_scaled)
 {
        struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
-       cputime64_t tmp;
+       cputime64_t *target_cputime64;
 
        if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) {
                account_guest_time(p, cputime, cputime_scaled);
                return;
        }
 
-       /* Add system time to process. */
-       p->stime = cputime_add(p->stime, cputime);
-       p->stimescaled = cputime_add(p->stimescaled, cputime_scaled);
-       account_group_system_time(p, cputime);
-
-       /* Add system time to cpustat. */
-       tmp = cputime_to_cputime64(cputime);
        if (hardirq_count() - hardirq_offset)
-               cpustat->irq = cputime64_add(cpustat->irq, tmp);
+               target_cputime64 = &cpustat->irq;
        else if (in_serving_softirq())
-               cpustat->softirq = cputime64_add(cpustat->softirq, tmp);
+               target_cputime64 = &cpustat->softirq;
        else
-               cpustat->system = cputime64_add(cpustat->system, tmp);
-
-       cpuacct_update_stats(p, CPUACCT_STAT_SYSTEM, cputime);
+               target_cputime64 = &cpustat->system;
 
-       /* Account for system time used */
-       acct_update_integrals(p);
+       __account_system_time(p, cputime, cputime_scaled, target_cputime64);
 }
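The rewrite separates "which counter does this time belong to" from the bookkeeping itself: account_system_time() merely selects a pointer into cpustat, and __account_system_time() updates the task, the chosen per-cpu field, cpuacct, and the integrals in one place. A small userspace sketch of that pointer-to-field dispatch pattern (names invented):

```c
/*
 * Sketch of the refactoring: the caller decides only *which* counter a
 * slice of system time belongs to; one helper then updates the task and
 * the chosen per-cpu bucket.
 */
#include <stdbool.h>
#include <stdint.h>

struct cpu_stats {
        uint64_t irq;
        uint64_t softirq;
        uint64_t system;
};

struct task_stats {
        uint64_t stime;
};

void account_system_slice(struct task_stats *t, uint64_t delta,
                          uint64_t *target)
{
        t->stime += delta;      /* per-task system time */
        *target  += delta;      /* chosen per-cpu bucket */
}

void account_tick(struct task_stats *t, struct cpu_stats *cs,
                  uint64_t delta, bool in_hardirq, bool in_softirq)
{
        uint64_t *target;

        if (in_hardirq)
                target = &cs->irq;
        else if (in_softirq)
                target = &cs->softirq;
        else
                target = &cs->system;

        account_system_slice(t, delta, target);
}
```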
 
 /*
  * Account for involuntary wait time.
- * @steal: the cpu time spent in involuntary wait
+ * @cputime: the cpu time spent in involuntary wait
  */
 void account_steal_time(cputime_t cputime)
 {
@@ -3635,6 +3697,73 @@ void account_idle_time(cputime_t cputime)
 
 #ifndef CONFIG_VIRT_CPU_ACCOUNTING
 
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+/*
+ * Account a tick to a process and cpustat
+ * @p: the process that the cpu time gets accounted to
+ * @user_tick: is the tick from userspace
+ * @rq: the pointer to rq
+ *
+ * Tick demultiplexing follows the order
+ * - pending hardirq update
+ * - pending softirq update
+ * - user_time
+ * - idle_time
+ * - system time
+ *   - check for guest_time
+ *   - else account as system_time
+ *
+ * Check for hardirq is done both for system and user time as there is
+ * no timer going off while we are on hardirq and hence we may never get an
+ * opportunity to update it solely in system time.
+ * p->stime and friends are only updated on system time and not on irq
+ * softirq as those do not count in task exec_runtime any more.
+ */
+static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
+                                               struct rq *rq)
+{
+       cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
+       cputime64_t tmp = cputime_to_cputime64(cputime_one_jiffy);
+       struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
+
+       if (irqtime_account_hi_update()) {
+               cpustat->irq = cputime64_add(cpustat->irq, tmp);
+       } else if (irqtime_account_si_update()) {
+               cpustat->softirq = cputime64_add(cpustat->softirq, tmp);
+       } else if (this_cpu_ksoftirqd() == p) {
+               /*
+                * ksoftirqd time does not get accounted in cpu_softirq_time.
+                * So, we have to handle it separately here.
+                * Also, p->stime needs to be updated for ksoftirqd.
+                */
+               __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled,
+                                       &cpustat->softirq);
+       } else if (user_tick) {
+               account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
+       } else if (p == rq->idle) {
+               account_idle_time(cputime_one_jiffy);
+       } else if (p->flags & PF_VCPU) { /* System time or guest time */
+               account_guest_time(p, cputime_one_jiffy, one_jiffy_scaled);
+       } else {
+               __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled,
+                                       &cpustat->system);
+       }
+}
+
+static void irqtime_account_idle_ticks(int ticks)
+{
+       int i;
+       struct rq *rq = this_rq();
+
+       for (i = 0; i < ticks; i++)
+               irqtime_account_process_tick(current, 0, rq);
+}
+#else /* CONFIG_IRQ_TIME_ACCOUNTING */
+static void irqtime_account_idle_ticks(int ticks) {}
+static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
+                                               struct rq *rq) {}
+#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
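The comment block above spells out the order in which a tick is demultiplexed when fine-grained irq-time accounting is on: pending hardirq time, pending softirq time, ksoftirqd, user, idle, guest, then plain system time. A self-contained model of that priority order, with stand-in predicates instead of the kernel's checks:

```c
/*
 * Simplified model of the tick-demultiplexing order: each tick is
 * charged to exactly one bucket, and pending hard/soft irq time always
 * wins over user/idle/system. All predicate fields are stand-ins.
 */
#include <stdbool.h>
#include <stdio.h>

enum bucket { B_IRQ, B_SOFTIRQ, B_USER, B_IDLE, B_GUEST, B_SYSTEM };

struct tick_ctx {
        bool hardirq_time_pending;      /* cf. irqtime_account_hi_update() */
        bool softirq_time_pending;      /* cf. irqtime_account_si_update() */
        bool current_is_ksoftirqd;
        bool user_tick;
        bool current_is_idle;
        bool current_is_vcpu;           /* PF_VCPU: guest time */
};

static enum bucket classify_tick(const struct tick_ctx *c)
{
        if (c->hardirq_time_pending)
                return B_IRQ;
        if (c->softirq_time_pending)
                return B_SOFTIRQ;
        if (c->current_is_ksoftirqd)
                return B_SOFTIRQ;       /* charged as system time to ksoftirqd */
        if (c->user_tick)
                return B_USER;
        if (c->current_is_idle)
                return B_IDLE;
        if (c->current_is_vcpu)
                return B_GUEST;
        return B_SYSTEM;
}

int main(void)
{
        struct tick_ctx c = { .user_tick = true, .softirq_time_pending = true };

        /* Pending softirq time takes precedence over the user tick. */
        printf("bucket = %d\n", classify_tick(&c));     /* B_SOFTIRQ == 1 */
        return 0;
}
```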
+
 /*
  * Account a single tick of cpu time.
  * @p: the process that the cpu time gets accounted to
@@ -3645,6 +3774,11 @@ void account_process_tick(struct task_struct *p, int user_tick)
        cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
        struct rq *rq = this_rq();
 
+       if (sched_clock_irqtime) {
+               irqtime_account_process_tick(p, user_tick, rq);
+               return;
+       }
+
        if (user_tick)
                account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
        else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET))
@@ -3670,6 +3804,12 @@ void account_steal_ticks(unsigned long ticks)
  */
 void account_idle_ticks(unsigned long ticks)
 {
+
+       if (sched_clock_irqtime) {
+               irqtime_account_idle_ticks(ticks);
+               return;
+       }
+
        account_idle_time(jiffies_to_cputime(ticks));
 }
 
@@ -3945,9 +4085,6 @@ need_resched:
        rcu_note_context_switch(cpu);
        prev = rq->curr;
 
-       release_kernel_lock(prev);
-need_resched_nonpreemptible:
-
        schedule_debug(prev);
 
        if (sched_feat(HRTICK))
@@ -3978,6 +4115,16 @@ need_resched_nonpreemptible:
                switch_count = &prev->nvcsw;
        }
 
+       /*
+        * If we are going to sleep and we have plugged IO queued, make
+        * sure to submit it to avoid deadlocks.
+        */
+       if (prev->state != TASK_RUNNING && blk_needs_flush_plug(prev)) {
+               raw_spin_unlock(&rq->lock);
+               blk_flush_plug(prev);
+               raw_spin_lock(&rq->lock);
+       }
+
        pre_schedule(rq, prev);
 
        if (unlikely(!rq->nr_running))
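The added hunk drops the runqueue lock to flush any plugged (batched) block I/O before the task sleeps, since a sleeping task could otherwise wait on I/O that only it would have submitted. A loose userspace analogue of "flush your local batch before blocking" (structure and names are illustrative, not the block layer's plug API):

```c
/*
 * Userspace analogue of flushing a plug before sleeping: work is
 * batched locally for efficiency, but must be submitted before the
 * thread blocks, or the consumer it is about to wait on may never
 * see it.
 */
#include <stdio.h>
#include <string.h>

struct plug {
        char buf[256];
        size_t used;
};

static void plugged_write(struct plug *p, const char *s)
{
        size_t len = strlen(s);

        if (p->used + len <= sizeof(p->buf)) {
                memcpy(p->buf + p->used, s, len);       /* just queue it */
                p->used += len;
        }
}

static void flush_plug(struct plug *p, FILE *out)
{
        fwrite(p->buf, 1, p->used, out);
        fflush(out);
        p->used = 0;
}

int main(void)
{
        struct plug p = { .used = 0 };

        plugged_write(&p, "queued work\n");
        /* About to block (e.g. wait for a reply): submit queued work first. */
        flush_plug(&p, stdout);
        return 0;
}
```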
@@ -3989,9 +4136,6 @@ need_resched_nonpreemptible:
        rq->skip_clock_update = 0;
 
        if (likely(prev != next)) {
-               sched_info_switch(prev, next);
-               perf_event_task_sched_out(prev, next);
-
                rq->nr_switches++;
                rq->curr = next;
                ++*switch_count;
@@ -4010,9 +4154,6 @@ need_resched_nonpreemptible:
 
        post_schedule(rq);
 
-       if (unlikely(reacquire_kernel_lock(prev)))
-               goto need_resched_nonpreemptible;
-
        preempt_enable_no_resched();
        if (need_resched())
                goto need_resched;
@@ -4571,11 +4712,10 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
 
        if (running)
                p->sched_class->set_curr_task(rq);
-       if (on_rq) {
+       if (on_rq)
                enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0);
 
-               check_class_changed(rq, p, prev_class, oldprio, running);
-       }
+       check_class_changed(rq, p, prev_class, oldprio);
        task_rq_unlock(rq, &flags);
 }
 
@@ -4762,8 +4902,11 @@ static bool check_same_owner(struct task_struct *p)
 
        rcu_read_lock();
        pcred = __task_cred(p);
-       match = (cred->euid == pcred->euid ||
-                cred->euid == pcred->uid);
+       if (cred->user->user_ns == pcred->user->user_ns)
+               match = (cred->euid == pcred->euid ||
+                        cred->euid == pcred->uid);
+       else
+               match = false;
        rcu_read_unlock();
        return match;
 }
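check_same_owner() becomes user-namespace aware: euid/uid comparisons are only meaningful when both credentials belong to the same user namespace; otherwise the tasks are treated as differently owned. A minimal model of that check (types are stand-ins):

```c
/*
 * Minimal model of the namespace-aware ownership check: uid values are
 * only comparable within one user namespace, so two creds match only if
 * they live in the same namespace *and* the ids agree.
 */
#include <stdbool.h>

struct user_ns;                         /* opaque namespace identity */

struct cred {
        const struct user_ns *user_ns;
        unsigned int uid;
        unsigned int euid;
};

bool same_owner(const struct cred *me, const struct cred *other)
{
        if (me->user_ns != other->user_ns)
                return false;
        return me->euid == other->euid || me->euid == other->uid;
}
```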
@@ -4823,12 +4966,15 @@ recheck:
                            param->sched_priority > rlim_rtprio)
                                return -EPERM;
                }
+
                /*
-                * Like positive nice levels, dont allow tasks to
-                * move out of SCHED_IDLE either:
+                * Treat SCHED_IDLE as nice 20. Only allow a switch to
+                * SCHED_NORMAL if the RLIMIT_NICE would normally permit it.
                 */
-               if (p->policy == SCHED_IDLE && policy != SCHED_IDLE)
-                       return -EPERM;
+               if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) {
+                       if (!can_nice(p, TASK_NICE(p)))
+                               return -EPERM;
+               }
 
                /* can't change other user's priorities */
                if (!check_same_owner(p))
@@ -4903,11 +5049,10 @@ recheck:
 
        if (running)
                p->sched_class->set_curr_task(rq);
-       if (on_rq) {
+       if (on_rq)
                activate_task(rq, p, 0);
 
-               check_class_changed(rq, p, prev_class, oldprio, running);
-       }
+       check_class_changed(rq, p, prev_class, oldprio);
        __task_rq_unlock(rq);
        raw_spin_unlock_irqrestore(&p->pi_lock, flags);
 
@@ -5089,7 +5234,7 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
                goto out_free_cpus_allowed;
        }
        retval = -EPERM;
-       if (!check_same_owner(p) && !capable(CAP_SYS_NICE))
+       if (!check_same_owner(p) && !task_ns_capable(p, CAP_SYS_NICE))
                goto out_unlock;
 
        retval = security_task_setscheduler(p);
@@ -5324,6 +5469,65 @@ void __sched yield(void)
 }
 EXPORT_SYMBOL(yield);
 
+/**
+ * yield_to - yield the current processor to another thread in
+ * your thread group, or accelerate that thread toward the
+ * processor it's on.
+ *
+ * It's the caller's job to ensure that the target task struct
+ * can't go away on us before we can do any checks.
+ *
+ * Returns true if we indeed boosted the target task.
+ */
+bool __sched yield_to(struct task_struct *p, bool preempt)
+{
+       struct task_struct *curr = current;
+       struct rq *rq, *p_rq;
+       unsigned long flags;
+       bool yielded = 0;
+
+       local_irq_save(flags);
+       rq = this_rq();
+
+again:
+       p_rq = task_rq(p);
+       double_rq_lock(rq, p_rq);
+       while (task_rq(p) != p_rq) {
+               double_rq_unlock(rq, p_rq);
+               goto again;
+       }
+
+       if (!curr->sched_class->yield_to_task)
+               goto out;
+
+       if (curr->sched_class != p->sched_class)
+               goto out;
+
+       if (task_running(p_rq, p) || p->state)
+               goto out;
+
+       yielded = curr->sched_class->yield_to_task(rq, p, preempt);
+       if (yielded) {
+               schedstat_inc(rq, yld_count);
+               /*
+                * Make p's CPU reschedule; pick_next_entity takes care of
+                * fairness.
+                */
+               if (preempt && rq != p_rq)
+                       resched_task(p_rq->curr);
+       }
+
+out:
+       double_rq_unlock(rq, p_rq);
+       local_irq_restore(flags);
+
+       if (yielded)
+               schedule();
+
+       return yielded;
+}
+EXPORT_SYMBOL_GPL(yield_to);
+
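yield_to() hands the CPU to a chosen runnable task, or nudges the CPU that task is on to reschedule; because the target's runqueue can change between looking it up and locking it, the function loops at the "again:" label until the lock it holds still matches task_rq(p). A userspace sketch of that lock-and-revalidate pattern using C11 atomics and a per-queue mutex (all names invented):

```c
/*
 * Sketch of the lock-and-revalidate loop: the object a task "belongs
 * to" may change under us, so after locking the queue we re-check the
 * association and retry on a mismatch.
 */
#include <pthread.h>
#include <stdatomic.h>

struct queue {
        pthread_mutex_t lock;
};

struct task {
        _Atomic(struct queue *) queue;  /* may be moved by other threads */
};

struct queue *lock_task_queue(struct task *t)
{
        for (;;) {
                struct queue *q = atomic_load(&t->queue);

                pthread_mutex_lock(&q->lock);
                if (atomic_load(&t->queue) == q)
                        return q;               /* still on the locked queue */
                pthread_mutex_unlock(&q->lock); /* it migrated: try again */
        }
}
```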
 /*
  * This task is about to go to sleep on IO. Increment rq->nr_iowait so
  * that process accounting knows that this is a task in IO wait state.
@@ -5334,6 +5538,7 @@ void __sched io_schedule(void)
 
        delayacct_blkio_start();
        atomic_inc(&rq->nr_iowait);
+       blk_flush_plug(current);
        current->in_iowait = 1;
        schedule();
        current->in_iowait = 0;
@@ -5349,6 +5554,7 @@ long __sched io_schedule_timeout(long timeout)
 
        delayacct_blkio_start();
        atomic_inc(&rq->nr_iowait);
+       blk_flush_plug(current);
        current->in_iowait = 1;
        ret = schedule_timeout(timeout);
        current->in_iowait = 0;
@@ -5572,7 +5778,7 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
         * The idle tasks have their own, simple scheduling class:
         */
        idle->sched_class = &idle_sched_class;
-       ftrace_graph_init_task(idle);
+       ftrace_graph_init_idle_task(idle, cpu);
 }
 
 /*
@@ -7797,6 +8003,10 @@ static void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq)
        INIT_LIST_HEAD(&cfs_rq->tasks);
 #ifdef CONFIG_FAIR_GROUP_SCHED
        cfs_rq->rq = rq;
+       /* allow initial update_cfs_load() to truncate */
+#ifdef CONFIG_SMP
+       cfs_rq->load_stamp = 1;
+#endif
 #endif
        cfs_rq->min_vruntime = (u64)(-(1LL << 20));
 }
@@ -8075,7 +8285,7 @@ static inline int preempt_count_equals(int preempt_offset)
 {
        int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth();
 
-       return (nested == PREEMPT_INATOMIC_BASE + preempt_offset);
+       return (nested == preempt_offset);
 }
 
 void __might_sleep(const char *file, int line, int preempt_offset)
@@ -8110,6 +8320,8 @@ EXPORT_SYMBOL(__might_sleep);
 #ifdef CONFIG_MAGIC_SYSRQ
 static void normalize_task(struct rq *rq, struct task_struct *p)
 {
+       const struct sched_class *prev_class = p->sched_class;
+       int old_prio = p->prio;
        int on_rq;
 
        on_rq = p->se.on_rq;
@@ -8120,6 +8332,8 @@ static void normalize_task(struct rq *rq, struct task_struct *p)
                activate_task(rq, p, 0);
                resched_task(rq->curr);
        }
+
+       check_class_changed(rq, p, prev_class, old_prio);
 }
 
 void normalize_rt_tasks(void)
@@ -8511,7 +8725,7 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
                /* Propagate contribution to hierarchy */
                raw_spin_lock_irqsave(&rq->lock, flags);
                for_each_sched_entity(se)
-                       update_cfs_shares(group_cfs_rq(se), 0);
+                       update_cfs_shares(group_cfs_rq(se));
                raw_spin_unlock_irqrestore(&rq->lock, flags);
        }
 
@@ -8885,7 +9099,8 @@ cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
 }
 
 static void
-cpu_cgroup_exit(struct cgroup_subsys *ss, struct task_struct *task)
+cpu_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp,
+               struct cgroup *old_cgrp, struct task_struct *task)
 {
        /*
         * cgroup_exit() is called in the copy_process() failure path.