Pull sony into release branch

[pandora-kernel.git] / arch / x86_64 / kernel / smpboot.c
diff --git a/arch/x86_64/kernel/smpboot.c b/arch/x86_64/kernel/smpboot.c

index 9753802..3544372 100644 (file)
--- a/arch/x86_64/kernel/smpboot.c
+++ b/arch/x86_64/kernel/smpboot.c
@@ -46,9 +46,10 @@
  #include <linux/bootmem.h>
  #include <linux/thread_info.h>
  #include <linux/module.h>
-
  #include <linux/delay.h>
  #include <linux/mc146818rtc.h>
+#include <linux/smp.h>
+
  #include <asm/mtrr.h>
  #include <asm/pgalloc.h>
  #include <asm/desc.h>
@@ -59,6 +60,7 @@
  #include <asm/irq.h>
  #include <asm/hw_irq.h>
  #include <asm/numa.h>
+#include <asm/genapic.h>
  
  /* Number of siblings per CPU package */
  int smp_num_siblings = 1;
@@ -146,217 +148,6 @@ static void __cpuinit smp_store_cpu_info(int id)
         print_cpu_info(c);
  }
  
-/*
- * New Funky TSC sync algorithm borrowed from IA64.
- * Main advantage is that it doesn't reset the TSCs fully and
- * in general looks more robust and it works better than my earlier
- * attempts. I believe it was written by David Mosberger. Some minor
- * adjustments for x86-64 by me -AK
- *
- * Original comment reproduced below.
- *
- * Synchronize TSC of the current (slave) CPU with the TSC of the
- * MASTER CPU (normally the time-keeper CPU).  We use a closed loop to
- * eliminate the possibility of unaccounted-for errors (such as
- * getting a machine check in the middle of a calibration step).  The
- * basic idea is for the slave to ask the master what itc value it has
- * and to read its own itc before and after the master responds.  Each
- * iteration gives us three timestamps:
- *
- *     slave           master
- *
- *     t0 ---\
- *             ---\
- *                --->
- *                     tm
- *                /---
- *            /---
- *     t1 <---
- *
- *
- * The goal is to adjust the slave's TSC such that tm falls exactly
- * half-way between t0 and t1.  If we achieve this, the clocks are
- * synchronized provided the interconnect between the slave and the
- * master is symmetric.  Even if the interconnect were asymmetric, we
- * would still know that the synchronization error is smaller than the
- * roundtrip latency (t0 - t1).
- *
- * When the interconnect is quiet and symmetric, this lets us
- * synchronize the TSC to within one or two cycles.  However, we can
- * only *guarantee* that the synchronization is accurate to within a
- * round-trip time, which is typically in the range of several hundred
- * cycles (e.g., ~500 cycles).  In practice, this means that the TSCs
- * are usually almost perfectly synchronized, but we shouldn't assume
- * that the accuracy is much better than half a micro second or so.
- *
- * [there are other errors like the latency of RDTSC and of the
- * WRMSR. These can also account to hundreds of cycles. So it's
- * probably worse. It claims 153 cycles error on a dual Opteron,
- * but I suspect the numbers are actually somewhat worse -AK]
- */
-
-#define MASTER 0
-#define SLAVE  (SMP_CACHE_BYTES/8)
-
-/* Intentionally don't use cpu_relax() while TSC synchronization
-   because we don't want to go into funky power save modi or cause
-   hypervisors to schedule us away.  Going to sleep would likely affect
-   latency and low latency is the primary objective here. -AK */
-#define no_cpu_relax() barrier()
-
-static __cpuinitdata DEFINE_SPINLOCK(tsc_sync_lock);
-static volatile __cpuinitdata unsigned long go[SLAVE + 1];
-static int notscsync __cpuinitdata;
-
-#undef DEBUG_TSC_SYNC
-
-#define NUM_ROUNDS     64      /* magic value */
-#define NUM_ITERS      5       /* likewise */
-
-/* Callback on boot CPU */
-static __cpuinit void sync_master(void *arg)
-{
-       unsigned long flags, i;
-
-       go[MASTER] = 0;
-
-       local_irq_save(flags);
-       {
-               for (i = 0; i < NUM_ROUNDS*NUM_ITERS; ++i) {
-                       while (!go[MASTER])
-                               no_cpu_relax();
-                       go[MASTER] = 0;
-                       rdtscll(go[SLAVE]);
-               }
-       }
-       local_irq_restore(flags);
-}
-
-/*
- * Return the number of cycles by which our tsc differs from the tsc
- * on the master (time-keeper) CPU.  A positive number indicates our
- * tsc is ahead of the master, negative that it is behind.
- */
-static inline long
-get_delta(long *rt, long *master)
-{
-       unsigned long best_t0 = 0, best_t1 = ~0UL, best_tm = 0;
-       unsigned long tcenter, t0, t1, tm;
-       int i;
-
-       for (i = 0; i < NUM_ITERS; ++i) {
-               rdtscll(t0);
-               go[MASTER] = 1;
-               while (!(tm = go[SLAVE]))
-                       no_cpu_relax();
-               go[SLAVE] = 0;
-               rdtscll(t1);
-
-               if (t1 - t0 < best_t1 - best_t0)
-                       best_t0 = t0, best_t1 = t1, best_tm = tm;
-       }
-
-       *rt = best_t1 - best_t0;
-       *master = best_tm - best_t0;
-
-       /* average best_t0 and best_t1 without overflow: */
-       tcenter = (best_t0/2 + best_t1/2);
-       if (best_t0 % 2 + best_t1 % 2 == 2)
-               ++tcenter;
-       return tcenter - best_tm;
-}
-
-static __cpuinit void sync_tsc(unsigned int master)
-{
-       int i, done = 0;
-       long delta, adj, adjust_latency = 0;
-       unsigned long flags, rt, master_time_stamp, bound;
-#ifdef DEBUG_TSC_SYNC
-       static struct syncdebug {
-               long rt;        /* roundtrip time */
-               long master;    /* master's timestamp */
-               long diff;      /* difference between midpoint and master's timestamp */
-               long lat;       /* estimate of tsc adjustment latency */
-       } t[NUM_ROUNDS] __cpuinitdata;
-#endif
-
-       printk(KERN_INFO "CPU %d: Syncing TSC to CPU %u.\n",
-               smp_processor_id(), master);
-
-       go[MASTER] = 1;
-
-       /* It is dangerous to broadcast IPI as cpus are coming up,
-        * as they may not be ready to accept them.  So since
-        * we only need to send the ipi to the boot cpu direct
-        * the message, and avoid the race.
-        */
-       smp_call_function_single(master, sync_master, NULL, 1, 0);
-
-       while (go[MASTER])      /* wait for master to be ready */
-               no_cpu_relax();
-
-       spin_lock_irqsave(&tsc_sync_lock, flags);
-       {
-               for (i = 0; i < NUM_ROUNDS; ++i) {
-                       delta = get_delta(&rt, &master_time_stamp);
-                       if (delta == 0) {
-                               done = 1;       /* let's lock on to this... */
-                               bound = rt;
-                       }
-
-                       if (!done) {
-                               unsigned long t;
-                               if (i > 0) {
-                                       adjust_latency += -delta;
-                                       adj = -delta + adjust_latency/4;
-                               } else
-                                       adj = -delta;
-
-                               rdtscll(t);
-                               wrmsrl(MSR_IA32_TSC, t + adj);
-                       }
-#ifdef DEBUG_TSC_SYNC
-                       t[i].rt = rt;
-                       t[i].master = master_time_stamp;
-                       t[i].diff = delta;
-                       t[i].lat = adjust_latency/4;
-#endif
-               }
-       }
-       spin_unlock_irqrestore(&tsc_sync_lock, flags);
-
-#ifdef DEBUG_TSC_SYNC
-       for (i = 0; i < NUM_ROUNDS; ++i)
-               printk("rt=%5ld master=%5ld diff=%5ld adjlat=%5ld\n",
-                      t[i].rt, t[i].master, t[i].diff, t[i].lat);
-#endif
-
-       printk(KERN_INFO
-              "CPU %d: synchronized TSC with CPU %u (last diff %ld cycles, "
-              "maxerr %lu cycles)\n",
-              smp_processor_id(), master, delta, rt);
-}
-
-static void __cpuinit tsc_sync_wait(void)
-{
-       /*
-        * When the CPU has synchronized TSCs assume the BIOS
-        * or the hardware already synced.  Otherwise we could
-        * mess up a possible perfect synchronization with a
-        * not-quite-perfect algorithm.
-        */
-       if (notscsync || !cpu_has_tsc || !unsynchronized_tsc())
-               return;
-       sync_tsc(0);
-}
-
-static __init int notscsync_setup(char *s)
-{
-       notscsync = 1;
-       return 1;
-}
-__setup("notscsync", notscsync_setup);
-
  static atomic_t init_deasserted __cpuinitdata;
  
  /*
@@ -544,6 +335,11 @@ void __cpuinit start_secondary(void)
         /* otherwise gcc will move up the smp_processor_id before the cpu_init */
         barrier();
  
+       /*
+        * Check TSC sync first:
+        */
+       check_tsc_sync_target();
+
         Dprintk("cpu %d: setting up apic clock\n", smp_processor_id());         
         setup_secondary_APIC_clock();
  
@@ -563,14 +359,6 @@ void __cpuinit start_secondary(void)
          */
         set_cpu_sibling_map(smp_processor_id());
  
-       /* 
-        * Wait for TSC sync to not schedule things before.
-        * We still process interrupts, which could see an inconsistent
-        * time in that window unfortunately. 
-        * Do this here because TSC sync has global unprotected state.
-        */
-       tsc_sync_wait();
-
         /*
          * We need to hold call_lock, so there is no inconsistency
          * between the time smp_call_function() determines number of
@@ -580,12 +368,17 @@ void __cpuinit start_secondary(void)
          * smp_call_function().
          */
         lock_ipi_call_lock();
+       spin_lock(&vector_lock);
  
+       /* Setup the per cpu irq handling data structures */
+       __setup_vector_irq(smp_processor_id());
         /*
          * Allow the master to continue.
          */
         cpu_set(smp_processor_id(), cpu_online_map);
         per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE;
+       spin_unlock(&vector_lock);
+
         unlock_ipi_call_lock();
  
         cpu_idle();
@@ -748,14 +541,16 @@ static int __cpuinit wakeup_secondary_via_INIT(int phys_apicid, unsigned int sta
  }
  
  struct create_idle {
+       struct work_struct work;
         struct task_struct *idle;
         struct completion done;
         int cpu;
  };
  
-void do_fork_idle(void *_c_idle)
+void do_fork_idle(struct work_struct *work)
  {
-       struct create_idle *c_idle = _c_idle;
+       struct create_idle *c_idle =
+               container_of(work, struct create_idle, work);
  
         c_idle->idle = fork_idle(c_idle->cpu);
         complete(&c_idle->done);
@@ -770,10 +565,10 @@ static int __cpuinit do_boot_cpu(int cpu, int apicid)
         int timeout;
         unsigned long start_rip;
         struct create_idle c_idle = {
+               .work = __WORK_INITIALIZER(c_idle.work, do_fork_idle),
                 .cpu = cpu,
                 .done = COMPLETION_INITIALIZER_ONSTACK(c_idle.done),
         };
-       DECLARE_WORK(work, do_fork_idle, &c_idle);
  
         /* allocate memory for gdts of secondary cpus. Hotplug is considered */
         if (!cpu_gdt_descr[cpu].address &&
@@ -798,7 +593,6 @@ static int __cpuinit do_boot_cpu(int cpu, int apicid)
                                 cpu, node);
         }
  
-
         alternatives_smp_switch(1);
  
         c_idle.idle = get_idle_for_cpu(cpu);
@@ -821,9 +615,9 @@ static int __cpuinit do_boot_cpu(int cpu, int apicid)
          * thread.
          */
         if (!keventd_up() || current_is_keventd())
-               work.func(work.data);
+               c_idle.work.func(&c_idle.work);
         else {
-               schedule_work(&work);
+               schedule_work(&c_idle.work);
                 wait_for_completion(&c_idle.done);
         }
  
@@ -1090,7 +884,6 @@ void __init smp_prepare_cpus(unsigned int max_cpus)
         /*
          * Switch from PIC to APIC mode.
          */
-       connect_bsp_APIC();
         setup_local_APIC();
  
         if (GET_APIC_ID(apic_read(APIC_ID)) != boot_cpu_id) {
@@ -1162,8 +955,20 @@ int __cpuinit __cpu_up(unsigned int cpu)
         /* Unleash the CPU! */
         Dprintk("waiting for cpu %d\n", cpu);
  
+       /*
+        * Make sure and check TSC sync:
+        */
+       check_tsc_sync_source(cpu);
+
         while (!cpu_isset(cpu, cpu_online_map))
                 cpu_relax();
+
+       if (num_online_cpus() > 8 && genapic == &apic_flat) {
+               printk(KERN_WARNING
+                      "flat APIC routing can't be used with > 8 cpus\n");
+               BUG();
+       }
+
         err = 0;
  
         return err;
@@ -1175,11 +980,7 @@ int __cpuinit __cpu_up(unsigned int cpu)
  void __init smp_cpus_done(unsigned int max_cpus)
  {
         smp_cleanup_boot();
-
-#ifdef CONFIG_X86_IO_APIC
         setup_ioapic_dest();
-#endif
-
         check_nmi_watchdog();
  }
  
@@ -1233,6 +1034,8 @@ int __cpu_disable(void)
         if (cpu == 0)
                 return -EBUSY;
  
+       if (nmi_watchdog == NMI_LOCAL_APIC)
+               stop_apic_nmi_watchdog(NULL);
         clear_local_APIC();
  
         /*
@@ -1247,8 +1050,10 @@ int __cpu_disable(void)
         local_irq_disable();
         remove_siblinginfo(cpu);
  
+       spin_lock(&vector_lock);
         /* It's now safe to remove this processor from the online map */
         cpu_clear(cpu, cpu_online_map);
+       spin_unlock(&vector_lock);
         remove_cpu_from_maps();
         fixup_irqs(cpu_online_map);
         return 0;
@@ -1272,11 +1077,11 @@ void __cpu_die(unsigned int cpu)
         printk(KERN_ERR "CPU %u didn't die...\n", cpu);
  }
  
-__init int setup_additional_cpus(char *s)
+static __init int setup_additional_cpus(char *s)
  {
-       return get_option(&s, &additional_cpus);
+       return s && get_option(&s, &additional_cpus) ? 0 : -EINVAL;
  }
-__setup("additional_cpus=", setup_additional_cpus);
+early_param("additional_cpus", setup_additional_cpus);
  
  #else /* ... !CONFIG_HOTPLUG_CPU */