Merge branch 'cpus4096-for-linus-2' of git://git.kernel.org/pub/scm/linux/kernel...
Linus Torvalds [Fri, 2 Jan 2009 19:44:09 +0000 (11:44 -0800)]
* 'cpus4096-for-linus-2' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip: (66 commits)
  x86: export vector_used_by_percpu_irq
  x86: use logical apicid in x2apic_cluster's x2apic_cpu_mask_to_apicid_and()
  sched: nominate preferred wakeup cpu, fix
  x86: fix lguest used_vectors breakage, -v2
  x86: fix warning in arch/x86/kernel/io_apic.c
  sched: fix warning in kernel/sched.c
  sched: move test_sd_parent() to an SMP section of sched.h
  sched: add SD_BALANCE_NEWIDLE at MC and CPU level for sched_mc>0
  sched: activate active load balancing in new idle cpus
  sched: bias task wakeups to preferred semi-idle packages
  sched: nominate preferred wakeup cpu
  sched: favour lower logical cpu number for sched_mc balance
  sched: framework for sched_mc/smt_power_savings=N
  sched: convert BALANCE_FOR_xx_POWER to inline functions
  x86: use possible_cpus=NUM to extend the possible cpus allowed
  x86: fix cpu_mask_to_apicid_and to include cpu_online_mask
  x86: update io_apic.c to the new cpumask code
  x86: Introduce topology_core_cpumask()/topology_thread_cpumask()
  x86: xen: use smp_call_function_many()
  x86: use work_on_cpu in x86/kernel/cpu/mcheck/mce_amd_64.c
  ...

Fixed up trivial conflict in kernel/time/tick-sched.c manually

55 files changed:
1  2 
arch/arm/kernel/smp.c
arch/arm/mach-at91/at91rm9200_time.c
arch/arm/mach-pxa/time.c
arch/arm/mach-realview/core.c
arch/arm/mach-realview/localtimer.c
arch/arm/mach-sa1100/time.c
arch/arm/mach-versatile/core.c
arch/powerpc/kernel/smp.c
arch/powerpc/kernel/time.c
arch/powerpc/platforms/pseries/xics.c
arch/powerpc/sysdev/mpic.c
arch/s390/Kconfig
arch/s390/kernel/smp.c
arch/s390/kernel/time.c
arch/sparc/kernel/irq_64.c
arch/sparc/kernel/of_device_64.c
arch/sparc/kernel/pci_msi.c
arch/sparc/kernel/smp_32.c
arch/sparc/kernel/smp_64.c
arch/sparc/kernel/sparc_ksyms_32.c
arch/sparc/kernel/time_64.c
arch/x86/Kconfig
arch/x86/include/asm/irq.h
arch/x86/kernel/apic.c
arch/x86/kernel/cpu/intel_cacheinfo.c
arch/x86/kernel/cpu/mcheck/mce_amd_64.c
arch/x86/kernel/genx2apic_uv_x.c
arch/x86/kernel/hpet.c
arch/x86/kernel/io_apic.c
arch/x86/kernel/irq_64.c
arch/x86/kernel/irqinit_32.c
arch/x86/kernel/irqinit_64.c
arch/x86/kernel/reboot.c
arch/x86/kernel/smp.c
arch/x86/kernel/smpboot.c
arch/x86/kernel/tlb_32.c
arch/x86/kernel/tlb_64.c
arch/x86/kernel/traps.c
arch/x86/lguest/boot.c
arch/x86/xen/mmu.c
drivers/xen/events.c
include/linux/interrupt.h
include/linux/irq.h
include/linux/sched.h
init/Kconfig
kernel/irq/chip.c
kernel/irq/manage.c
kernel/sched.c
kernel/sched_fair.c
kernel/sched_rt.c
kernel/sched_stats.h
kernel/time/tick-sched.c
kernel/trace/trace.c
lib/Kconfig
mm/slub.c

Simple merge
Simple merge
Simple merge
Simple merge
Simple merge
Simple merge
Simple merge
  #define DBG(fmt...)
  #endif
  
 -int smp_hw_index[NR_CPUS];
  struct thread_info *secondary_ti;
  
- cpumask_t cpu_possible_map = CPU_MASK_NONE;
- cpumask_t cpu_online_map = CPU_MASK_NONE;
  DEFINE_PER_CPU(cpumask_t, cpu_sibling_map) = CPU_MASK_NONE;
  DEFINE_PER_CPU(cpumask_t, cpu_core_map) = CPU_MASK_NONE;
  
Simple merge
Simple merge
Simple merge
Simple merge
Simple merge
Simple merge
index a3ea2bc,0000000..cab8e02
mode 100644,000000..100644
--- /dev/null
@@@ -1,1101 -1,0 +1,1104 @@@
 +/* irq.c: UltraSparc IRQ handling/init/registry.
 + *
 + * Copyright (C) 1997, 2007, 2008 David S. Miller (davem@davemloft.net)
 + * Copyright (C) 1998  Eddie C. Dost    (ecd@skynet.be)
 + * Copyright (C) 1998  Jakub Jelinek    (jj@ultra.linux.cz)
 + */
 +
 +#include <linux/module.h>
 +#include <linux/sched.h>
 +#include <linux/linkage.h>
 +#include <linux/ptrace.h>
 +#include <linux/errno.h>
 +#include <linux/kernel_stat.h>
 +#include <linux/signal.h>
 +#include <linux/mm.h>
 +#include <linux/interrupt.h>
 +#include <linux/slab.h>
 +#include <linux/random.h>
 +#include <linux/init.h>
 +#include <linux/delay.h>
 +#include <linux/proc_fs.h>
 +#include <linux/seq_file.h>
 +#include <linux/bootmem.h>
 +#include <linux/irq.h>
 +
 +#include <asm/ptrace.h>
 +#include <asm/processor.h>
 +#include <asm/atomic.h>
 +#include <asm/system.h>
 +#include <asm/irq.h>
 +#include <asm/io.h>
 +#include <asm/iommu.h>
 +#include <asm/upa.h>
 +#include <asm/oplib.h>
 +#include <asm/prom.h>
 +#include <asm/timer.h>
 +#include <asm/smp.h>
 +#include <asm/starfire.h>
 +#include <asm/uaccess.h>
 +#include <asm/cache.h>
 +#include <asm/cpudata.h>
 +#include <asm/auxio.h>
 +#include <asm/head.h>
 +#include <asm/hypervisor.h>
 +#include <asm/cacheflush.h>
 +
 +#include "entry.h"
 +
 +#define NUM_IVECS     (IMAP_INR + 1)
 +
 +struct ino_bucket *ivector_table;
 +unsigned long ivector_table_pa;
 +
 +/* On several sun4u processors, it is illegal to mix bypass and
 + * non-bypass accesses.  Therefore we access all INO buckets
 + * using bypass accesses only.
 + *
 + * bucket_get_chain_pa() reads the __irq_chain_pa field of the
 + * struct ino_bucket located at physical address bucket_pa, using
 + * an ldxa through ASI_PHYS_USE_EC, and returns the loaded value.
 + */
 +static unsigned long bucket_get_chain_pa(unsigned long bucket_pa)
 +{
 +      unsigned long ret;
 +
 +      __asm__ __volatile__("ldxa      [%1] %2, %0"
 +                           : "=&r" (ret)
 +                           : "r" (bucket_pa +
 +                                  offsetof(struct ino_bucket,
 +                                           __irq_chain_pa)),
 +                             "i" (ASI_PHYS_USE_EC));
 +
 +      return ret;
 +}
 +/* Zero the __irq_chain_pa field of the bucket at physical address
 + * bucket_pa by storing %g0 with stxa through ASI_PHYS_USE_EC.
 + */
 +static void bucket_clear_chain_pa(unsigned long bucket_pa)
 +{
 +      __asm__ __volatile__("stxa      %%g0, [%0] %1"
 +                           : /* no outputs */
 +                           : "r" (bucket_pa +
 +                                  offsetof(struct ino_bucket,
 +                                           __irq_chain_pa)),
 +                             "i" (ASI_PHYS_USE_EC));
 +}
 +/* Return the __virt_irq field of the bucket at physical address
 + * bucket_pa, loaded with lduwa through ASI_PHYS_USE_EC.
 + */
 +static unsigned int bucket_get_virt_irq(unsigned long bucket_pa)
 +{
 +      unsigned int ret;
 +
 +      __asm__ __volatile__("lduwa     [%1] %2, %0"
 +                           : "=&r" (ret)
 +                           : "r" (bucket_pa +
 +                                  offsetof(struct ino_bucket,
 +                                           __virt_irq)),
 +                             "i" (ASI_PHYS_USE_EC));
 +
 +      return ret;
 +}
 +/* Store virt_irq into the __virt_irq field of the bucket at physical
 + * address bucket_pa, using stwa through ASI_PHYS_USE_EC.
 + */
 +static void bucket_set_virt_irq(unsigned long bucket_pa,
 +                              unsigned int virt_irq)
 +{
 +      __asm__ __volatile__("stwa      %0, [%1] %2"
 +                           : /* no outputs */
 +                           : "r" (virt_irq),
 +                             "r" (bucket_pa +
 +                                  offsetof(struct ino_bucket,
 +                                           __virt_irq)),
 +                             "i" (ASI_PHYS_USE_EC));
 +}
 +/* Address of the given cpu's irq_worklist_pa slot in trap_block[]. */
 +#define irq_work_pa(__cpu)    &(trap_block[(__cpu)].irq_worklist_pa)
 +/* Per-virtual-IRQ bookkeeping, indexed by virtual IRQ number.  The
 + * allocator and free routine below change entries only while holding
 + * virt_irq_alloc_lock.
 + */
 +static struct {
 +      unsigned int dev_handle;
 +      unsigned int dev_ino;
 +      unsigned int in_use;
 +} virt_irq_table[NR_IRQS];
 +static DEFINE_SPINLOCK(virt_irq_alloc_lock);
 +/* Claim the lowest-numbered free entry of virt_irq_table[], searching
 + * from index 1, and record dev_handle/dev_ino in it.
 + *
 + * Returns the claimed index, or 0 (after logging an error) when every
 + * entry from 1 to NR_IRQS - 1 is already in use.  Index 0 is never
 + * handed out, which is what lets 0 double as the failure value.
 + */
 +unsigned char virt_irq_alloc(unsigned int dev_handle,
 +                           unsigned int dev_ino)
 +{
 +      unsigned long flags;
 +      unsigned char ent;
 +
 +      BUILD_BUG_ON(NR_IRQS >= 256);   /* ent and the return type are 8 bits wide */
 +
 +      spin_lock_irqsave(&virt_irq_alloc_lock, flags);
 +
 +      for (ent = 1; ent < NR_IRQS; ent++) {
 +              if (!virt_irq_table[ent].in_use)
 +                      break;
 +      }
 +      if (ent >= NR_IRQS) {
 +              printk(KERN_ERR "IRQ: Out of virtual IRQs.\n");
 +              ent = 0;
 +      } else {
 +              virt_irq_table[ent].dev_handle = dev_handle;
 +              virt_irq_table[ent].dev_ino = dev_ino;
 +              virt_irq_table[ent].in_use = 1;
 +      }
 +
 +      spin_unlock_irqrestore(&virt_irq_alloc_lock, flags);
 +
 +      return ent;
 +}
 +/* Only built with CONFIG_PCI_MSI: mark a virt_irq_table[] entry free. */
 +#ifdef CONFIG_PCI_MSI
 +void virt_irq_free(unsigned int virt_irq)
 +{
 +      unsigned long flags;
 +
 +      if (virt_irq >= NR_IRQS)        /* out-of-range numbers are silently ignored */
 +              return;
 +
 +      spin_lock_irqsave(&virt_irq_alloc_lock, flags);
 +
 +      /* Only in_use is cleared; dev_handle/dev_ino keep their old values. */
 +      virt_irq_table[virt_irq].in_use = 0;
 +
 +      spin_unlock_irqrestore(&virt_irq_alloc_lock, flags);
 +}
 +#endif
 +
 +/*
 + * /proc/interrupts printing:
 + *
 + * Emits the seq_file output for the IRQ number stored in *v.  For
 + * IRQ 0 a header row naming every online CPU is printed first.  An
 + * IRQ below NR_IRQS that has at least one action gets a line with its
 + * number, its interrupt counts (one per online CPU on SMP, a single
 + * total otherwise), the chip typename and the names of all actions
 + * chained on it.  IRQs without an action print nothing.  Always
 + * returns 0.
 + */
 +
 +int show_interrupts(struct seq_file *p, void *v)
 +{
 +      int i = *(loff_t *) v, j;
 +      struct irqaction * action;
 +      unsigned long flags;
 +
 +      if (i == 0) {
 +              seq_printf(p, "           ");
 +              for_each_online_cpu(j)
 +                      seq_printf(p, "CPU%d       ",j);
 +              seq_putc(p, '\n');
 +      }
 +
 +      if (i < NR_IRQS) {
 +              /* The descriptor lock is held while the action list is walked. */
 +              spin_lock_irqsave(&irq_desc[i].lock, flags);
 +              action = irq_desc[i].action;
 +              if (!action)
 +                      goto skip;
 +              seq_printf(p, "%3d: ",i);
 +#ifndef CONFIG_SMP
 +              seq_printf(p, "%10u ", kstat_irqs(i));
 +#else
 +              for_each_online_cpu(j)
 +                      seq_printf(p, "%10u ", kstat_cpu(j).irqs[i]);
 +#endif
 +              seq_printf(p, " %9s", irq_desc[i].chip->typename);
 +              seq_printf(p, "  %s", action->name);
 +
 +              /* Remaining handlers sharing this IRQ, comma separated. */
 +              for (action=action->next; action; action = action->next)
 +                      seq_printf(p, ", %s", action->name);
 +
 +              seq_putc(p, '\n');
 +skip:
 +              spin_unlock_irqrestore(&irq_desc[i].lock, flags);
 +      }
 +      return 0;
 +}
 +
 +static unsigned int sun4u_compute_tid(unsigned long imap, unsigned long cpuid)
 +{
 +      unsigned int tid;
 +
 +      if (this_is_starfire) {
 +              tid = starfire_translate(imap, cpuid);
 +              tid <<= IMAP_TID_SHIFT;
 +              tid &= IMAP_TID_UPA;
 +      } else {
 +              if (tlb_type == cheetah || tlb_type == cheetah_plus) {
 +                      unsigned long ver;
 +
 +                      __asm__ ("rdpr %%ver, %0" : "=r" (ver));
 +                      if ((ver >> 32UL) == __JALAPENO_ID ||
 +                          (ver >> 32UL) == __SERRANO_ID) {
 +                              tid = cpuid << IMAP_TID_SHIFT;
 +                              tid &= IMAP_TID_JBUS;
 +                      } else {
 +                              unsigned int a = cpuid & 0x1f;
 +                              unsigned int n = (cpuid >> 5) & 0x1f;
 +
 +                              tid = ((a << IMAP_AID_SHIFT) |
 +                                     (n << IMAP_NID_SHIFT));
 +                              tid &= (IMAP_AID_SAFARI |
 +                                      IMAP_NID_SAFARI);;
 +                      }
 +              } else {
 +                      tid = cpuid << IMAP_TID_SHIFT;
 +                      tid &= IMAP_TID_UPA;
 +              }
 +      }
 +
 +      return tid;
 +}
 +/* Per-IRQ chip data attached with set_irq_chip_data() in this file. */
 +struct irq_handler_data {
 +      unsigned long   iclr;   /* passed to upa_writeq() by the sun4u methods; ~0UL on sun4v */
 +      unsigned long   imap;   /* passed to upa_readq()/upa_writeq() by the sun4u methods; ~0UL on sun4v */
 +
 +      /* Optional hook run by pre_flow_handler() as pre_handler(ino, arg1, arg2). */
 +      void            (*pre_handler)(unsigned int, void *, void *);
 +      void            *arg1;
 +      void            *arg2;
 +};
 +/* Pick the CPU that should receive virt_irq.
 + *
 + * SMP: when the descriptor's affinity equals CPU_MASK_ALL, online CPUs
 + * are handed out round-robin through the static irq_rover cursor, which
 + * is protected by irq_rover_lock.  Otherwise the first online CPU in the
 + * affinity mask is used, falling back to the round-robin path when none
 + * of the CPUs in the mask is online.
 + * UP: always real_hard_smp_processor_id().
 + */
 +#ifdef CONFIG_SMP
 +static int irq_choose_cpu(unsigned int virt_irq)
 +{
 +      cpumask_t mask = irq_desc[virt_irq].affinity;
 +      int cpuid;
 +
 +      if (cpus_equal(mask, CPU_MASK_ALL)) {
 +              static int irq_rover;
 +              static DEFINE_SPINLOCK(irq_rover_lock);
 +              unsigned long flags;
 +
 +              /* Round-robin distribution... */
 +      do_round_robin:
 +              spin_lock_irqsave(&irq_rover_lock, flags);
 +
 +              /* The cursor may point at a CPU that is not online; skip ahead. */
 +              while (!cpu_online(irq_rover)) {
 +                      if (++irq_rover >= NR_CPUS)
 +                              irq_rover = 0;
 +              }
 +              cpuid = irq_rover;
 +              /* Advance the cursor to the next online CPU for the next caller. */
 +              do {
 +                      if (++irq_rover >= NR_CPUS)
 +                              irq_rover = 0;
 +              } while (!cpu_online(irq_rover));
 +
 +              spin_unlock_irqrestore(&irq_rover_lock, flags);
 +      } else {
 +              cpumask_t tmp;
 +
 +              cpus_and(tmp, cpu_online_map, mask);
 +
 +              /* No CPU of the requested mask is online: use round-robin instead. */
 +              if (cpus_empty(tmp))
 +                      goto do_round_robin;
 +
 +              cpuid = first_cpu(tmp);
 +      }
 +
 +      return cpuid;
 +}
 +#else
 +static int irq_choose_cpu(unsigned int virt_irq)
 +{
 +      return real_hard_smp_processor_id();
 +}
 +#endif
 +/* irq_chip .enable for sun4u: route virt_irq to the CPU chosen by
 + * irq_choose_cpu() and mark the IMAP entry valid.  Does nothing when
 + * the IRQ has no chip data.
 + */
 +static void sun4u_irq_enable(unsigned int virt_irq)
 +{
 +      struct irq_handler_data *data = get_irq_chip_data(virt_irq);
 +
 +      if (likely(data)) {
 +              unsigned long cpuid, imap, val;
 +              unsigned int tid;
 +
 +              cpuid = irq_choose_cpu(virt_irq);
 +              imap = data->imap;
 +
 +              tid = sun4u_compute_tid(imap, cpuid);
 +
 +              /* Read-modify-write: clear every target-field variant, then
 +               * install the new target together with IMAP_VALID.
 +               */
 +              val = upa_readq(imap);
 +              val &= ~(IMAP_TID_UPA | IMAP_TID_JBUS |
 +                       IMAP_AID_SAFARI | IMAP_NID_SAFARI);
 +              val |= tid | IMAP_VALID;
 +              upa_writeq(val, imap);
 +              upa_writeq(ICLR_IDLE, data->iclr);
 +      }
 +}
 +/* irq_chip .set_affinity for sun4u.  The mask argument is not used here:
 + * sun4u_irq_enable() re-selects the target through irq_choose_cpu(),
 + * which reads the descriptor's affinity field.
 + */
- static void sun4u_set_affinity(unsigned int virt_irq, cpumask_t mask)
++static void sun4u_set_affinity(unsigned int virt_irq,
++                             const struct cpumask *mask)
 +{
 +      sun4u_irq_enable(virt_irq);
 +}
 +/* irq_chip .disable for sun4u: clear IMAP_VALID in the IRQ's IMAP
 + * register.  Does nothing when the IRQ has no chip data.
 + */
 +static void sun4u_irq_disable(unsigned int virt_irq)
 +{
 +      struct irq_handler_data *data = get_irq_chip_data(virt_irq);
 +
 +      if (likely(data)) {
 +              unsigned long imap = data->imap;
 +              unsigned long tmp = upa_readq(imap);
 +
 +              tmp &= ~IMAP_VALID;
 +              upa_writeq(tmp, imap);
 +      }
 +}
 +/* irq_chip .eoi for sun4u: write ICLR_IDLE to the IRQ's ICLR register,
 + * unless the descriptor is marked IRQ_DISABLED or IRQ_INPROGRESS.
 + */
 +static void sun4u_irq_eoi(unsigned int virt_irq)
 +{
 +      struct irq_handler_data *data = get_irq_chip_data(virt_irq);
 +      struct irq_desc *desc = irq_desc + virt_irq;
 +
 +      if (unlikely(desc->status & (IRQ_DISABLED|IRQ_INPROGRESS)))
 +              return;
 +
 +      if (likely(data))
 +              upa_writeq(ICLR_IDLE, data->iclr);
 +}
 +/* irq_chip .enable for sun4v: through hypervisor calls, target the
 + * interrupt at the CPU chosen by irq_choose_cpu(), set its state to
 + * HV_INTR_STATE_IDLE and enable it.  Each failing call is logged and
 + * the remaining calls are still attempted.
 + */
 +static void sun4v_irq_enable(unsigned int virt_irq)
 +{
 +      unsigned int ino = virt_irq_table[virt_irq].dev_ino;
 +      unsigned long cpuid = irq_choose_cpu(virt_irq);
 +      int err;
 +
 +      err = sun4v_intr_settarget(ino, cpuid);
 +      if (err != HV_EOK)
 +              printk(KERN_ERR "sun4v_intr_settarget(%x,%lu): "
 +                     "err(%d)\n", ino, cpuid, err);
 +      err = sun4v_intr_setstate(ino, HV_INTR_STATE_IDLE);
 +      if (err != HV_EOK)
 +              printk(KERN_ERR "sun4v_intr_setstate(%x): "
 +                     "err(%d)\n", ino, err);
 +      err = sun4v_intr_setenabled(ino, HV_INTR_ENABLED);
 +      if (err != HV_EOK)
 +              printk(KERN_ERR "sun4v_intr_setenabled(%x): err(%d)\n",
 +                     ino, err);
 +}
 +/* irq_chip .set_affinity for sun4v: retarget the interrupt at the CPU
 + * chosen by irq_choose_cpu().  The mask argument is not used here;
 + * irq_choose_cpu() reads the descriptor's affinity field instead.
 + * A hypervisor failure is logged and otherwise ignored.
 + */
- static void sun4v_set_affinity(unsigned int virt_irq, cpumask_t mask)
++static void sun4v_set_affinity(unsigned int virt_irq,
++                             const struct cpumask *mask)
 +{
 +      unsigned int ino = virt_irq_table[virt_irq].dev_ino;
 +      unsigned long cpuid = irq_choose_cpu(virt_irq);
 +      int err;
 +
 +      err = sun4v_intr_settarget(ino, cpuid);
 +      if (err != HV_EOK)
 +              printk(KERN_ERR "sun4v_intr_settarget(%x,%lu): "
 +                     "err(%d)\n", ino, cpuid, err);
 +}
 +/* irq_chip .disable for sun4v: ask the hypervisor to disable the
 + * interrupt; a failure is logged and otherwise ignored.
 + */
 +static void sun4v_irq_disable(unsigned int virt_irq)
 +{
 +      unsigned int ino = virt_irq_table[virt_irq].dev_ino;
 +      int err;
 +
 +      err = sun4v_intr_setenabled(ino, HV_INTR_DISABLED);
 +      if (err != HV_EOK)
 +              printk(KERN_ERR "sun4v_intr_setenabled(%x): "
 +                     "err(%d)\n", ino, err);
 +}
 +/* irq_chip .eoi for sun4v: set the interrupt state back to
 + * HV_INTR_STATE_IDLE, unless the descriptor is marked IRQ_DISABLED or
 + * IRQ_INPROGRESS.  A hypervisor failure is logged and otherwise ignored.
 + */
 +static void sun4v_irq_eoi(unsigned int virt_irq)
 +{
 +      unsigned int ino = virt_irq_table[virt_irq].dev_ino;
 +      struct irq_desc *desc = irq_desc + virt_irq;
 +      int err;
 +
 +      if (unlikely(desc->status & (IRQ_DISABLED|IRQ_INPROGRESS)))
 +              return;
 +
 +      err = sun4v_intr_setstate(ino, HV_INTR_STATE_IDLE);
 +      if (err != HV_EOK)
 +              printk(KERN_ERR "sun4v_intr_setstate(%x): "
 +                     "err(%d)\n", ino, err);
 +}
 +
 +static void sun4v_virq_enable(unsigned int virt_irq)
 +{
 +      unsigned long cpuid, dev_handle, dev_ino;
 +      int err;
 +
 +      cpuid = irq_choose_cpu(virt_irq);
 +
 +      dev_handle = virt_irq_table[virt_irq].dev_handle;
 +      dev_ino = virt_irq_table[virt_irq].dev_ino;
 +
 +      err = sun4v_vintr_set_target(dev_handle, dev_ino, cpuid);
 +      if (err != HV_EOK)
 +              printk(KERN_ERR "sun4v_vintr_set_target(%lx,%lx,%lu): "
 +                     "err(%d)\n",
 +                     dev_handle, dev_ino, cpuid, err);
 +      err = sun4v_vintr_set_state(dev_handle, dev_ino,
 +                                  HV_INTR_STATE_IDLE);
 +      if (err != HV_EOK)
 +              printk(KERN_ERR "sun4v_vintr_set_state(%lx,%lx,"
 +                     "HV_INTR_STATE_IDLE): err(%d)\n",
 +                     dev_handle, dev_ino, err);
 +      err = sun4v_vintr_set_valid(dev_handle, dev_ino,
 +                                  HV_INTR_ENABLED);
 +      if (err != HV_EOK)
 +              printk(KERN_ERR "sun4v_vintr_set_state(%lx,%lx,"
 +                     "HV_INTR_ENABLED): err(%d)\n",
 +                     dev_handle, dev_ino, err);
 +}
 +/* irq_chip .set_affinity for sun4v virtual interrupts: retarget the
 + * (dev_handle, dev_ino) interrupt at the CPU chosen by irq_choose_cpu().
 + * The mask argument is not used here; irq_choose_cpu() reads the
 + * descriptor's affinity field instead.  A hypervisor failure is logged
 + * and otherwise ignored.
 + */
- static void sun4v_virt_set_affinity(unsigned int virt_irq, cpumask_t mask)
++static void sun4v_virt_set_affinity(unsigned int virt_irq,
++                                  const struct cpumask *mask)
 +{
 +      unsigned long cpuid, dev_handle, dev_ino;
 +      int err;
 +
 +      cpuid = irq_choose_cpu(virt_irq);
 +
 +      dev_handle = virt_irq_table[virt_irq].dev_handle;
 +      dev_ino = virt_irq_table[virt_irq].dev_ino;
 +
 +      err = sun4v_vintr_set_target(dev_handle, dev_ino, cpuid);
 +      if (err != HV_EOK)
 +              printk(KERN_ERR "sun4v_vintr_set_target(%lx,%lx,%lu): "
 +                     "err(%d)\n",
 +                     dev_handle, dev_ino, cpuid, err);
 +}
 +
 +static void sun4v_virq_disable(unsigned int virt_irq)
 +{
 +      unsigned long dev_handle, dev_ino;
 +      int err;
 +
 +      dev_handle = virt_irq_table[virt_irq].dev_handle;
 +      dev_ino = virt_irq_table[virt_irq].dev_ino;
 +
 +      err = sun4v_vintr_set_valid(dev_handle, dev_ino,
 +                                  HV_INTR_DISABLED);
 +      if (err != HV_EOK)
 +              printk(KERN_ERR "sun4v_vintr_set_state(%lx,%lx,"
 +                     "HV_INTR_DISABLED): err(%d)\n",
 +                     dev_handle, dev_ino, err);
 +}
 +/* irq_chip .eoi for sun4v virtual interrupts: set the (dev_handle,
 + * dev_ino) interrupt state back to HV_INTR_STATE_IDLE, unless the
 + * descriptor is marked IRQ_DISABLED or IRQ_INPROGRESS.  A hypervisor
 + * failure is logged and otherwise ignored.
 + */
 +static void sun4v_virq_eoi(unsigned int virt_irq)
 +{
 +      struct irq_desc *desc = irq_desc + virt_irq;
 +      unsigned long dev_handle, dev_ino;
 +      int err;
 +
 +      if (unlikely(desc->status & (IRQ_DISABLED|IRQ_INPROGRESS)))
 +              return;
 +
 +      dev_handle = virt_irq_table[virt_irq].dev_handle;
 +      dev_ino = virt_irq_table[virt_irq].dev_ino;
 +
 +      err = sun4v_vintr_set_state(dev_handle, dev_ino,
 +                                  HV_INTR_STATE_IDLE);
 +      if (err != HV_EOK)
 +              printk(KERN_ERR "sun4v_vintr_set_state(%lx,%lx,"
 +                     "HV_INTR_STATE_IDLE): err(%d)\n",
 +                     dev_handle, dev_ino, err);
 +}
 +/* irq_chip method tables.  "sun4u" drives the IMAP/ICLR registers
 + * directly, "sun4v" goes through the sun4v_intr_*() hypervisor calls
 + * keyed by ino, and "vsun4v" through the sun4v_vintr_*() calls keyed by
 + * (dev_handle, dev_ino).
 + */
 +static struct irq_chip sun4u_irq = {
 +      .typename       = "sun4u",
 +      .enable         = sun4u_irq_enable,
 +      .disable        = sun4u_irq_disable,
 +      .eoi            = sun4u_irq_eoi,
 +      .set_affinity   = sun4u_set_affinity,
 +};
 +
 +static struct irq_chip sun4v_irq = {
 +      .typename       = "sun4v",
 +      .enable         = sun4v_irq_enable,
 +      .disable        = sun4v_irq_disable,
 +      .eoi            = sun4v_irq_eoi,
 +      .set_affinity   = sun4v_set_affinity,
 +};
 +
 +static struct irq_chip sun4v_virq = {
 +      .typename       = "vsun4v",
 +      .enable         = sun4v_virq_enable,
 +      .disable        = sun4v_virq_disable,
 +      .eoi            = sun4v_virq_eoi,
 +      .set_affinity   = sun4v_virt_set_affinity,
 +};
 +/* Flow handler installed by irq_install_pre_handler(): run the IRQ's
 + * pre_handler hook with the ino and its two saved arguments, then
 + * continue with handle_fasteoi_irq().
 + */
 +static void pre_flow_handler(unsigned int virt_irq,
 +                                    struct irq_desc *desc)
 +{
 +      struct irq_handler_data *data = get_irq_chip_data(virt_irq);
 +      unsigned int ino = virt_irq_table[virt_irq].dev_ino;
 +
 +      data->pre_handler(ino, data->arg1, data->arg2);
 +
 +      handle_fasteoi_irq(virt_irq, desc);
 +}
 +/* Register func(ino, arg1, arg2) to be called before the normal
 + * fasteoi handling of virt_irq, by storing it in the IRQ's chip data
 + * and switching the descriptor's flow handler to pre_flow_handler().
 + * The chip data pointer is dereferenced without a NULL check, so the
 + * IRQ must already have chip data attached.
 + */
 +void irq_install_pre_handler(int virt_irq,
 +                           void (*func)(unsigned int, void *, void *),
 +                           void *arg1, void *arg2)
 +{
 +      struct irq_handler_data *data = get_irq_chip_data(virt_irq);
 +      struct irq_desc *desc = irq_desc + virt_irq;
 +
 +      data->pre_handler = func;
 +      data->arg1 = arg1;
 +      data->arg2 = arg2;
 +
 +      desc->handle_irq = pre_flow_handler;
 +}
 +/* Set up (or look up) the virtual IRQ for a sun4u interrupt source.
 + *
 + * The ino is taken from the IGN and INO bits read from the imap
 + * register, plus inofixup.  The first call for an ino allocates a
 + * virtual IRQ, records it in the ino's ivector_table[] bucket and
 + * installs the sun4u chip with the fasteoi flow handler.  The first
 + * call for a virtual IRQ also attaches an irq_handler_data holding
 + * imap and iclr; later calls leave the existing data untouched.
 + *
 + * Must not be called on hypervisor (sun4v) systems.  Halts through the
 + * PROM if the handler data cannot be allocated.  Returns the virtual
 + * IRQ number.
 + *
 + * NOTE(review): a 0 (failure) return from virt_irq_alloc() is not
 + * checked here.
 + */
 +unsigned int build_irq(int inofixup, unsigned long iclr, unsigned long imap)
 +{
 +      struct ino_bucket *bucket;
 +      struct irq_handler_data *data;
 +      unsigned int virt_irq;
 +      int ino;
 +
 +      BUG_ON(tlb_type == hypervisor);
 +
 +      ino = (upa_readq(imap) & (IMAP_IGN | IMAP_INO)) + inofixup;
 +      bucket = &ivector_table[ino];
 +      virt_irq = bucket_get_virt_irq(__pa(bucket));
 +      if (!virt_irq) {
 +              virt_irq = virt_irq_alloc(0, ino);
 +              bucket_set_virt_irq(__pa(bucket), virt_irq);
 +              set_irq_chip_and_handler_name(virt_irq,
 +                                            &sun4u_irq,
 +                                            handle_fasteoi_irq,
 +                                            "IVEC");
 +      }
 +
 +      data = get_irq_chip_data(virt_irq);
 +      if (unlikely(data))
 +              goto out;
 +
 +      data = kzalloc(sizeof(struct irq_handler_data), GFP_ATOMIC);
 +      if (unlikely(!data)) {
 +              prom_printf("IRQ: kzalloc(irq_handler_data) failed.\n");
 +              prom_halt();
 +      }
 +      set_irq_chip_data(virt_irq, data);
 +
 +      data->imap  = imap;
 +      data->iclr  = iclr;
 +
 +out:
 +      return virt_irq;
 +}
 +/* Set up (or look up) the virtual IRQ for a sun4v sysino, using the
 + * given irq_chip.
 + *
 + * The first call for a sysino allocates a virtual IRQ, records it in
 + * the sysino's ivector_table[] bucket and installs chip with the
 + * fasteoi flow handler.  The first call for a virtual IRQ also attaches
 + * an irq_handler_data; later calls leave the existing data untouched.
 + *
 + * Must only be called on hypervisor (sun4v) systems.  Halts through the
 + * PROM if the handler data cannot be allocated.  Returns the virtual
 + * IRQ number.
 + *
 + * NOTE(review): a 0 (failure) return from virt_irq_alloc() is not
 + * checked here.
 + */
 +static unsigned int sun4v_build_common(unsigned long sysino,
 +                                     struct irq_chip *chip)
 +{
 +      struct ino_bucket *bucket;
 +      struct irq_handler_data *data;
 +      unsigned int virt_irq;
 +
 +      BUG_ON(tlb_type != hypervisor);
 +
 +      bucket = &ivector_table[sysino];
 +      virt_irq = bucket_get_virt_irq(__pa(bucket));
 +      if (!virt_irq) {
 +              virt_irq = virt_irq_alloc(0, sysino);
 +              bucket_set_virt_irq(__pa(bucket), virt_irq);
 +              set_irq_chip_and_handler_name(virt_irq, chip,
 +                                            handle_fasteoi_irq,
 +                                            "IVEC");
 +      }
 +
 +      data = get_irq_chip_data(virt_irq);
 +      if (unlikely(data))
 +              goto out;
 +
 +      data = kzalloc(sizeof(struct irq_handler_data), GFP_ATOMIC);
 +      if (unlikely(!data)) {
 +              prom_printf("IRQ: kzalloc(irq_handler_data) failed.\n");
 +              prom_halt();
 +      }
 +      set_irq_chip_data(virt_irq, data);
 +
 +      /* Catch accidental accesses to these things.  IMAP/ICLR handling
 +       * is done by hypervisor calls on sun4v platforms, not by direct
 +       * register accesses.
 +       */
 +      data->imap = ~0UL;
 +      data->iclr = ~0UL;
 +
 +out:
 +      return virt_irq;
 +}
 +/* Translate (devhandle, devino) to a sysino with
 + * sun4v_devino_to_sysino() and build its virtual IRQ on the "sun4v" chip.
 + */
 +unsigned int sun4v_build_irq(u32 devhandle, unsigned int devino)
 +{
 +      unsigned long sysino = sun4v_devino_to_sysino(devhandle, devino);
 +
 +      return sun4v_build_common(sysino, &sun4v_irq);
 +}
 +
 +unsigned int sun4v_build_virq(u32 devhandle, unsigned int devino)
 +{
 +      struct irq_handler_data *data;
 +      unsigned long hv_err, cookie;
 +      struct ino_bucket *bucket;
 +      struct irq_desc *desc;
 +      unsigned int virt_irq;
 +
 +      bucket = kzalloc(sizeof(struct ino_bucket), GFP_ATOMIC);
 +      if (unlikely(!bucket))
 +              return 0;
 +      __flush_dcache_range((unsigned long) bucket,
 +                           ((unsigned long) bucket +
 +                            sizeof(struct ino_bucket)));
 +
 +      virt_irq = virt_irq_alloc(devhandle, devino);
 +      bucket_set_virt_irq(__pa(bucket), virt_irq);
 +
 +      set_irq_chip_and_handler_name(virt_irq, &sun4v_virq,
 +                                    handle_fasteoi_irq,
 +                                    "IVEC");
 +
 +      data = kzalloc(sizeof(struct irq_handler_data), GFP_ATOMIC);
 +      if (unlikely(!data))
 +              return 0;
 +
 +      /* In order to make the LDC channel startup sequence easier,
 +       * especially wrt. locking, we do not let request_irq() enable
 +       * the interrupt.
 +       */
 +      desc = irq_desc + virt_irq;
 +      desc->status |= IRQ_NOAUTOEN;
 +
 +      set_irq_chip_data(virt_irq, data);
 +
 +      /* Catch accidental accesses to these things.  IMAP/ICLR handling
 +       * is done by hypervisor calls on sun4v platforms, not by direct
 +       * register accesses.
 +       */
 +      data->imap = ~0UL;
 +      data->iclr = ~0UL;
 +
 +      cookie = ~__pa(bucket);
 +      hv_err = sun4v_vintr_set_cookie(devhandle, devino, cookie);
 +      if (hv_err) {
 +              prom_printf("IRQ: Fatal, cannot set cookie for [%x:%x] "
 +                          "err=%lu\n", devhandle, devino, hv_err);
 +              prom_halt();
 +      }
 +
 +      return virt_irq;
 +}
 +/* Log an unexpected interrupt.  A recorded dev_ino of 0 is printed as
 + * 0xdeadbeef.
 + */
 +void ack_bad_irq(unsigned int virt_irq)
 +{
 +      unsigned int ino = virt_irq_table[virt_irq].dev_ino;
 +
 +      if (!ino)
 +              ino = 0xdeadbeef;
 +
 +      printk(KERN_CRIT "Unexpected IRQ from ino[%x] virt_irq[%u]\n",
 +             ino, virt_irq);
 +}
 +/* Per-cpu interrupt stack base pointers, indexed by smp_processor_id(). */
 +void *hardirq_stack[NR_CPUS];
 +void *softirq_stack[NR_CPUS];
 +/* set_hardirq_stack(): if the current %sp lies outside this cpu's
 + * hardirq stack area [sp, sp + THREAD_SIZE], move %sp to
 + * THREAD_SIZE - 192 - STACK_BIAS bytes above the stack base.  The %sp
 + * value found on entry is returned so that restore_hardirq_stack()
 + * can put it back.
 + * NOTE(review): the 192 bytes are presumably room for a stack frame /
 + * register window save area -- confirm.
 + */
 +static __attribute__((always_inline)) void *set_hardirq_stack(void)
 +{
 +      void *orig_sp, *sp = hardirq_stack[smp_processor_id()];
 +
 +      __asm__ __volatile__("mov %%sp, %0" : "=r" (orig_sp));
 +      if (orig_sp < sp ||
 +          orig_sp > (sp + THREAD_SIZE)) {
 +              sp += THREAD_SIZE - 192 - STACK_BIAS;
 +              __asm__ __volatile__("mov %0, %%sp" : : "r" (sp));
 +      }
 +
 +      return orig_sp;
 +}
 +static __attribute__((always_inline)) void restore_hardirq_stack(void *orig_sp)
 +{
 +      __asm__ __volatile__("mov %0, %%sp" : : "r" (orig_sp));
 +}
 +/* Device interrupt entry point: clear the softint bit for this level,
 + * detach the current cpu's list of pending ino buckets and run the
 + * flow handler of every bucket on it, using the hardirq stack.
 + */
 +void handler_irq(int irq, struct pt_regs *regs)
 +{
 +      unsigned long pstate, bucket_pa;
 +      struct pt_regs *old_regs;
 +      void *orig_sp;
 +
 +      clear_softint(1 << irq);
 +
 +      old_regs = set_irq_regs(regs);
 +      irq_enter();
 +
 +      /* Grab an atomic snapshot of the pending IVECs: between the two
 +       * %pstate writes the worklist head is loaded into bucket_pa and
 +       * replaced by zero.
 +       * NOTE(review): the first wrpr presumably clears PSTATE_IE and the
 +       * second restores the saved %pstate -- confirm against the ISA.
 +       */
 +      __asm__ __volatile__("rdpr      %%pstate, %0\n\t"
 +                           "wrpr      %0, %3, %%pstate\n\t"
 +                           "ldx       [%2], %1\n\t"
 +                           "stx       %%g0, [%2]\n\t"
 +                           "wrpr      %0, 0x0, %%pstate\n\t"
 +                           : "=&r" (pstate), "=&r" (bucket_pa)
 +                           : "r" (irq_work_pa(smp_processor_id())),
 +                             "i" (PSTATE_IE)
 +                           : "memory");
 +
 +      orig_sp = set_hardirq_stack();
 +
 +      /* Walk the chain of buckets linked through __irq_chain_pa. */
 +      while (bucket_pa) {
 +              struct irq_desc *desc;
 +              unsigned long next_pa;
 +              unsigned int virt_irq;
 +
 +              /* Read the link and unchain the bucket before its handler runs. */
 +              next_pa = bucket_get_chain_pa(bucket_pa);
 +              virt_irq = bucket_get_virt_irq(bucket_pa);
 +              bucket_clear_chain_pa(bucket_pa);
 +
 +              desc = irq_desc + virt_irq;
 +
 +              desc->handle_irq(virt_irq, desc);
 +
 +              bucket_pa = next_pa;
 +      }
 +
 +      restore_hardirq_stack(orig_sp);
 +
 +      irq_exit();
 +      set_irq_regs(old_regs);
 +}
 +/* Run pending softirqs on this cpu's dedicated softirq stack.  Returns
 + * immediately when called in interrupt context.  Local interrupts are
 + * saved/disabled around the pending check and the stack switch.
 + */
 +void do_softirq(void)
 +{
 +      unsigned long flags;
 +
 +      if (in_interrupt())
 +              return;
 +
 +      local_irq_save(flags);
 +
 +      if (local_softirq_pending()) {
 +              void *orig_sp, *sp = softirq_stack[smp_processor_id()];
 +
 +              /* Same offset from the stack base as set_hardirq_stack() uses. */
 +              sp += THREAD_SIZE - 192 - STACK_BIAS;
 +
 +              /* Save the current %sp, then switch to the softirq stack. */
 +              __asm__ __volatile__("mov %%sp, %0\n\t"
 +                                   "mov %1, %%sp"
 +                                   : "=&r" (orig_sp)
 +                                   : "r" (sp));
 +              __do_softirq();
 +              __asm__ __volatile__("mov %0, %%sp"
 +                                   : : "r" (orig_sp));
 +      }
 +
 +      local_irq_restore(flags);
 +}
 +/* Default perf counter interrupt handler, used while nobody has
 + * registered one: read the PCR and PIC, write 0 to the PCR and log both
 + * values read.
 + */
 +static void unhandled_perf_irq(struct pt_regs *regs)
 +{
 +      unsigned long pcr, pic;
 +
 +      read_pcr(pcr);
 +      read_pic(pic);
 +
 +      write_pcr(0);
 +
 +      printk(KERN_EMERG "CPU %d: Got unexpected perf counter IRQ.\n",
 +             smp_processor_id());
 +      printk(KERN_EMERG "CPU %d: PCR[%016lx] PIC[%016lx]\n",
 +             smp_processor_id(), pcr, pic);
 +}
 +
 +/* Almost a direct copy of the powerpc PMC code.
 + *
 + * perf_irq is the currently installed perf counter handler;
 + * register_perfctr_intr() and release_perfctr_intr() change it and
 + * perf_irq_owner_caller while holding perf_irq_lock.
 + */
 +static DEFINE_SPINLOCK(perf_irq_lock);
 +static void *perf_irq_owner_caller; /* mostly for debugging */
 +static void (*perf_irq)(struct pt_regs *regs) = unhandled_perf_irq;
 +
 +/* Invoked from level 15 PIL handler in trap table.  Clears the softint
 + * bit for the level and calls the installed handler; perf_irq is read
 + * here without taking perf_irq_lock.
 + */
 +void perfctr_irq(int irq, struct pt_regs *regs)
 +{
 +      clear_softint(1 << irq);
 +      perf_irq(regs);
 +}
 +/* Install handler as the perf counter interrupt handler.
 + *
 + * Returns 0 on success, -EINVAL when handler is NULL, or -EBUSY (with a
 + * warning naming the recorded owner) when a handler other than the
 + * default is already installed.  The caller's return address is
 + * recorded as the owner for that warning.
 + */
 +int register_perfctr_intr(void (*handler)(struct pt_regs *))
 +{
 +      int ret;
 +
 +      if (!handler)
 +              return -EINVAL;
 +
 +      spin_lock(&perf_irq_lock);
 +      if (perf_irq != unhandled_perf_irq) {
 +              printk(KERN_WARNING "register_perfctr_intr: "
 +                     "perf IRQ busy (reserved by caller %p)\n",
 +                     perf_irq_owner_caller);
 +              ret = -EBUSY;
 +              goto out;
 +      }
 +
 +      perf_irq_owner_caller = __builtin_return_address(0);
 +      perf_irq = handler;
 +
 +      ret = 0;
 +out:
 +      spin_unlock(&perf_irq_lock);
 +
 +      return ret;
 +}
 +EXPORT_SYMBOL_GPL(register_perfctr_intr);
 +/* Reinstall the default perf counter handler and forget the recorded
 + * owner.  The handler argument is not compared with the installed
 + * handler, so any caller can release the interrupt.
 + */
 +void release_perfctr_intr(void (*handler)(struct pt_regs *))
 +{
 +      spin_lock(&perf_irq_lock);
 +      perf_irq_owner_caller = NULL;
 +      perf_irq = unhandled_perf_irq;
 +      spin_unlock(&perf_irq_lock);
 +}
 +EXPORT_SYMBOL_GPL(release_perfctr_intr);
 +/* CPU hotplug only: for every IRQ that has an action and is not
 + * IRQ_PER_CPU, re-run the chip's set_affinity method (when it has one)
 + * with the descriptor's current affinity, under the descriptor lock.
 + * Finally the tick interrupt is disabled through tick_ops.
 + */
 +#ifdef CONFIG_HOTPLUG_CPU
 +void fixup_irqs(void)
 +{
 +      unsigned int irq;
 +
 +      for (irq = 0; irq < NR_IRQS; irq++) {
 +              unsigned long flags;
 +
 +              spin_lock_irqsave(&irq_desc[irq].lock, flags);
 +              if (irq_desc[irq].action &&
 +                  !(irq_desc[irq].status & IRQ_PER_CPU)) {
 +                      if (irq_desc[irq].chip->set_affinity)
 +                              irq_desc[irq].chip->set_affinity(irq,
-                                       irq_desc[irq].affinity);
++                                      &irq_desc[irq].affinity);
 +              }
 +              spin_unlock_irqrestore(&irq_desc[irq].lock, flags);
 +      }
 +
 +      tick_ops->disable_irq();
 +}
 +#endif
 +/* Layout used to access the PROM "counter-timer" device: two
 + * count/limit pairs.  Only the limit fields are touched in this file.
 + */
 +struct sun5_timer {
 +      u64     count0;
 +      u64     limit0;
 +      u64     count1;
 +      u64     limit1;
 +};
 +
 +static struct sun5_timer *prom_timers;  /* NULL when no usable PROM timer was found */
 +static u64 prom_limit0, prom_limit1;    /* limit values saved by kill_prom_timer() */
 +/* Locate the PROM "counter-timer" node among the children of the device
 + * tree root and point prom_timers at the address given by the first
 + * cell of its "address" property.  prom_timers is set to NULL when the
 + * node or the property is missing.
 + */
 +static void map_prom_timers(void)
 +{
 +      struct device_node *dp;
 +      const unsigned int *addr;
 +
 +      /* PROM timer node hangs out in the top level of device siblings... */
 +      dp = of_find_node_by_path("/");
 +      dp = dp->child;
 +      while (dp) {
 +              if (!strcmp(dp->name, "counter-timer"))
 +                      break;
 +              dp = dp->sibling;
 +      }
 +
 +      /* Assume if node is not present, PROM uses different tick mechanism
 +       * which we should not care about.
 +       */
 +      if (!dp) {
 +              prom_timers = (struct sun5_timer *) 0;
 +              return;
 +      }
 +
 +      /* If PROM is really using this, it must be mapped by him. */
 +      addr = of_get_property(dp, "address", NULL);
 +      if (!addr) {
 +              prom_printf("PROM does not have timer mapped, trying to continue.\n");
 +              prom_timers = (struct sun5_timer *) 0;
 +              return;
 +      }
 +      prom_timers = (struct sun5_timer *) ((unsigned long)addr[0]);
 +}
 +
 +static void kill_prom_timer(void)
 +{
 +      if (!prom_timers)
 +              return;
 +
 +      /* Save them away for later. */
 +      prom_limit0 = prom_timers->limit0;
 +      prom_limit1 = prom_timers->limit1;
 +
 +      /* Just as in sun4c/sun4m PROM uses timer which ticks at IRQ 14.
 +       * We turn both off here just to be paranoid.
 +       */
 +      prom_timers->limit0 = 0;
 +      prom_timers->limit1 = 0;
 +
 +      /* Wheee, eat the interrupt packet too... */
 +      __asm__ __volatile__(
 +"     mov     0x40, %%g2\n"
 +"     ldxa    [%%g0] %0, %%g1\n"
 +"     ldxa    [%%g2] %1, %%g1\n"
 +"     stxa    %%g0, [%%g0] %0\n"
 +"     membar  #Sync\n"
 +      : /* no outputs */
 +      : "i" (ASI_INTR_RECEIVE), "i" (ASI_INTR_R)
 +      : "g1", "g2");
 +}
 +
 +void notrace init_irqwork_curcpu(void)
 +{
 +      int cpu = hard_smp_processor_id();
 +
 +      trap_block[cpu].irq_worklist_pa = 0UL;
 +}
 +
 +/* Please be very careful with register_one_mondo() and
 + * sun4v_register_mondo_queues().
 + *
 + * On SMP this gets invoked from the CPU trampoline before
 + * the cpu has fully taken over the trap table from OBP,
 + * and its kernel stack + %g6 thread register state is
 + * not fully cooked yet.
 + *
 + * Therefore you cannot make any OBP calls, not even prom_printf,
 + * from these two routines.
 + */
 +static void __cpuinit register_one_mondo(unsigned long paddr, unsigned long type, unsigned long qmask)
 +{
 +      unsigned long num_entries = (qmask + 1) / 64;
 +      unsigned long status;
 +
 +      status = sun4v_cpu_qconf(type, paddr, num_entries);
 +      if (status != HV_EOK) {
 +              prom_printf("SUN4V: sun4v_cpu_qconf(%lu:%lx:%lu) failed, "
 +                          "err %lu\n", type, paddr, num_entries, status);
 +              prom_halt();
 +      }
 +}
 +
 +void __cpuinit notrace sun4v_register_mondo_queues(int this_cpu)
 +{
 +      struct trap_per_cpu *tb = &trap_block[this_cpu];
 +
 +      register_one_mondo(tb->cpu_mondo_pa, HV_CPU_QUEUE_CPU_MONDO,
 +                         tb->cpu_mondo_qmask);
 +      register_one_mondo(tb->dev_mondo_pa, HV_CPU_QUEUE_DEVICE_MONDO,
 +                         tb->dev_mondo_qmask);
 +      register_one_mondo(tb->resum_mondo_pa, HV_CPU_QUEUE_RES_ERROR,
 +                         tb->resum_qmask);
 +      register_one_mondo(tb->nonresum_mondo_pa, HV_CPU_QUEUE_NONRES_ERROR,
 +                         tb->nonresum_qmask);
 +}
 +
 +static void __init alloc_one_mondo(unsigned long *pa_ptr, unsigned long qmask)
 +{
 +      unsigned long size = PAGE_ALIGN(qmask + 1);
 +      void *p = __alloc_bootmem(size, size, 0);
 +      if (!p) {
 +              prom_printf("SUN4V: Error, cannot allocate mondo queue.\n");
 +              prom_halt();
 +      }
 +
 +      *pa_ptr = __pa(p);
 +}
 +
 +static void __init alloc_one_kbuf(unsigned long *pa_ptr, unsigned long qmask)
 +{
 +      unsigned long size = PAGE_ALIGN(qmask + 1);
 +      void *p = __alloc_bootmem(size, size, 0);
 +
 +      if (!p) {
 +              prom_printf("SUN4V: Error, cannot allocate kbuf page.\n");
 +              prom_halt();
 +      }
 +
 +      *pa_ptr = __pa(p);
 +}
 +
 +static void __init init_cpu_send_mondo_info(struct trap_per_cpu *tb)
 +{
 +#ifdef CONFIG_SMP
 +      void *page;
 +
 +      BUILD_BUG_ON((NR_CPUS * sizeof(u16)) > (PAGE_SIZE - 64));
 +
 +      page = alloc_bootmem_pages(PAGE_SIZE);
 +      if (!page) {
 +              prom_printf("SUN4V: Error, cannot allocate cpu mondo page.\n");
 +              prom_halt();
 +      }
 +
 +      tb->cpu_mondo_block_pa = __pa(page);
 +      tb->cpu_list_pa = __pa(page + 64);
 +#endif
 +}
 +
 +/* Allocate mondo and error queues for all possible cpus.  */
 +static void __init sun4v_init_mondo_queues(void)
 +{
 +      int cpu;
 +
 +      for_each_possible_cpu(cpu) {
 +              struct trap_per_cpu *tb = &trap_block[cpu];
 +
 +              alloc_one_mondo(&tb->cpu_mondo_pa, tb->cpu_mondo_qmask);
 +              alloc_one_mondo(&tb->dev_mondo_pa, tb->dev_mondo_qmask);
 +              alloc_one_mondo(&tb->resum_mondo_pa, tb->resum_qmask);
 +              alloc_one_kbuf(&tb->resum_kernel_buf_pa, tb->resum_qmask);
 +              alloc_one_mondo(&tb->nonresum_mondo_pa, tb->nonresum_qmask);
 +              alloc_one_kbuf(&tb->nonresum_kernel_buf_pa,
 +                             tb->nonresum_qmask);
 +      }
 +}
 +
 +static void __init init_send_mondo_info(void)
 +{
 +      int cpu;
 +
 +      for_each_possible_cpu(cpu) {
 +              struct trap_per_cpu *tb = &trap_block[cpu];
 +
 +              init_cpu_send_mondo_info(tb);
 +      }
 +}
 +
 +static struct irqaction timer_irq_action = {
 +      .name = "timer",
 +};
 +
 +/* Only invoked on boot processor. */
 +void __init init_IRQ(void)
 +{
 +      unsigned long size;
 +
 +      map_prom_timers();
 +      kill_prom_timer();
 +
 +      size = sizeof(struct ino_bucket) * NUM_IVECS;
 +      ivector_table = alloc_bootmem(size);
 +      if (!ivector_table) {
 +              prom_printf("Fatal error, cannot allocate ivector_table\n");
 +              prom_halt();
 +      }
 +      __flush_dcache_range((unsigned long) ivector_table,
 +                           ((unsigned long) ivector_table) + size);
 +
 +      ivector_table_pa = __pa(ivector_table);
 +
 +      if (tlb_type == hypervisor)
 +              sun4v_init_mondo_queues();
 +
 +      init_send_mondo_info();
 +
 +      if (tlb_type == hypervisor) {
 +              /* Load up the boot cpu's entries.  */
 +              sun4v_register_mondo_queues(hard_smp_processor_id());
 +      }
 +
 +      /* We need to clear any IRQs pending in the soft interrupt
 +       * registers, a spurious one could be left around from the
 +       * PROM timer which we just disabled.
 +       */
 +      clear_softint(get_softint());
 +
 +      /* Now that ivector table is initialized, it is safe
 +       * to receive IRQ vector traps.  We will normally take
 +       * one or two right now, in case some device PROM used
 +       * to boot us wants to speak to us.  We just ignore them.
 +       */
 +      __asm__ __volatile__("rdpr      %%pstate, %%g1\n\t"
 +                           "or        %%g1, %0, %%g1\n\t"
 +                           "wrpr      %%g1, 0x0, %%pstate"
 +                           : /* No outputs */
 +                           : "i" (PSTATE_IE)
 +                           : "g1");
 +
 +      irq_desc[0].action = &timer_irq_action;
 +}
index 46e231f,0000000..322046c
mode 100644,000000..100644
--- /dev/null
@@@ -1,898 -1,0 +1,898 @@@
 +#include <linux/string.h>
 +#include <linux/kernel.h>
 +#include <linux/of.h>
 +#include <linux/init.h>
 +#include <linux/module.h>
 +#include <linux/mod_devicetable.h>
 +#include <linux/slab.h>
 +#include <linux/errno.h>
 +#include <linux/irq.h>
 +#include <linux/of_device.h>
 +#include <linux/of_platform.h>
 +
 +void __iomem *of_ioremap(struct resource *res, unsigned long offset, unsigned long size, char *name)
 +{
 +      unsigned long ret = res->start + offset;
 +      struct resource *r;
 +
 +      if (res->flags & IORESOURCE_MEM)
 +              r = request_mem_region(ret, size, name);
 +      else
 +              r = request_region(ret, size, name);
 +      if (!r)
 +              ret = 0;
 +
 +      return (void __iomem *) ret;
 +}
 +EXPORT_SYMBOL(of_ioremap);
 +
 +void of_iounmap(struct resource *res, void __iomem *base, unsigned long size)
 +{
 +      if (res->flags & IORESOURCE_MEM)
 +              release_mem_region((unsigned long) base, size);
 +      else
 +              release_region((unsigned long) base, size);
 +}
 +EXPORT_SYMBOL(of_iounmap);
 +
 +static int node_match(struct device *dev, void *data)
 +{
 +      struct of_device *op = to_of_device(dev);
 +      struct device_node *dp = data;
 +
 +      return (op->node == dp);
 +}
 +
 +struct of_device *of_find_device_by_node(struct device_node *dp)
 +{
 +      struct device *dev = bus_find_device(&of_platform_bus_type, NULL,
 +                                           dp, node_match);
 +
 +      if (dev)
 +              return to_of_device(dev);
 +
 +      return NULL;
 +}
 +EXPORT_SYMBOL(of_find_device_by_node);
 +
 +unsigned int irq_of_parse_and_map(struct device_node *node, int index)
 +{
 +      struct of_device *op = of_find_device_by_node(node);
 +
 +      if (!op || index >= op->num_irqs)
 +              return 0;
 +
 +      return op->irqs[index];
 +}
 +EXPORT_SYMBOL(irq_of_parse_and_map);
 +
 +/* Take the archdata values for IOMMU, STC, and HOSTDATA found in
 + * BUS and propagate to all child of_device objects.
 + */
 +void of_propagate_archdata(struct of_device *bus)
 +{
 +      struct dev_archdata *bus_sd = &bus->dev.archdata;
 +      struct device_node *bus_dp = bus->node;
 +      struct device_node *dp;
 +
 +      for (dp = bus_dp->child; dp; dp = dp->sibling) {
 +              struct of_device *op = of_find_device_by_node(dp);
 +
 +              op->dev.archdata.iommu = bus_sd->iommu;
 +              op->dev.archdata.stc = bus_sd->stc;
 +              op->dev.archdata.host_controller = bus_sd->host_controller;
 +              op->dev.archdata.numa_node = bus_sd->numa_node;
 +
 +              if (dp->child)
 +                      of_propagate_archdata(op);
 +      }
 +}
 +
 +struct bus_type of_platform_bus_type;
 +EXPORT_SYMBOL(of_platform_bus_type);
 +
 +static inline u64 of_read_addr(const u32 *cell, int size)
 +{
 +      u64 r = 0;
 +      while (size--)
 +              r = (r << 32) | *(cell++);
 +      return r;
 +}
 +
 +static void __init get_cells(struct device_node *dp,
 +                           int *addrc, int *sizec)
 +{
 +      if (addrc)
 +              *addrc = of_n_addr_cells(dp);
 +      if (sizec)
 +              *sizec = of_n_size_cells(dp);
 +}
 +
 +/* Max address size we deal with */
 +#define OF_MAX_ADDR_CELLS     4
 +
 +struct of_bus {
 +      const char      *name;
 +      const char      *addr_prop_name;
 +      int             (*match)(struct device_node *parent);
 +      void            (*count_cells)(struct device_node *child,
 +                                     int *addrc, int *sizec);
 +      int             (*map)(u32 *addr, const u32 *range,
 +                             int na, int ns, int pna);
 +      unsigned long   (*get_flags)(const u32 *addr, unsigned long);
 +};
 +
 +/*
 + * Default translator (generic bus)
 + */
 +
 +static void of_bus_default_count_cells(struct device_node *dev,
 +                                     int *addrc, int *sizec)
 +{
 +      get_cells(dev, addrc, sizec);
 +}
 +
 +/* Make sure the least significant 64-bits are in-range.  Even
 + * for 3 or 4 cell values it is a good enough approximation.
 + */
 +static int of_out_of_range(const u32 *addr, const u32 *base,
 +                         const u32 *size, int na, int ns)
 +{
 +      u64 a = of_read_addr(addr, na);
 +      u64 b = of_read_addr(base, na);
 +
 +      if (a < b)
 +              return 1;
 +
 +      b += of_read_addr(size, ns);
 +      if (a >= b)
 +              return 1;
 +
 +      return 0;
 +}
 +
 +static int of_bus_default_map(u32 *addr, const u32 *range,
 +                            int na, int ns, int pna)
 +{
 +      u32 result[OF_MAX_ADDR_CELLS];
 +      int i;
 +
 +      if (ns > 2) {
 +              printk("of_device: Cannot handle size cells (%d) > 2.", ns);
 +              return -EINVAL;
 +      }
 +
 +      if (of_out_of_range(addr, range, range + na + pna, na, ns))
 +              return -EINVAL;
 +
 +      /* Start with the parent range base.  */
 +      memcpy(result, range + na, pna * 4);
 +
 +      /* Add in the child address offset.  */
 +      for (i = 0; i < na; i++)
 +              result[pna - 1 - i] +=
 +                      (addr[na - 1 - i] -
 +                       range[na - 1 - i]);
 +
 +      memcpy(addr, result, pna * 4);
 +
 +      return 0;
 +}
 +
 +static unsigned long of_bus_default_get_flags(const u32 *addr, unsigned long flags)
 +{
 +      if (flags)
 +              return flags;
 +      return IORESOURCE_MEM;
 +}
 +
 +/*
 + * PCI bus specific translator
 + */
 +
 +static int of_bus_pci_match(struct device_node *np)
 +{
 +      if (!strcmp(np->name, "pci")) {
 +              const char *model = of_get_property(np, "model", NULL);
 +
 +              if (model && !strcmp(model, "SUNW,simba"))
 +                      return 0;
 +
 +              /* Do not do PCI specific frobbing if the
 +               * PCI bridge lacks a ranges property.  We
 +               * want to pass it through up to the next
 +               * parent as-is, not with the PCI translate
 +               * method which chops off the top address cell.
 +               */
 +              if (!of_find_property(np, "ranges", NULL))
 +                      return 0;
 +
 +              return 1;
 +      }
 +
 +      return 0;
 +}
 +
 +static int of_bus_simba_match(struct device_node *np)
 +{
 +      const char *model = of_get_property(np, "model", NULL);
 +
 +      if (model && !strcmp(model, "SUNW,simba"))
 +              return 1;
 +
 +      /* Treat PCI busses lacking ranges property just like
 +       * simba.
 +       */
 +      if (!strcmp(np->name, "pci")) {
 +              if (!of_find_property(np, "ranges", NULL))
 +                      return 1;
 +      }
 +
 +      return 0;
 +}
 +
 +static int of_bus_simba_map(u32 *addr, const u32 *range,
 +                          int na, int ns, int pna)
 +{
 +      return 0;
 +}
 +
 +static void of_bus_pci_count_cells(struct device_node *np,
 +                                 int *addrc, int *sizec)
 +{
 +      if (addrc)
 +              *addrc = 3;
 +      if (sizec)
 +              *sizec = 2;
 +}
 +
 +static int of_bus_pci_map(u32 *addr, const u32 *range,
 +                        int na, int ns, int pna)
 +{
 +      u32 result[OF_MAX_ADDR_CELLS];
 +      int i;
 +
 +      /* Check address type match */
 +      if ((addr[0] ^ range[0]) & 0x03000000)
 +              return -EINVAL;
 +
 +      if (of_out_of_range(addr + 1, range + 1, range + na + pna,
 +                          na - 1, ns))
 +              return -EINVAL;
 +
 +      /* Start with the parent range base.  */
 +      memcpy(result, range + na, pna * 4);
 +
 +      /* Add in the child address offset, skipping high cell.  */
 +      for (i = 0; i < na - 1; i++)
 +              result[pna - 1 - i] +=
 +                      (addr[na - 1 - i] -
 +                       range[na - 1 - i]);
 +
 +      memcpy(addr, result, pna * 4);
 +
 +      return 0;
 +}
 +
 +static unsigned long of_bus_pci_get_flags(const u32 *addr, unsigned long flags)
 +{
 +      u32 w = addr[0];
 +
 +      /* For PCI, we override whatever child busses may have used.  */
 +      flags = 0;
 +      switch((w >> 24) & 0x03) {
 +      case 0x01:
 +              flags |= IORESOURCE_IO;
 +              break;
 +
 +      case 0x02: /* 32 bits */
 +      case 0x03: /* 64 bits */
 +              flags |= IORESOURCE_MEM;
 +              break;
 +      }
 +      if (w & 0x40000000)
 +              flags |= IORESOURCE_PREFETCH;
 +      return flags;
 +}
 +
 +/*
 + * SBUS bus specific translator
 + */
 +
 +static int of_bus_sbus_match(struct device_node *np)
 +{
 +      return !strcmp(np->name, "sbus") ||
 +              !strcmp(np->name, "sbi");
 +}
 +
 +static void of_bus_sbus_count_cells(struct device_node *child,
 +                                 int *addrc, int *sizec)
 +{
 +      if (addrc)
 +              *addrc = 2;
 +      if (sizec)
 +              *sizec = 1;
 +}
 +
 +/*
 + * FHC/Central bus specific translator.
 + *
 + * This is just needed to hard-code the address and size cell
 + * counts.  'fhc' and 'central' nodes lack the #address-cells and
 + * #size-cells properties, and if you walk to the root on such
 + * Enterprise boxes all you'll get is a #size-cells of 2 which is
 + * not what we want to use.
 + */
 +static int of_bus_fhc_match(struct device_node *np)
 +{
 +      return !strcmp(np->name, "fhc") ||
 +              !strcmp(np->name, "central");
 +}
 +
 +#define of_bus_fhc_count_cells of_bus_sbus_count_cells
 +
 +/*
 + * Array of bus specific translators
 + */
 +
 +static struct of_bus of_busses[] = {
 +      /* PCI */
 +      {
 +              .name = "pci",
 +              .addr_prop_name = "assigned-addresses",
 +              .match = of_bus_pci_match,
 +              .count_cells = of_bus_pci_count_cells,
 +              .map = of_bus_pci_map,
 +              .get_flags = of_bus_pci_get_flags,
 +      },
 +      /* SIMBA */
 +      {
 +              .name = "simba",
 +              .addr_prop_name = "assigned-addresses",
 +              .match = of_bus_simba_match,
 +              .count_cells = of_bus_pci_count_cells,
 +              .map = of_bus_simba_map,
 +              .get_flags = of_bus_pci_get_flags,
 +      },
 +      /* SBUS */
 +      {
 +              .name = "sbus",
 +              .addr_prop_name = "reg",
 +              .match = of_bus_sbus_match,
 +              .count_cells = of_bus_sbus_count_cells,
 +              .map = of_bus_default_map,
 +              .get_flags = of_bus_default_get_flags,
 +      },
 +      /* FHC */
 +      {
 +              .name = "fhc",
 +              .addr_prop_name = "reg",
 +              .match = of_bus_fhc_match,
 +              .count_cells = of_bus_fhc_count_cells,
 +              .map = of_bus_default_map,
 +              .get_flags = of_bus_default_get_flags,
 +      },
 +      /* Default */
 +      {
 +              .name = "default",
 +              .addr_prop_name = "reg",
 +              .match = NULL,
 +              .count_cells = of_bus_default_count_cells,
 +              .map = of_bus_default_map,
 +              .get_flags = of_bus_default_get_flags,
 +      },
 +};
 +
 +static struct of_bus *of_match_bus(struct device_node *np)
 +{
 +      int i;
 +
 +      for (i = 0; i < ARRAY_SIZE(of_busses); i ++)
 +              if (!of_busses[i].match || of_busses[i].match(np))
 +                      return &of_busses[i];
 +      BUG();
 +      return NULL;
 +}
 +
 +static int __init build_one_resource(struct device_node *parent,
 +                                   struct of_bus *bus,
 +                                   struct of_bus *pbus,
 +                                   u32 *addr,
 +                                   int na, int ns, int pna)
 +{
 +      const u32 *ranges;
 +      int rone, rlen;
 +
 +      ranges = of_get_property(parent, "ranges", &rlen);
 +      if (ranges == NULL || rlen == 0) {
 +              u32 result[OF_MAX_ADDR_CELLS];
 +              int i;
 +
 +              memset(result, 0, pna * 4);
 +              for (i = 0; i < na; i++)
 +                      result[pna - 1 - i] =
 +                              addr[na - 1 - i];
 +
 +              memcpy(addr, result, pna * 4);
 +              return 0;
 +      }
 +
 +      /* Now walk through the ranges */
 +      rlen /= 4;
 +      rone = na + pna + ns;
 +      for (; rlen >= rone; rlen -= rone, ranges += rone) {
 +              if (!bus->map(addr, ranges, na, ns, pna))
 +                      return 0;
 +      }
 +
 +      /* When we miss an I/O space match on PCI, just pass it up
 +       * to the next PCI bridge and/or controller.
 +       */
 +      if (!strcmp(bus->name, "pci") &&
 +          (addr[0] & 0x03000000) == 0x01000000)
 +              return 0;
 +
 +      return 1;
 +}
 +
 +static int __init use_1to1_mapping(struct device_node *pp)
 +{
 +      /* If we have a ranges property in the parent, use it.  */
 +      if (of_find_property(pp, "ranges", NULL) != NULL)
 +              return 0;
 +
 +      /* If the parent is the dma node of an ISA bus, pass
 +       * the translation up to the root.
 +       *
 +       * Some SBUS devices use intermediate nodes to express
 +       * hierarchy within the device itself.  These aren't
 +       * real bus nodes, and don't have a 'ranges' property.
 +       * But, we should still pass the translation work up
 +       * to the SBUS itself.
 +       */
 +      if (!strcmp(pp->name, "dma") ||
 +          !strcmp(pp->name, "espdma") ||
 +          !strcmp(pp->name, "ledma") ||
 +          !strcmp(pp->name, "lebuffer"))
 +              return 0;
 +
 +      /* Similarly for all PCI bridges, if we get this far
 +       * it lacks a ranges property, and this will include
 +       * cases like Simba.
 +       */
 +      if (!strcmp(pp->name, "pci"))
 +              return 0;
 +
 +      return 1;
 +}
 +
 +static int of_resource_verbose;
 +
 +static void __init build_device_resources(struct of_device *op,
 +                                        struct device *parent)
 +{
 +      struct of_device *p_op;
 +      struct of_bus *bus;
 +      int na, ns;
 +      int index, num_reg;
 +      const void *preg;
 +
 +      if (!parent)
 +              return;
 +
 +      p_op = to_of_device(parent);
 +      bus = of_match_bus(p_op->node);
 +      bus->count_cells(op->node, &na, &ns);
 +
 +      preg = of_get_property(op->node, bus->addr_prop_name, &num_reg);
 +      if (!preg || num_reg == 0)
 +              return;
 +
 +      /* Convert to num-cells.  */
 +      num_reg /= 4;
 +
 +      /* Convert to num-entries.  */
 +      num_reg /= na + ns;
 +
 +      /* Prevent overrunning the op->resource[] array.  */
 +      if (num_reg > PROMREG_MAX) {
 +              printk(KERN_WARNING "%s: Too many regs (%d), "
 +                     "limiting to %d.\n",
 +                     op->node->full_name, num_reg, PROMREG_MAX);
 +              num_reg = PROMREG_MAX;
 +      }
 +
 +      for (index = 0; index < num_reg; index++) {
 +              struct resource *r = &op->resource[index];
 +              u32 addr[OF_MAX_ADDR_CELLS];
 +              const u32 *reg = (preg + (index * ((na + ns) * 4)));
 +              struct device_node *dp = op->node;
 +              struct device_node *pp = p_op->node;
 +              struct of_bus *pbus, *dbus;
 +              u64 size, result = OF_BAD_ADDR;
 +              unsigned long flags;
 +              int dna, dns;
 +              int pna, pns;
 +
 +              size = of_read_addr(reg + na, ns);
 +              memcpy(addr, reg, na * 4);
 +
 +              flags = bus->get_flags(addr, 0);
 +
 +              if (use_1to1_mapping(pp)) {
 +                      result = of_read_addr(addr, na);
 +                      goto build_res;
 +              }
 +
 +              dna = na;
 +              dns = ns;
 +              dbus = bus;
 +
 +              while (1) {
 +                      dp = pp;
 +                      pp = dp->parent;
 +                      if (!pp) {
 +                              result = of_read_addr(addr, dna);
 +                              break;
 +                      }
 +
 +                      pbus = of_match_bus(pp);
 +                      pbus->count_cells(dp, &pna, &pns);
 +
 +                      if (build_one_resource(dp, dbus, pbus, addr,
 +                                             dna, dns, pna))
 +                              break;
 +
 +                      flags = pbus->get_flags(addr, flags);
 +
 +                      dna = pna;
 +                      dns = pns;
 +                      dbus = pbus;
 +              }
 +
 +      build_res:
 +              memset(r, 0, sizeof(*r));
 +
 +              if (of_resource_verbose)
 +                      printk("%s reg[%d] -> %lx\n",
 +                             op->node->full_name, index,
 +                             result);
 +
 +              if (result != OF_BAD_ADDR) {
 +                      if (tlb_type == hypervisor)
 +                              result &= 0x0fffffffffffffffUL;
 +
 +                      r->start = result;
 +                      r->end = result + size - 1;
 +                      r->flags = flags;
 +              }
 +              r->name = op->node->name;
 +      }
 +}
 +
 +static struct device_node * __init
 +apply_interrupt_map(struct device_node *dp, struct device_node *pp,
 +                  const u32 *imap, int imlen, const u32 *imask,
 +                  unsigned int *irq_p)
 +{
 +      struct device_node *cp;
 +      unsigned int irq = *irq_p;
 +      struct of_bus *bus;
 +      phandle handle;
 +      const u32 *reg;
 +      int na, num_reg, i;
 +
 +      bus = of_match_bus(pp);
 +      bus->count_cells(dp, &na, NULL);
 +
 +      reg = of_get_property(dp, "reg", &num_reg);
 +      if (!reg || !num_reg)
 +              return NULL;
 +
 +      imlen /= ((na + 3) * 4);
 +      handle = 0;
 +      for (i = 0; i < imlen; i++) {
 +              int j;
 +
 +              for (j = 0; j < na; j++) {
 +                      if ((reg[j] & imask[j]) != imap[j])
 +                              goto next;
 +              }
 +              if (imap[na] == irq) {
 +                      handle = imap[na + 1];
 +                      irq = imap[na + 2];
 +                      break;
 +              }
 +
 +      next:
 +              imap += (na + 3);
 +      }
 +      if (i == imlen) {
 +              /* Psycho and Sabre PCI controllers can have 'interrupt-map'
 +               * properties that do not include the on-board device
 +               * interrupts.  Instead, the device's 'interrupts' property
 +               * is already a fully specified INO value.
 +               *
 +               * Handle this by deciding that, if we didn't get a
 +               * match in the parent's 'interrupt-map', and the
 +               * parent is an IRQ translator, then use the parent as
 +               * our IRQ controller.
 +               */
 +              if (pp->irq_trans)
 +                      return pp;
 +
 +              return NULL;
 +      }
 +
 +      *irq_p = irq;
 +      cp = of_find_node_by_phandle(handle);
 +
 +      return cp;
 +}
 +
 +static unsigned int __init pci_irq_swizzle(struct device_node *dp,
 +                                         struct device_node *pp,
 +                                         unsigned int irq)
 +{
 +      const struct linux_prom_pci_registers *regs;
 +      unsigned int bus, devfn, slot, ret;
 +
 +      if (irq < 1 || irq > 4)
 +              return irq;
 +
 +      regs = of_get_property(dp, "reg", NULL);
 +      if (!regs)
 +              return irq;
 +
 +      bus = (regs->phys_hi >> 16) & 0xff;
 +      devfn = (regs->phys_hi >> 8) & 0xff;
 +      slot = (devfn >> 3) & 0x1f;
 +
 +      if (pp->irq_trans) {
 +              /* Derived from Table 8-3, U2P User's Manual.  This branch
 +               * is handling a PCI controller that lacks a proper set of
 +               * interrupt-map and interrupt-map-mask properties.  The
 +               * Ultra-E450 is one example.
 +               *
 +               * The bit layout is BSSLL, where:
 +               * B: 0 on bus A, 1 on bus B
 +               * S: 2-bit slot number, derived from PCI device number as
 +               *    (dev - 1) for bus A, or (dev - 2) for bus B
 +               * L: 2-bit line number
 +               */
 +              if (bus & 0x80) {
 +                      /* PBM-A */
 +                      bus  = 0x00;
 +                      slot = (slot - 1) << 2;
 +              } else {
 +                      /* PBM-B */
 +                      bus  = 0x10;
 +                      slot = (slot - 2) << 2;
 +              }
 +              irq -= 1;
 +
 +              ret = (bus | slot | irq);
 +      } else {
 +              /* Going through a PCI-PCI bridge that lacks a set of
 +               * interrupt-map and interrupt-map-mask properties.
 +               */
 +              ret = ((irq - 1 + (slot & 3)) & 3) + 1;
 +      }
 +
 +      return ret;
 +}
 +
 +static int of_irq_verbose;
 +
 +static unsigned int __init build_one_device_irq(struct of_device *op,
 +                                              struct device *parent,
 +                                              unsigned int irq)
 +{
 +      struct device_node *dp = op->node;
 +      struct device_node *pp, *ip;
 +      unsigned int orig_irq = irq;
 +      int nid;
 +
 +      if (irq == 0xffffffff)
 +              return irq;
 +
 +      if (dp->irq_trans) {
 +              irq = dp->irq_trans->irq_build(dp, irq,
 +                                             dp->irq_trans->data);
 +
 +              if (of_irq_verbose)
 +                      printk("%s: direct translate %x --> %x\n",
 +                             dp->full_name, orig_irq, irq);
 +
 +              goto out;
 +      }
 +
 +      /* Something more complicated.  Walk up to the root, applying
 +       * interrupt-map or bus specific translations, until we hit
 +       * an IRQ translator.
 +       *
 +       * If we hit a bus type or situation we cannot handle, we
 +       * stop and assume that the original IRQ number was in a
 +       * format which has special meaning to its immediate parent.
 +       */
 +      pp = dp->parent;
 +      ip = NULL;
 +      while (pp) {
 +              const void *imap, *imsk;
 +              int imlen;
 +
 +              imap = of_get_property(pp, "interrupt-map", &imlen);
 +              imsk = of_get_property(pp, "interrupt-map-mask", NULL);
 +              if (imap && imsk) {
 +                      struct device_node *iret;
 +                      int this_orig_irq = irq;
 +
 +                      iret = apply_interrupt_map(dp, pp,
 +                                                 imap, imlen, imsk,
 +                                                 &irq);
 +
 +                      if (of_irq_verbose)
 +                              printk("%s: Apply [%s:%x] imap --> [%s:%x]\n",
 +                                     op->node->full_name,
 +                                     pp->full_name, this_orig_irq,
 +                                     (iret ? iret->full_name : "NULL"), irq);
 +
 +                      if (!iret)
 +                              break;
 +
 +                      if (iret->irq_trans) {
 +                              ip = iret;
 +                              break;
 +                      }
 +              } else {
 +                      if (!strcmp(pp->name, "pci")) {
 +                              unsigned int this_orig_irq = irq;
 +
 +                              irq = pci_irq_swizzle(dp, pp, irq);
 +                              if (of_irq_verbose)
 +                                      printk("%s: PCI swizzle [%s] "
 +                                             "%x --> %x\n",
 +                                             op->node->full_name,
 +                                             pp->full_name, this_orig_irq,
 +                                             irq);
 +
 +                      }
 +
 +                      if (pp->irq_trans) {
 +                              ip = pp;
 +                              break;
 +                      }
 +              }
 +              dp = pp;
 +              pp = pp->parent;
 +      }
 +      if (!ip)
 +              return orig_irq;
 +
 +      irq = ip->irq_trans->irq_build(op->node, irq,
 +                                     ip->irq_trans->data);
 +      if (of_irq_verbose)
 +              printk("%s: Apply IRQ trans [%s] %x --> %x\n",
 +                     op->node->full_name, ip->full_name, orig_irq, irq);
 +
 +out:
 +      nid = of_node_to_nid(dp);
 +      if (nid != -1) {
 +              cpumask_t numa_mask = node_to_cpumask(nid);
 +
-               irq_set_affinity(irq, numa_mask);
++              irq_set_affinity(irq, &numa_mask);
 +      }
 +
 +      return irq;
 +}
 +
 +static struct of_device * __init scan_one_device(struct device_node *dp,
 +                                               struct device *parent)
 +{
 +      struct of_device *op = kzalloc(sizeof(*op), GFP_KERNEL);
 +      const unsigned int *irq;
 +      struct dev_archdata *sd;
 +      int len, i;
 +
 +      if (!op)
 +              return NULL;
 +
 +      sd = &op->dev.archdata;
 +      sd->prom_node = dp;
 +      sd->op = op;
 +
 +      op->node = dp;
 +
 +      op->clock_freq = of_getintprop_default(dp, "clock-frequency",
 +                                             (25*1000*1000));
 +      op->portid = of_getintprop_default(dp, "upa-portid", -1);
 +      if (op->portid == -1)
 +              op->portid = of_getintprop_default(dp, "portid", -1);
 +
 +      irq = of_get_property(dp, "interrupts", &len);
 +      if (irq) {
 +              op->num_irqs = len / 4;
 +
 +              /* Prevent overrunning the op->irqs[] array.  */
 +              if (op->num_irqs > PROMINTR_MAX) {
 +                      printk(KERN_WARNING "%s: Too many irqs (%d), "
 +                             "limiting to %d.\n",
 +                             dp->full_name, op->num_irqs, PROMINTR_MAX);
 +                      op->num_irqs = PROMINTR_MAX;
 +              }
 +              memcpy(op->irqs, irq, op->num_irqs * 4);
 +      } else {
 +              op->num_irqs = 0;
 +      }
 +
 +      build_device_resources(op, parent);
 +      for (i = 0; i < op->num_irqs; i++)
 +              op->irqs[i] = build_one_device_irq(op, parent, op->irqs[i]);
 +
 +      op->dev.parent = parent;
 +      op->dev.bus = &of_platform_bus_type;
 +      if (!parent)
 +              dev_set_name(&op->dev, "root");
 +      else
 +              dev_set_name(&op->dev, "%08x", dp->node);
 +
 +      if (of_device_register(op)) {
 +              printk("%s: Could not register of device.\n",
 +                     dp->full_name);
 +              kfree(op);
 +              op = NULL;
 +      }
 +
 +      return op;
 +}
 +
 +static void __init scan_tree(struct device_node *dp, struct device *parent)
 +{
 +      while (dp) {
 +              struct of_device *op = scan_one_device(dp, parent);
 +
 +              if (op)
 +                      scan_tree(dp->child, &op->dev);
 +
 +              dp = dp->sibling;
 +      }
 +}
 +
 +static void __init scan_of_devices(void)
 +{
 +      struct device_node *root = of_find_node_by_path("/");
 +      struct of_device *parent;
 +
 +      parent = scan_one_device(root, NULL);
 +      if (!parent)
 +              return;
 +
 +      scan_tree(root->child, &parent->dev);
 +}
 +
 +static int __init of_bus_driver_init(void)
 +{
 +      int err;
 +
 +      err = of_bus_type_init(&of_platform_bus_type, "of");
 +      if (!err)
 +              scan_of_devices();
 +
 +      return err;
 +}
 +
 +postcore_initcall(of_bus_driver_init);
 +
 +static int __init of_debug(char *str)
 +{
 +      int val = 0;
 +
 +      get_option(&str, &val);
 +      if (val & 1)
 +              of_resource_verbose = 1;
 +      if (val & 2)
 +              of_irq_verbose = 1;
 +      return 1;
 +}
 +
 +__setup("of_debug=", of_debug);
Simple merge
Simple merge
index bfe99d8,0000000..4632979
mode 100644,000000..100644
--- /dev/null
@@@ -1,1412 -1,0 +1,1408 @@@
 +/* smp.c: Sparc64 SMP support.
 + *
 + * Copyright (C) 1997, 2007, 2008 David S. Miller (davem@davemloft.net)
 + */
 +
 +#include <linux/module.h>
 +#include <linux/kernel.h>
 +#include <linux/sched.h>
 +#include <linux/mm.h>
 +#include <linux/pagemap.h>
 +#include <linux/threads.h>
 +#include <linux/smp.h>
 +#include <linux/interrupt.h>
 +#include <linux/kernel_stat.h>
 +#include <linux/delay.h>
 +#include <linux/init.h>
 +#include <linux/spinlock.h>
 +#include <linux/fs.h>
 +#include <linux/seq_file.h>
 +#include <linux/cache.h>
 +#include <linux/jiffies.h>
 +#include <linux/profile.h>
 +#include <linux/lmb.h>
 +#include <linux/cpu.h>
 +
 +#include <asm/head.h>
 +#include <asm/ptrace.h>
 +#include <asm/atomic.h>
 +#include <asm/tlbflush.h>
 +#include <asm/mmu_context.h>
 +#include <asm/cpudata.h>
 +#include <asm/hvtramp.h>
 +#include <asm/io.h>
 +#include <asm/timer.h>
 +
 +#include <asm/irq.h>
 +#include <asm/irq_regs.h>
 +#include <asm/page.h>
 +#include <asm/pgtable.h>
 +#include <asm/oplib.h>
 +#include <asm/uaccess.h>
 +#include <asm/starfire.h>
 +#include <asm/tlb.h>
 +#include <asm/sections.h>
 +#include <asm/prom.h>
 +#include <asm/mdesc.h>
 +#include <asm/ldc.h>
 +#include <asm/hypervisor.h>
 +
 +int sparc64_multi_core __read_mostly;
 +
- cpumask_t cpu_possible_map __read_mostly = CPU_MASK_NONE;
- cpumask_t cpu_online_map __read_mostly = CPU_MASK_NONE;
 +DEFINE_PER_CPU(cpumask_t, cpu_sibling_map) = CPU_MASK_NONE;
 +cpumask_t cpu_core_map[NR_CPUS] __read_mostly =
 +      { [0 ... NR_CPUS-1] = CPU_MASK_NONE };
 +
- EXPORT_SYMBOL(cpu_possible_map);
- EXPORT_SYMBOL(cpu_online_map);
 +EXPORT_PER_CPU_SYMBOL(cpu_sibling_map);
 +EXPORT_SYMBOL(cpu_core_map);
 +
 +static cpumask_t smp_commenced_mask;
 +
 +void smp_info(struct seq_file *m)
 +{
 +      int i;
 +      
 +      seq_printf(m, "State:\n");
 +      for_each_online_cpu(i)
 +              seq_printf(m, "CPU%d:\t\tonline\n", i);
 +}
 +
 +void smp_bogo(struct seq_file *m)
 +{
 +      int i;
 +      
 +      for_each_online_cpu(i)
 +              seq_printf(m,
 +                         "Cpu%dClkTck\t: %016lx\n",
 +                         i, cpu_data(i).clock_tick);
 +}
 +
 +extern void setup_sparc64_timer(void);
 +
 +static volatile unsigned long callin_flag = 0;
 +/* Check-in routine for a cpu that is being brought up.
 + *
 + * It installs the per-cpu offset for this cpu, performs the local TLB,
 + * timer and (optionally) P-cache setup, enables interrupts and then
 + * raises callin_flag, which smp_boot_one_cpu() polls.  After attaching
 + * to init_mm and running the cpu-starting notifiers it spins until its
 + * bit shows up in smp_commenced_mask, and only then sets itself in
 + * cpu_online_map (under ipi_call_lock).  Returns with preemption
 + * disabled.
 + */
 +void __cpuinit smp_callin(void)
 +{
 +      int cpuid = hard_smp_processor_id();
 +
 +      __local_per_cpu_offset = __per_cpu_offset(cpuid);
 +
 +      if (tlb_type == hypervisor)
 +              sun4v_ktsb_register();
 +
 +      __flush_tlb_all();
 +
 +      setup_sparc64_timer();
 +
 +      if (cheetah_pcache_forced_on)
 +              cheetah_enable_pcache();
 +
 +      local_irq_enable();
 +
 +      callin_flag = 1;        /* polled by smp_boot_one_cpu() */
 +      __asm__ __volatile__("membar #Sync\n\t"
 +                           "flush  %%g6" : : : "memory");
 +
 +      /* Clear this or we will die instantly when we
 +       * schedule back to this idler...
 +       */
 +      current_thread_info()->new_child = 0;
 +
 +      /* Attach to the address space of init_task. */
 +      atomic_inc(&init_mm.mm_count);
 +      current->active_mm = &init_mm;
 +
 +      /* inform the notifiers about the new cpu */
 +      notify_cpu_starting(cpuid);
 +
 +      /* Wait here until our bit is set in smp_commenced_mask. */
 +      while (!cpu_isset(cpuid, smp_commenced_mask))
 +              rmb();
 +
 +      ipi_call_lock();
 +      cpu_set(cpuid, cpu_online_map);
 +      ipi_call_unlock();
 +
 +      /* idle thread is expected to have preempt disabled */
 +      preempt_disable();
 +}
 +
 +void cpu_panic(void)
 +{
 +      printk("CPU[%d]: Returns from cpu_idle!\n", smp_processor_id());
 +      panic("SMP bolixed\n");
 +}
 +
 +/* This tick register synchronization scheme is taken entirely from
 + * the ia64 port, see arch/ia64/kernel/smpboot.c for details and credit.
 + *
 + * The only change I've made is to rework it so that the master
 + * initiates the synchronization instead of the slave. -DaveM
 + */
 +
 +#define MASTER        0
 +#define SLAVE (SMP_CACHE_BYTES/sizeof(unsigned long))
 +
 +#define NUM_ROUNDS    64      /* magic value */
 +#define NUM_ITERS     5       /* likewise */
 +
 +static DEFINE_SPINLOCK(itc_sync_lock);
 +static unsigned long go[SLAVE + 1];
 +
 +#define DEBUG_TICK_SYNC       0
 +/* Run NUM_ITERS request/response exchanges through go[MASTER] and
 + * go[SLAVE] and keep the exchange with the shortest local round trip
 + * (t1 - t0).  The value read from go[SLAVE] is the tick that
 + * smp_synchronize_one_tick() stored there.
 + *
 + * *rt     receives the best round-trip time,
 + * *master receives the peer's tick minus the local start tick,
 + * and the return value is the midpoint of the best local interval
 + * minus the peer's tick.
 + */
 +static inline long get_delta (long *rt, long *master)
 +{
 +      unsigned long best_t0 = 0, best_t1 = ~0UL, best_tm = 0;
 +      unsigned long tcenter, t0, t1, tm;
 +      unsigned long i;
 +
 +      for (i = 0; i < NUM_ITERS; i++) {
 +              t0 = tick_ops->get_tick();
 +              go[MASTER] = 1;         /* ask the peer for its tick */
 +              membar_safe("#StoreLoad");
 +              while (!(tm = go[SLAVE]))
 +                      rmb();
 +              go[SLAVE] = 0;          /* consume the reply */
 +              wmb();
 +              t1 = tick_ops->get_tick();
 +
 +              if (t1 - t0 < best_t1 - best_t0)
 +                      best_t0 = t0, best_t1 = t1, best_tm = tm;
 +      }
 +
 +      *rt = best_t1 - best_t0;
 +      *master = best_tm - best_t0;
 +
 +      /* average best_t0 and best_t1 without overflow: */
 +      tcenter = (best_t0/2 + best_t1/2);
 +      if (best_t0 % 2 + best_t1 % 2 == 2)
 +              tcenter++;
 +      return tcenter - best_tm;
 +}
 +
 +void smp_synchronize_tick_client(void)
 +{
 +      long i, delta, adj, adjust_latency = 0, done = 0;
 +      unsigned long flags, rt, master_time_stamp, bound;
 +#if DEBUG_TICK_SYNC
 +      struct {
 +              long rt;        /* roundtrip time */
 +              long master;    /* master's timestamp */
 +              long diff;      /* difference between midpoint and master's timestamp */
 +              long lat;       /* estimate of itc adjustment latency */
 +      } t[NUM_ROUNDS];
 +#endif
 +
 +      go[MASTER] = 1;
 +
 +      while (go[MASTER])
 +              rmb();
 +
 +      local_irq_save(flags);
 +      {
 +              for (i = 0; i < NUM_ROUNDS; i++) {
 +                      delta = get_delta(&rt, &master_time_stamp);
 +                      if (delta == 0) {
 +                              done = 1;       /* let's lock on to this... */
 +                              bound = rt;
 +                      }
 +
 +                      if (!done) {
 +                              if (i > 0) {
 +                                      adjust_latency += -delta;
 +                                      adj = -delta + adjust_latency/4;
 +                              } else
 +                                      adj = -delta;
 +
 +                              tick_ops->add_tick(adj);
 +                      }
 +#if DEBUG_TICK_SYNC
 +                      t[i].rt = rt;
 +                      t[i].master = master_time_stamp;
 +                      t[i].diff = delta;
 +                      t[i].lat = adjust_latency/4;
 +#endif
 +              }
 +      }
 +      local_irq_restore(flags);
 +
 +#if DEBUG_TICK_SYNC
 +      for (i = 0; i < NUM_ROUNDS; i++)
 +              printk("rt=%5ld master=%5ld diff=%5ld adjlat=%5ld\n",
 +                     t[i].rt, t[i].master, t[i].diff, t[i].lat);
 +#endif
 +
 +      printk(KERN_INFO "CPU %d: synchronized TICK with master CPU "
 +             "(last diff %ld cycles, maxerr %lu cycles)\n",
 +             smp_processor_id(), delta, rt);
 +}
 +
 +static void smp_start_sync_tick_client(int cpu);
 +/* Master half of the tick synchronization handshake.
 + *
 + * Sends the sync-tick cross call to cpu, waits for the client to raise
 + * go[MASTER], releases it, and then answers NUM_ROUNDS*NUM_ITERS
 + * requests.  Each answer is the current tick_ops->get_tick() value
 + * stored into go[SLAVE].  The answer loop runs under itc_sync_lock with
 + * interrupts disabled.
 + */
 +static void smp_synchronize_one_tick(int cpu)
 +{
 +      unsigned long flags, i;
 +
 +      go[MASTER] = 0;
 +
 +      smp_start_sync_tick_client(cpu);
 +
 +      /* wait for client to be ready */
 +      while (!go[MASTER])
 +              rmb();
 +
 +      /* now let the client proceed into his loop */
 +      go[MASTER] = 0;
 +      membar_safe("#StoreLoad");
 +
 +      spin_lock_irqsave(&itc_sync_lock, flags);
 +      {
 +              for (i = 0; i < NUM_ROUNDS*NUM_ITERS; i++) {
 +                      while (!go[MASTER])     /* wait for a request */
 +                              rmb();
 +                      go[MASTER] = 0;
 +                      wmb();
 +                      go[SLAVE] = tick_ops->get_tick();       /* reply */
 +                      membar_safe("#StoreLoad");
 +              }
 +      }
 +      spin_unlock_irqrestore(&itc_sync_lock, flags);
 +}
 +
 +#if defined(CONFIG_SUN_LDOMS) && defined(CONFIG_HOTPLUG_CPU)
 +/* XXX Put this in some common place. XXX */
 +static unsigned long kimage_addr_to_ra(void *p)
 +{
 +      unsigned long val = (unsigned long) p;
 +
 +      return kern_base + (val - KERNBASE);
 +}
 +
 +static void __cpuinit ldom_startcpu_cpuid(unsigned int cpu, unsigned long thread_reg)
 +{
 +      extern unsigned long sparc64_ttable_tl0;
 +      extern unsigned long kern_locked_tte_data;
 +      struct hvtramp_descr *hdesc;
 +      unsigned long trampoline_ra;
 +      struct trap_per_cpu *tb;
 +      u64 tte_vaddr, tte_data;
 +      unsigned long hv_err;
 +      int i;
 +
 +      hdesc = kzalloc(sizeof(*hdesc) +
 +                      (sizeof(struct hvtramp_mapping) *
 +                       num_kernel_image_mappings - 1),
 +                      GFP_KERNEL);
 +      if (!hdesc) {
 +              printk(KERN_ERR "ldom_startcpu_cpuid: Cannot allocate "
 +                     "hvtramp_descr.\n");
 +              return;
 +      }
 +
 +      hdesc->cpu = cpu;
 +      hdesc->num_mappings = num_kernel_image_mappings;
 +
 +      tb = &trap_block[cpu];
 +      tb->hdesc = hdesc;
 +
 +      hdesc->fault_info_va = (unsigned long) &tb->fault_info;
 +      hdesc->fault_info_pa = kimage_addr_to_ra(&tb->fault_info);
 +
 +      hdesc->thread_reg = thread_reg;
 +
 +      tte_vaddr = (unsigned long) KERNBASE;
 +      tte_data = kern_locked_tte_data;
 +
 +      for (i = 0; i < hdesc->num_mappings; i++) {
 +              hdesc->maps[i].vaddr = tte_vaddr;
 +              hdesc->maps[i].tte   = tte_data;
 +              tte_vaddr += 0x400000;
 +              tte_data  += 0x400000;
 +      }
 +
 +      trampoline_ra = kimage_addr_to_ra(hv_cpu_startup);
 +
 +      hv_err = sun4v_cpu_start(cpu, trampoline_ra,
 +                               kimage_addr_to_ra(&sparc64_ttable_tl0),
 +                               __pa(hdesc));
 +      if (hv_err)
 +              printk(KERN_ERR "ldom_startcpu_cpuid: sun4v_cpu_start() "
 +                     "gives error %lu\n", hv_err);
 +}
 +#endif
 +
 +extern unsigned long sparc64_cpu_startup;
 +
 +/* The OBP cpu startup callback truncates the 3rd arg cookie to
 + * 32-bits (I think) so to be safe we have it read the pointer
 + * contained here so we work on >4GB machines. -DaveM
 + */
 +static struct thread_info *cpu_new_thread = NULL;
 +
 +static int __cpuinit smp_boot_one_cpu(unsigned int cpu)
 +{
 +      struct trap_per_cpu *tb = &trap_block[cpu];
 +      unsigned long entry =
 +              (unsigned long)(&sparc64_cpu_startup);
 +      unsigned long cookie =
 +              (unsigned long)(&cpu_new_thread);
 +      struct task_struct *p;
 +      int timeout, ret;
 +
 +      p = fork_idle(cpu);
 +      if (IS_ERR(p))
 +              return PTR_ERR(p);
 +      callin_flag = 0;
 +      cpu_new_thread = task_thread_info(p);
 +
 +      if (tlb_type == hypervisor) {
 +#if defined(CONFIG_SUN_LDOMS) && defined(CONFIG_HOTPLUG_CPU)
 +              if (ldom_domaining_enabled)
 +                      ldom_startcpu_cpuid(cpu,
 +                                          (unsigned long) cpu_new_thread);
 +              else
 +#endif
 +                      prom_startcpu_cpuid(cpu, entry, cookie);
 +      } else {
 +              struct device_node *dp = of_find_node_by_cpuid(cpu);
 +
 +              prom_startcpu(dp->node, entry, cookie);
 +      }
 +
 +      for (timeout = 0; timeout < 50000; timeout++) {
 +              if (callin_flag)
 +                      break;
 +              udelay(100);
 +      }
 +
 +      if (callin_flag) {
 +              ret = 0;
 +      } else {
 +              printk("Processor %d is stuck.\n", cpu);
 +              ret = -ENODEV;
 +      }
 +      cpu_new_thread = NULL;
 +
 +      if (tb->hdesc) {
 +              kfree(tb->hdesc);
 +              tb->hdesc = NULL;
 +      }
 +
 +      return ret;
 +}
 +/* Dispatch one three-word mondo (data0..data2) to a single cpu.
 + *
 + * On Starfire the cpu number is first remapped to the real UPA id.
 + * With PSTATE_IE toggled off, the data words and the dispatch target
 + * are written through ASI_INTR_W and ASI_INTR_DISPATCH_STAT is polled:
 + *  - a status of zero restores pstate and returns;
 + *  - if bit 0 drops while the status is still non-zero, pstate is
 + *    restored and the dispatch is retried after a 2us delay;
 + *  - after 100000 polls with bit 0 still set, pstate is restored and
 + *    the failure is logged without retrying.
 + *
 + * pstate is the caller's saved %pstate value.
 + */
 +static void spitfire_xcall_helper(u64 data0, u64 data1, u64 data2, u64 pstate, unsigned long cpu)
 +{
 +      u64 result, target;
 +      int stuck, tmp;
 +
 +      if (this_is_starfire) {
 +              /* map to real upaid */
 +              cpu = (((cpu & 0x3c) << 1) |
 +                      ((cpu & 0x40) >> 4) |
 +                      (cpu & 0x3));
 +      }
 +
 +      target = (cpu << 14) | 0x70;
 +again:
 +      /* Ok, this is the real Spitfire Errata #54.
 +       * One must read back from a UDB internal register
 +       * after writes to the UDB interrupt dispatch, but
 +       * before the membar Sync for that write.
 +       * So we use the high UDB control register (ASI 0x7f,
 +       * ADDR 0x20) for the dummy read. -DaveM
 +       */
 +      tmp = 0x40;
 +      __asm__ __volatile__(
 +      "wrpr   %1, %2, %%pstate\n\t"
 +      "stxa   %4, [%0] %3\n\t"
 +      "stxa   %5, [%0+%8] %3\n\t"
 +      "add    %0, %8, %0\n\t"
 +      "stxa   %6, [%0+%8] %3\n\t"
 +      "membar #Sync\n\t"
 +      "stxa   %%g0, [%7] %3\n\t"
 +      "membar #Sync\n\t"
 +      "mov    0x20, %%g1\n\t"
 +      "ldxa   [%%g1] 0x7f, %%g0\n\t"
 +      "membar #Sync"
 +      : "=r" (tmp)
 +      : "r" (pstate), "i" (PSTATE_IE), "i" (ASI_INTR_W),
 +        "r" (data0), "r" (data1), "r" (data2), "r" (target),
 +        "r" (0x10), "0" (tmp)
 +        : "g1");
 +
 +      /* NOTE: PSTATE_IE is still clear. */
 +      stuck = 100000;
 +      do {
 +              __asm__ __volatile__("ldxa [%%g0] %1, %0"
 +                      : "=r" (result)
 +                      : "i" (ASI_INTR_DISPATCH_STAT));
 +              if (result == 0) {
 +                      /* Delivered: restore pstate and finish. */
 +                      __asm__ __volatile__("wrpr %0, 0x0, %%pstate"
 +                                           : : "r" (pstate));
 +                      return;
 +              }
 +              stuck -= 1;
 +              if (stuck == 0)
 +                      break;
 +      } while (result & 0x1);
 +      __asm__ __volatile__("wrpr %0, 0x0, %%pstate"
 +                           : : "r" (pstate));
 +      if (stuck == 0) {
 +              printk("CPU[%d]: mondo stuckage result[%016lx]\n",
 +                     smp_processor_id(), result);
 +      } else {
 +              udelay(2);
 +              goto again;
 +      }
 +}
 +
 +static void spitfire_xcall_deliver(struct trap_per_cpu *tb, int cnt)
 +{
 +      u64 *mondo, data0, data1, data2;
 +      u16 *cpu_list;
 +      u64 pstate;
 +      int i;
 +
 +      __asm__ __volatile__("rdpr %%pstate, %0" : "=r" (pstate));
 +      cpu_list = __va(tb->cpu_list_pa);
 +      mondo = __va(tb->cpu_mondo_block_pa);
 +      data0 = mondo[0];
 +      data1 = mondo[1];
 +      data2 = mondo[2];
 +      for (i = 0; i < cnt; i++)
 +              spitfire_xcall_helper(data0, data1, data2, pstate, cpu_list[i]);
 +}
 +
 +/* Cheetah allows sending the whole 64 bytes of data in the interrupt
 + * packet, but we have no use for that.  However we do take advantage of
 + * the new pipelining feature (ie. dispatch to multiple cpus simultaneously).
 + *
 + * The three mondo words are loaded into the dispatch data registers and
 + * up to 32 list entries that are not 0xffff are dispatched per pass.
 + * The dispatch status is then polled:
 + *  - when no busy or nack bit of this pass is set, pstate is restored
 + *    and, if the pass was cut off at 32 targets, the first 32 live
 + *    entries are marked 0xffff and another pass is started;
 + *  - when busy bits are still set after 100000 polls per dispatched
 + *    target, the failure is logged and the function gives up;
 + *  - otherwise (only nacks left) entries whose nack bit is clear are
 + *    marked 0xffff and the remaining ones are retried after a delay.
 + *
 + * Note that entries of the cpu list are overwritten with 0xffff as
 + * targets complete.
 + */
 +static void cheetah_xcall_deliver(struct trap_per_cpu *tb, int cnt)
 +{
 +      int nack_busy_id, is_jbus, need_more;
 +      u64 *mondo, pstate, ver, busy_mask;
 +      u16 *cpu_list;
 +
 +      cpu_list = __va(tb->cpu_list_pa);
 +      mondo = __va(tb->cpu_mondo_block_pa);
 +
 +      /* Unfortunately, someone at Sun had the brilliant idea to make the
 +       * busy/nack fields hard-coded by ITID number for this Ultra-III
 +       * derivative processor.
 +       */
 +      __asm__ ("rdpr %%ver, %0" : "=r" (ver));
 +      is_jbus = ((ver >> 32) == __JALAPENO_ID ||
 +                 (ver >> 32) == __SERRANO_ID);
 +
 +      __asm__ __volatile__("rdpr %%pstate, %0" : "=r" (pstate));
 +
 +retry:
 +      need_more = 0;
 +      __asm__ __volatile__("wrpr %0, %1, %%pstate\n\t"
 +                           : : "r" (pstate), "i" (PSTATE_IE));
 +
 +      /* Setup the dispatch data registers. */
 +      __asm__ __volatile__("stxa      %0, [%3] %6\n\t"
 +                           "stxa      %1, [%4] %6\n\t"
 +                           "stxa      %2, [%5] %6\n\t"
 +                           "membar    #Sync\n\t"
 +                           : /* no outputs */
 +                           : "r" (mondo[0]), "r" (mondo[1]), "r" (mondo[2]),
 +                             "r" (0x40), "r" (0x50), "r" (0x60),
 +                             "i" (ASI_INTR_W));
 +
 +      nack_busy_id = 0;
 +      busy_mask = 0;
 +      {
 +              int i;
 +
 +              for (i = 0; i < cnt; i++) {
 +                      u64 target, nr;
 +
 +                      nr = cpu_list[i];
 +                      if (nr == 0xffff)       /* already handled */
 +                              continue;
 +
 +                      target = (nr << 14) | 0x70;
 +                      if (is_jbus) {
 +                              busy_mask |= (0x1UL << (nr * 2));
 +                      } else {
 +                              target |= (nack_busy_id << 24);
 +                              busy_mask |= (0x1UL <<
 +                                            (nack_busy_id * 2));
 +                      }
 +                      __asm__ __volatile__(
 +                              "stxa   %%g0, [%0] %1\n\t"
 +                              "membar #Sync\n\t"
 +                              : /* no outputs */
 +                              : "r" (target), "i" (ASI_INTR_W));
 +                      nack_busy_id++;
 +                      if (nack_busy_id == 32) {
 +                              need_more = 1;
 +                              break;
 +                      }
 +              }
 +      }
 +
 +      /* Now, poll for completion. */
 +      {
 +              u64 dispatch_stat, nack_mask;
 +              long stuck;
 +
 +              stuck = 100000 * nack_busy_id;
 +              nack_mask = busy_mask << 1;
 +              do {
 +                      __asm__ __volatile__("ldxa      [%%g0] %1, %0"
 +                                           : "=r" (dispatch_stat)
 +                                           : "i" (ASI_INTR_DISPATCH_STAT));
 +                      if (!(dispatch_stat & (busy_mask | nack_mask))) {
 +                              __asm__ __volatile__("wrpr %0, 0x0, %%pstate"
 +                                                   : : "r" (pstate));
 +                              if (unlikely(need_more)) {
 +                                      /* Retire the batch just sent, then
 +                                       * go around for the rest.
 +                                       */
 +                                      int i, this_cnt = 0;
 +                                      for (i = 0; i < cnt; i++) {
 +                                              if (cpu_list[i] == 0xffff)
 +                                                      continue;
 +                                              cpu_list[i] = 0xffff;
 +                                              this_cnt++;
 +                                              if (this_cnt == 32)
 +                                                      break;
 +                                      }
 +                                      goto retry;
 +                              }
 +                              return;
 +                      }
 +                      if (!--stuck)
 +                              break;
 +              } while (dispatch_stat & busy_mask);
 +
 +              __asm__ __volatile__("wrpr %0, 0x0, %%pstate"
 +                                   : : "r" (pstate));
 +
 +              if (dispatch_stat & busy_mask) {
 +                      /* Busy bits will not clear, continue instead
 +                       * of freezing up on this cpu.
 +                       */
 +                      printk("CPU[%d]: mondo stuckage result[%016lx]\n",
 +                             smp_processor_id(), dispatch_stat);
 +              } else {
 +                      int i, this_busy_nack = 0;
 +
 +                      /* Delay some random time with interrupts enabled
 +                       * to prevent deadlock.
 +                       */
 +                      udelay(2 * nack_busy_id);
 +
 +                      /* Clear out the mask bits for cpus which did not
 +                       * NACK us.
 +                       */
 +                      for (i = 0; i < cnt; i++) {
 +                              u64 check_mask, nr;
 +
 +                              nr = cpu_list[i];
 +                              if (nr == 0xffff)
 +                                      continue;
 +
 +                              if (is_jbus)
 +                                      check_mask = (0x2UL << (2*nr));
 +                              else
 +                                      check_mask = (0x2UL <<
 +                                                    this_busy_nack);
 +                              if ((dispatch_stat & check_mask) == 0)
 +                                      cpu_list[i] = 0xffff;
 +                              this_busy_nack += 2;
 +                              if (this_busy_nack == 64)
 +                                      break;
 +                      }
 +
 +                      goto retry;
 +              }
 +      }
 +}
 +
 +/* Multi-cpu list version.
 + *
 + * Passes the cpu list and mondo block to sun4v_cpu_mondo_send() and
 + * repeats the call until it returns HV_EOK.  Between attempts:
 + *  - entries equal to 0xffff are counted as delivered; an attempt that
 + *    delivered nothing new bumps the retry counter and delays;
 + *  - on HV_ECPUERROR every remaining target is checked with
 + *    sun4v_cpu_state() and those in error state are dropped from the
 + *    list (reported once the loop finishes);
 + *  - any status other than HV_EWOULDBLOCK is logged as unexpected;
 + *  - more than 10000 attempts without progress is logged as a timeout.
 + * The last two cases also dump the cpu list.  All failures are only
 + * logged; nothing is returned to the caller.
 + */
 +static void hypervisor_xcall_deliver(struct trap_per_cpu *tb, int cnt)
 +{
 +      int retries, this_cpu, prev_sent, i, saw_cpu_error;
 +      unsigned long status;
 +      u16 *cpu_list;
 +
 +      this_cpu = smp_processor_id();
 +
 +      cpu_list = __va(tb->cpu_list_pa);
 +
 +      saw_cpu_error = 0;      /* holds (cpu + 1) of a cpu seen in error state */
 +      retries = 0;
 +      prev_sent = 0;
 +      do {
 +              int forward_progress, n_sent;
 +
 +              status = sun4v_cpu_mondo_send(cnt,
 +                                            tb->cpu_list_pa,
 +                                            tb->cpu_mondo_block_pa);
 +
 +              /* HV_EOK means all cpus received the xcall, we're done.  */
 +              if (likely(status == HV_EOK))
 +                      break;
 +
 +              /* First, see if we made any forward progress.
 +               *
 +               * The hypervisor indicates successful sends by setting
 +               * cpu list entries to the value 0xffff.
 +               */
 +              n_sent = 0;
 +              for (i = 0; i < cnt; i++) {
 +                      if (likely(cpu_list[i] == 0xffff))
 +                              n_sent++;
 +              }
 +
 +              forward_progress = 0;
 +              if (n_sent > prev_sent)
 +                      forward_progress = 1;
 +
 +              prev_sent = n_sent;
 +
 +              /* If we get a HV_ECPUERROR, then one or more of the cpus
 +               * in the list are in error state.  Use the cpu_state()
 +               * hypervisor call to find out which cpus are in error state.
 +               */
 +              if (unlikely(status == HV_ECPUERROR)) {
 +                      for (i = 0; i < cnt; i++) {
 +                              long err;
 +                              u16 cpu;
 +
 +                              cpu = cpu_list[i];
 +                              if (cpu == 0xffff)
 +                                      continue;
 +
 +                              err = sun4v_cpu_state(cpu);
 +                              if (err == HV_CPU_STATE_ERROR) {
 +                                      saw_cpu_error = (cpu + 1);
 +                                      cpu_list[i] = 0xffff;
 +                              }
 +                      }
 +              } else if (unlikely(status != HV_EWOULDBLOCK))
 +                      goto fatal_mondo_error;
 +
 +              /* Don't bother rewriting the CPU list, just leave the
 +               * 0xffff and non-0xffff entries in there and the
 +               * hypervisor will do the right thing.
 +               *
 +               * Only advance timeout state if we didn't make any
 +               * forward progress.
 +               */
 +              if (unlikely(!forward_progress)) {
 +                      if (unlikely(++retries > 10000))
 +                              goto fatal_mondo_timeout;
 +
 +                      /* Delay a little bit to let other cpus catch up
 +                       * on their cpu mondo queue work.
 +                       */
 +                      udelay(2 * cnt);
 +              }
 +      } while (1);
 +
 +      if (unlikely(saw_cpu_error))
 +              goto fatal_mondo_cpu_error;
 +
 +      return;
 +
 +fatal_mondo_cpu_error:
 +      printk(KERN_CRIT "CPU[%d]: SUN4V mondo cpu error, some target cpus "
 +             "(including %d) were in error state\n",
 +             this_cpu, saw_cpu_error - 1);
 +      return;
 +
 +fatal_mondo_timeout:
 +      printk(KERN_CRIT "CPU[%d]: SUN4V mondo timeout, no forward "
 +             " progress after %d retries.\n",
 +             this_cpu, retries);
 +      goto dump_cpu_list_and_out;
 +
 +fatal_mondo_error:
 +      printk(KERN_CRIT "CPU[%d]: Unexpected SUN4V mondo error %lu\n",
 +             this_cpu, status);
 +      printk(KERN_CRIT "CPU[%d]: Args were cnt(%d) cpulist_pa(%lx) "
 +             "mondo_block_pa(%lx)\n",
 +             this_cpu, cnt, tb->cpu_list_pa, tb->cpu_mondo_block_pa);
 +
 +dump_cpu_list_and_out:
 +      printk(KERN_CRIT "CPU[%d]: CPU list [ ", this_cpu);
 +      for (i = 0; i < cnt; i++)
 +              printk("%u ", cpu_list[i]);
 +      printk("]\n");
 +}
 +
 +static void (*xcall_deliver_impl)(struct trap_per_cpu *, int);
 +
 +static void xcall_deliver(u64 data0, u64 data1, u64 data2, const cpumask_t *mask)
 +{
 +      struct trap_per_cpu *tb;
 +      int this_cpu, i, cnt;
 +      unsigned long flags;
 +      u16 *cpu_list;
 +      u64 *mondo;
 +
 +      /* We have to do this whole thing with interrupts fully disabled.
 +       * Otherwise if we send an xcall from interrupt context it will
 +       * corrupt both our mondo block and cpu list state.
 +       *
 +       * One consequence of this is that we cannot use timeout mechanisms
 +       * that depend upon interrupts being delivered locally.  So, for
 +       * example, we cannot sample jiffies and expect it to advance.
 +       *
 +       * Fortunately, udelay() uses %stick/%tick so we can use that.
 +       */
 +      local_irq_save(flags);
 +
 +      this_cpu = smp_processor_id();
 +      tb = &trap_block[this_cpu];
 +
 +      mondo = __va(tb->cpu_mondo_block_pa);
 +      mondo[0] = data0;
 +      mondo[1] = data1;
 +      mondo[2] = data2;
 +      wmb();
 +
 +      cpu_list = __va(tb->cpu_list_pa);
 +
 +      /* Setup the initial cpu list.  */
 +      cnt = 0;
 +      for_each_cpu(i, mask) {
 +              if (i == this_cpu || !cpu_online(i))
 +                      continue;
 +              cpu_list[cnt++] = i;
 +      }
 +
 +      if (cnt)
 +              xcall_deliver_impl(tb, cnt);
 +
 +      local_irq_restore(flags);
 +}
 +
 +/* Send cross call to all processors mentioned in MASK
 + * except self.  Really, there are only two cases currently,
 + * "&cpu_online_map" and "&mm->cpu_vm_mask".
 + */
 +static void smp_cross_call_masked(unsigned long *func, u32 ctx, u64 data1, u64 data2, const cpumask_t *mask)
 +{
 +      u64 data0 = (((u64)ctx)<<32 | (((u64)func) & 0xffffffff));
 +
 +      xcall_deliver(data0, data1, data2, mask);
 +}
 +
 +/* Send cross call to all processors except self. */
 +static void smp_cross_call(unsigned long *func, u32 ctx, u64 data1, u64 data2)
 +{
 +      smp_cross_call_masked(func, ctx, data1, data2, &cpu_online_map);
 +}
 +
 +extern unsigned long xcall_sync_tick;
 +
 +static void smp_start_sync_tick_client(int cpu)
 +{
 +      xcall_deliver((u64) &xcall_sync_tick, 0, 0,
 +                    &cpumask_of_cpu(cpu));
 +}
 +
 +extern unsigned long xcall_call_function;
 +
 +void arch_send_call_function_ipi(cpumask_t mask)
 +{
 +      xcall_deliver((u64) &xcall_call_function, 0, 0, &mask);
 +}
 +
 +extern unsigned long xcall_call_function_single;
 +
 +void arch_send_call_function_single_ipi(int cpu)
 +{
 +      xcall_deliver((u64) &xcall_call_function_single, 0, 0,
 +                    &cpumask_of_cpu(cpu));
 +}
 +
 +void smp_call_function_client(int irq, struct pt_regs *regs)
 +{
 +      clear_softint(1 << irq);
 +      generic_smp_call_function_interrupt();
 +}
 +
 +void smp_call_function_single_client(int irq, struct pt_regs *regs)
 +{
 +      clear_softint(1 << irq);
 +      generic_smp_call_function_single_interrupt();
 +}
 +
 +static void tsb_sync(void *info)
 +{
 +      struct trap_per_cpu *tp = &trap_block[raw_smp_processor_id()];
 +      struct mm_struct *mm = info;
 +
 +      /* It is not valid to test "currrent->active_mm == mm" here.
 +       *
 +       * The value of "current" is not changed atomically with
 +       * switch_mm().  But that's OK, we just need to check the
 +       * current cpu's trap block PGD physical address.
 +       */
 +      if (tp->pgd_paddr == __pa(mm->pgd))
 +              tsb_context_switch(mm);
 +}
 +
 +void smp_tsb_sync(struct mm_struct *mm)
 +{
 +      smp_call_function_mask(mm->cpu_vm_mask, tsb_sync, mm, 1);
 +}
 +
 +extern unsigned long xcall_flush_tlb_mm;
 +extern unsigned long xcall_flush_tlb_pending;
 +extern unsigned long xcall_flush_tlb_kernel_range;
 +extern unsigned long xcall_fetch_glob_regs;
 +extern unsigned long xcall_receive_signal;
 +extern unsigned long xcall_new_mmu_context_version;
 +#ifdef CONFIG_KGDB
 +extern unsigned long xcall_kgdb_capture;
 +#endif
 +
 +#ifdef DCACHE_ALIASING_POSSIBLE
 +extern unsigned long xcall_flush_dcache_page_cheetah;
 +#endif
 +extern unsigned long xcall_flush_dcache_page_spitfire;
 +
 +#ifdef CONFIG_DEBUG_DCFLUSH
 +extern atomic_t dcpage_flushes;
 +extern atomic_t dcpage_flushes_xcall;
 +#endif
 +
 +static inline void __local_flush_dcache_page(struct page *page)
 +{
 +#ifdef DCACHE_ALIASING_POSSIBLE
 +      __flush_dcache_page(page_address(page),
 +                          ((tlb_type == spitfire) &&
 +                           page_mapping(page) != NULL));
 +#else
 +      if (page_mapping(page) != NULL &&
 +          tlb_type == spitfire)
 +              __flush_icache_page(__pa(page_address(page)));
 +#endif
 +}
 +
 +void smp_flush_dcache_page_impl(struct page *page, int cpu)
 +{
 +      int this_cpu;
 +
 +      if (tlb_type == hypervisor)
 +              return;
 +
 +#ifdef CONFIG_DEBUG_DCFLUSH
 +      atomic_inc(&dcpage_flushes);
 +#endif
 +
 +      this_cpu = get_cpu();
 +
 +      if (cpu == this_cpu) {
 +              __local_flush_dcache_page(page);
 +      } else if (cpu_online(cpu)) {
 +              void *pg_addr = page_address(page);
 +              u64 data0 = 0;
 +
 +              if (tlb_type == spitfire) {
 +                      data0 = ((u64)&xcall_flush_dcache_page_spitfire);
 +                      if (page_mapping(page) != NULL)
 +                              data0 |= ((u64)1 << 32);
 +              } else if (tlb_type == cheetah || tlb_type == cheetah_plus) {
 +#ifdef DCACHE_ALIASING_POSSIBLE
 +                      data0 = ((u64)&xcall_flush_dcache_page_cheetah);
 +#endif
 +              }
 +              if (data0) {
 +                      xcall_deliver(data0, __pa(pg_addr),
 +                                    (u64) pg_addr, &cpumask_of_cpu(cpu));
 +#ifdef CONFIG_DEBUG_DCFLUSH
 +                      atomic_inc(&dcpage_flushes_xcall);
 +#endif
 +              }
 +      }
 +
 +      put_cpu();
 +}
 +/* Flush 'page' from the D-cache: cross-call the flush to the cpus in
 + * cpu_online_map, then flush it on the local cpu as well.  Returns
 + * immediately on sun4v (tlb_type == hypervisor).
 + *
 + * A cross call is only sent when a flush routine was selected: always
 + * on spitfire, and on cheetah/cheetah_plus only when
 + * DCACHE_ALIASING_POSSIBLE is defined.  On spitfire, bit 32 of the
 + * call word is set when the page has a mapping (NOTE(review):
 + * presumably asks the handler to flush the I-cache too - confirm
 + * against the xcall handler).
 + *
 + * 'mm' is not used here, and 'this_cpu' is assigned but never read.
 + */
 +void flush_dcache_page_all(struct mm_struct *mm, struct page *page)
 +{
 +      void *pg_addr;
 +      int this_cpu;
 +      u64 data0;
 +
 +      if (tlb_type == hypervisor)
 +              return;
 +
 +      this_cpu = get_cpu();
 +
 +#ifdef CONFIG_DEBUG_DCFLUSH
 +      atomic_inc(&dcpage_flushes);
 +#endif
 +      data0 = 0;
 +      pg_addr = page_address(page);
 +      if (tlb_type == spitfire) {
 +              data0 = ((u64)&xcall_flush_dcache_page_spitfire);
 +              if (page_mapping(page) != NULL)
 +                      data0 |= ((u64)1 << 32);
 +      } else if (tlb_type == cheetah || tlb_type == cheetah_plus) {
 +#ifdef DCACHE_ALIASING_POSSIBLE
 +              data0 = ((u64)&xcall_flush_dcache_page_cheetah);
 +#endif
 +      }
 +      if (data0) {
 +              xcall_deliver(data0, __pa(pg_addr),
 +                            (u64) pg_addr, &cpu_online_map);
 +#ifdef CONFIG_DEBUG_DCFLUSH
 +              atomic_inc(&dcpage_flushes_xcall);
 +#endif
 +      }
 +      __local_flush_dcache_page(page);
 +
 +      put_cpu();
 +}
 +/* Softint handler: acknowledge the softint for 'irq', then make sure
 + * the address space active on this cpu has a valid MMU context.
 + * Nothing further is done when current->active_mm is NULL or is
 + * init_mm.  'regs' is not used.
 + */
 +void smp_new_mmu_context_version_client(int irq, struct pt_regs *regs)
 +{
 +      struct mm_struct *mm;
 +      unsigned long flags;
 +
 +      clear_softint(1 << irq);
 +
 +      /* See if we need to allocate a new TLB context because
 +       * the version of the one we are using is now out of date.
 +       */
 +      mm = current->active_mm;
 +      if (unlikely(!mm || (mm == &init_mm)))
 +              return;
 +
 +      spin_lock_irqsave(&mm->context.lock, flags);
 +
 +      if (unlikely(!CTX_VALID(mm->context)))
 +              get_new_mmu_context(mm);
 +
 +      spin_unlock_irqrestore(&mm->context.lock, flags);
 +      /* Load mm's context as the secondary context, then flush the TLB for it. */
 +      load_secondary_context(mm);
 +      __flush_tlb_mm(CTX_HWBITS(mm->context),
 +                     SECONDARY_CONTEXT);
 +}
 +/* Send the xcall_new_mmu_context_version cross call, with all three
 + * data words zero.  NOTE(review): the set of target cpus is decided
 + * inside smp_cross_call(), which is not visible here - confirm.
 + */
 +void smp_new_mmu_context_version(void)
 +{
 +      smp_cross_call(&xcall_new_mmu_context_version, 0, 0, 0);
 +}
 +/* kgdb_roundup_cpus(): send the xcall_kgdb_capture cross call with
 + * zeroed data words.  'flags' is not used.  Only built with CONFIG_KGDB.
 + */
 +#ifdef CONFIG_KGDB
 +void kgdb_roundup_cpus(unsigned long flags)
 +{
 +      smp_cross_call(&xcall_kgdb_capture, 0, 0, 0);
 +}
 +#endif
 +/* Send the xcall_fetch_glob_regs cross call with zeroed data words. */
 +void smp_fetch_global_regs(void)
 +{
 +      smp_cross_call(&xcall_fetch_glob_regs, 0, 0, 0);
 +}
 +
 +/* We know that the window frames of the user have been flushed
 + * to the stack before we get here because all callers of us
 + * are flush_tlb_*() routines, and these run after flush_cache_*()
 + * which performs the flushw.
 + *
 + * The SMP TLB coherency scheme we use works as follows:
 + *
 + * 1) mm->cpu_vm_mask is a bit mask of which cpus an address
 + *    space has (potentially) executed on, this is the heuristic
 + *    we use to avoid doing cross calls.
 + *
 + *    Also, for flushing from kswapd and for clones, we use
 + *    cpu_vm_mask as the list of cpus on which to run the TLB flush.
 + *
 + * 2) TLB context numbers are shared globally across all processors
 + *    in the system, this allows us to play several games to avoid
 + *    cross calls.
 + *
 + *    One invariant is that when a cpu switches to a process, and
 + *    that process's tsk->active_mm->cpu_vm_mask does not have the
 + *    current cpu's bit set, that tlb context is flushed locally.
 + *
 + *    If the address space is non-shared (ie. mm->count == 1) we avoid
 + *    cross calls when we want to flush the currently running process's
 + *    tlb state.  This is done by clearing all cpu bits except the current
 + *    processor's in current->active_mm->cpu_vm_mask and performing the
 + *    flush locally only.  This will force any subsequent cpus which run
 + *    this task to flush the context from the local tlb if the process
 + *    migrates to another cpu (again).
 + *
 + * 3) For shared address spaces (threads) and swapping we bite the
 + *    bullet for most cases and perform the cross call (but only to
 + *    the cpus listed in cpu_vm_mask).
 + *
 + *    The performance gain from "optimizing" away the cross call for threads is
 + *    questionable (in theory the big win for threads is the massive sharing of
 + *    address space state across processors).
 + */
 +
 +/* This currently is only used by the hugetlb arch pre-fault
 + * hook on UltraSPARC-III+ and later when changing the pagesize
 + * bits of the context register for an address space.
 + *
 + * When the mm has a single user (mm_users == 1), cpu_vm_mask is
 + * narrowed to the current cpu and only the local flush is done.
 + * Otherwise the flush is first cross-called to the cpus in
 + * mm->cpu_vm_mask.  The local __flush_tlb_mm() runs in both cases,
 + * inside the get_cpu()/put_cpu() pair.
 + */
 +void smp_flush_tlb_mm(struct mm_struct *mm)
 +{
 +      u32 ctx = CTX_HWBITS(mm->context);
 +      int cpu = get_cpu();
 +
 +      if (atomic_read(&mm->mm_users) == 1) {
 +              mm->cpu_vm_mask = cpumask_of_cpu(cpu);
 +              goto local_flush_and_out;
 +      }
 +
 +      smp_cross_call_masked(&xcall_flush_tlb_mm,
 +                            ctx, 0, 0,
 +                            &mm->cpu_vm_mask);
 +
 +local_flush_and_out:
 +      __flush_tlb_mm(ctx, SECONDARY_CONTEXT);
 +
 +      put_cpu();
 +}
 +/* Flush the 'nr' addresses listed in 'vaddrs' from the TLB for mm's
 + * context.  If mm is the current active_mm and has a single user,
 + * cpu_vm_mask is narrowed to this cpu and no cross call is sent;
 + * otherwise the flush is cross-called to the cpus in
 + * mm->cpu_vm_mask.  The local flush is always performed.
 + */
 +void smp_flush_tlb_pending(struct mm_struct *mm, unsigned long nr, unsigned long *vaddrs)
 +{
 +      u32 ctx = CTX_HWBITS(mm->context);
 +      int cpu = get_cpu();
 +
 +      if (mm == current->active_mm && atomic_read(&mm->mm_users) == 1)
 +              mm->cpu_vm_mask = cpumask_of_cpu(cpu);
 +      else
 +              smp_cross_call_masked(&xcall_flush_tlb_pending,
 +                                    ctx, nr, (unsigned long) vaddrs,
 +                                    &mm->cpu_vm_mask);
 +
 +      __flush_tlb_pending(ctx, nr, vaddrs);
 +
 +      put_cpu();
 +}
 +/* Flush kernel TLB entries for the range start..end.  'start' is
 + * rounded down and 'end' rounded up to a page boundary; if the two
 + * are then equal nothing is done.  Otherwise the flush is sent as a
 + * cross call and then performed locally.
 + */
 +void smp_flush_tlb_kernel_range(unsigned long start, unsigned long end)
 +{
 +      start &= PAGE_MASK;
 +      end    = PAGE_ALIGN(end);
 +      if (start != end) {
 +              smp_cross_call(&xcall_flush_tlb_kernel_range,
 +                             0, start, end);
 +
 +              __flush_tlb_kernel_range(start, end);
 +      }
 +}
 +
 +/* CPU capture. */
 +/* #define CAPTURE_DEBUG */
 +extern unsigned long xcall_capture;
 +
 +static atomic_t smp_capture_depth = ATOMIC_INIT(0);
 +static atomic_t smp_capture_registry = ATOMIC_INIT(0);
 +static unsigned long penguins_are_doing_time;
 +/* Capture the other cpus.  Calls nest: only the call that raises
 + * smp_capture_depth from 0 to 1 does any work.  That call sets
 + * penguins_are_doing_time, registers itself in smp_capture_registry,
 + * sends the xcall_capture cross call and then spins until the
 + * registry count equals the number of online cpus sampled on entry.
 + */
 +void smp_capture(void)
 +{
 +      int result = atomic_add_ret(1, &smp_capture_depth);
 +
 +      if (result == 1) {
 +              int ncpus = num_online_cpus();
 +
 +#ifdef CAPTURE_DEBUG
 +              printk("CPU[%d]: Sending penguins to jail...",
 +                     smp_processor_id());
 +#endif
 +              penguins_are_doing_time = 1;
 +              atomic_inc(&smp_capture_registry);
 +              smp_cross_call(&xcall_capture, 0, 0, 0);
 +              while (atomic_read(&smp_capture_registry) != ncpus)
 +                      rmb();
 +#ifdef CAPTURE_DEBUG
 +              printk("done\n");
 +#endif
 +      }
 +}
 +/* Undo one smp_capture().  When the nesting depth drops to zero,
 + * clear penguins_are_doing_time (the flag the captured cpus spin on
 + * in smp_penguin_jailcell()) and drop this cpu's registry entry.
 + */
 +void smp_release(void)
 +{
 +      if (atomic_dec_and_test(&smp_capture_depth)) {
 +#ifdef CAPTURE_DEBUG
 +              printk("CPU[%d]: Giving pardon to "
 +                     "imprisoned penguins\n",
 +                     smp_processor_id());
 +#endif
 +              penguins_are_doing_time = 0;
 +              membar_safe("#StoreLoad");
 +              atomic_dec(&smp_capture_registry);
 +      }
 +}
 +
 +/* Imprisoned penguins run with %pil == PIL_NORMAL_MAX, but PSTATE_IE
 + * set, so they can service tlb flush xcalls...
 + *
 + * A captured cpu acks the softint, flushes its register windows
 + * (flushw), checks in via smp_capture_registry and spins until
 + * penguins_are_doing_time is cleared, then checks out again.  The
 + * wait is bracketed by prom_world(1) / prom_world(0) and runs with
 + * preemption disabled.  'regs' is not used.
 + */
 +extern void prom_world(int);
 +
 +void smp_penguin_jailcell(int irq, struct pt_regs *regs)
 +{
 +      clear_softint(1 << irq);
 +
 +      preempt_disable();
 +
 +      __asm__ __volatile__("flushw");
 +      prom_world(1);
 +      atomic_inc(&smp_capture_registry);
 +      membar_safe("#StoreLoad");
 +      while (penguins_are_doing_time)
 +              rmb();
 +      atomic_dec(&smp_capture_registry);
 +      prom_world(0);
 +
 +      preempt_enable();
 +}
 +
 +/* /proc/profile writes can call this, don't __init it please.
 + *
 + * Always fails with -EINVAL; 'multiplier' is ignored.
 + */
 +int setup_profiling_timer(unsigned int multiplier)
 +{
 +      return -EINVAL;
 +}
 +/* Empty: no work is done here and 'max_cpus' is ignored. */
 +void __init smp_prepare_cpus(unsigned int max_cpus)
 +{
 +}
 +/* Empty: no boot-cpu preparation is done in this function. */
 +void __devinit smp_prepare_boot_cpu(void)
 +{
 +}
 +/* Select the cross-call delivery routine from tlb_type: spitfire,
 + * cheetah/cheetah_plus, and the hypervisor variant for anything else.
 + * The choice is stored in xcall_deliver_impl.
 + */
 +void __init smp_setup_processor_id(void)
 +{
 +      if (tlb_type == spitfire)
 +              xcall_deliver_impl = spitfire_xcall_deliver;
 +      else if (tlb_type == cheetah || tlb_type == cheetah_plus)
 +              xcall_deliver_impl = cheetah_xcall_deliver;
 +      else
 +              xcall_deliver_impl = hypervisor_xcall_deliver;
 +}
 +/* Rebuild cpu_core_map[] and the per-cpu cpu_sibling_map for every
 + * present cpu.  Each map is cleared and then filled with the present
 + * cpus that share the same core_id (core map) or the same proc_id
 + * (sibling map).  A cpu whose core_id is 0, or whose proc_id is -1,
 + * gets a map containing only itself.
 + */
 +void __devinit smp_fill_in_sib_core_maps(void)
 +{
 +      unsigned int i;
 +
 +      for_each_present_cpu(i) {
 +              unsigned int j;
 +
 +              cpus_clear(cpu_core_map[i]);
 +              if (cpu_data(i).core_id == 0) {
 +                      cpu_set(i, cpu_core_map[i]);
 +                      continue;
 +              }
 +
 +              for_each_present_cpu(j) {
 +                      if (cpu_data(i).core_id ==
 +                          cpu_data(j).core_id)
 +                              cpu_set(j, cpu_core_map[i]);
 +              }
 +      }
 +      /* Second pass: same scheme for the sibling maps, keyed on proc_id. */
 +      for_each_present_cpu(i) {
 +              unsigned int j;
 +
 +              cpus_clear(per_cpu(cpu_sibling_map, i));
 +              if (cpu_data(i).proc_id == -1) {
 +                      cpu_set(i, per_cpu(cpu_sibling_map, i));
 +                      continue;
 +              }
 +
 +              for_each_present_cpu(j) {
 +                      if (cpu_data(i).proc_id ==
 +                          cpu_data(j).proc_id)
 +                              cpu_set(j, per_cpu(cpu_sibling_map, i));
 +              }
 +      }
 +}
 +/* Bring 'cpu' up.  Boots it with smp_boot_one_cpu(); on success the
 + * cpu is marked in smp_commenced_mask and we spin until it shows up
 + * in cpu_online_map, then synchronize its tick unless running on
 + * sun4v.  Returns the smp_boot_one_cpu() error, or 0 on success.
 + *
 + * NOTE(review): the wait loop has no timeout and only exits once the
 + * online bit is set, so the -ENODEV branch below looks unreachable -
 + * confirm whether a bounded wait was intended.
 + */
 +int __cpuinit __cpu_up(unsigned int cpu)
 +{
 +      int ret = smp_boot_one_cpu(cpu);
 +
 +      if (!ret) {
 +              cpu_set(cpu, smp_commenced_mask);
 +              while (!cpu_isset(cpu, cpu_online_map))
 +                      mb();
 +              if (!cpu_isset(cpu, cpu_online_map)) {
 +                      ret = -ENODEV;
 +              } else {
 +                      /* On SUN4V, writes to %tick and %stick are
 +                       * not allowed.
 +                       */
 +                      if (tlb_type != hypervisor)
 +                              smp_synchronize_one_tick(cpu);
 +              }
 +      }
 +      return ret;
 +}
 +/* cpu_play_dead(): final code run by a cpu that is going offline; it
 + * never returns.  After idle_task_exit() it reconfigures the four
 + * sun4v mondo/error queues with an entry count of 0 (sun4v only),
 + * clears its own bit in smp_commenced_mask - the bit __cpu_die()
 + * polls - and then spins forever with interrupts off.
 + */
 +#ifdef CONFIG_HOTPLUG_CPU
 +void cpu_play_dead(void)
 +{
 +      int cpu = smp_processor_id();
 +      unsigned long pstate;
 +
 +      idle_task_exit();
 +
 +      if (tlb_type == hypervisor) {
 +              struct trap_per_cpu *tb = &trap_block[cpu];
 +
 +              sun4v_cpu_qconf(HV_CPU_QUEUE_CPU_MONDO,
 +                              tb->cpu_mondo_pa, 0);
 +              sun4v_cpu_qconf(HV_CPU_QUEUE_DEVICE_MONDO,
 +                              tb->dev_mondo_pa, 0);
 +              sun4v_cpu_qconf(HV_CPU_QUEUE_RES_ERROR,
 +                              tb->resum_mondo_pa, 0);
 +              sun4v_cpu_qconf(HV_CPU_QUEUE_NONRES_ERROR,
 +                              tb->nonresum_mondo_pa, 0);
 +      }
 +
 +      cpu_clear(cpu, smp_commenced_mask);
 +      membar_safe("#Sync");
 +
 +      local_irq_disable();
 +      /* wrpr XORs its two operands, so this flips PSTATE_IE in %pstate. */
 +      __asm__ __volatile__(
 +              "rdpr   %%pstate, %0\n\t"
 +              "wrpr   %0, %1, %%pstate"
 +              : "=r" (pstate)
 +              : "i" (PSTATE_IE));
 +
 +      while (1)
 +              barrier();
 +}
 +/* Take the calling cpu out of service.  It is removed from the core
 + * and sibling maps of its peers, its own maps are cleared, and its
 + * core_id/proc_id are reset to 0/-1.  Interrupts are then moved away
 + * with fixup_irqs() and the cpu is cleared from cpu_online_map under
 + * the IPI call lock.  Always returns 0.
 + */
 +int __cpu_disable(void)
 +{
 +      int cpu = smp_processor_id();
 +      cpuinfo_sparc *c;
 +      int i;
 +
 +      for_each_cpu_mask(i, cpu_core_map[cpu])
 +              cpu_clear(cpu, cpu_core_map[i]);
 +      cpus_clear(cpu_core_map[cpu]);
 +
 +      for_each_cpu_mask(i, per_cpu(cpu_sibling_map, cpu))
 +              cpu_clear(cpu, per_cpu(cpu_sibling_map, i));
 +      cpus_clear(per_cpu(cpu_sibling_map, cpu));
 +
 +      c = &cpu_data(cpu);
 +
 +      c->core_id = 0;
 +      c->proc_id = -1;
 +
 +      smp_wmb();
 +
 +      /* Make sure no interrupts point to this cpu.  */
 +      fixup_irqs();
 +      /* NOTE(review): the 1ms window with interrupts enabled presumably lets already-pending interrupts drain - confirm. */
 +      local_irq_enable();
 +      mdelay(1);
 +      local_irq_disable();
 +
 +      ipi_call_lock();
 +      cpu_clear(cpu, cpu_online_map);
 +      ipi_call_unlock();
 +
 +      return 0;
 +}
 +/* Wait for 'cpu' to finish going offline.  Polls up to 100 times,
 + * sleeping 100ms between polls, for the cpu to clear its bit in
 + * smp_commenced_mask, and logs an error if it never does.  With
 + * CONFIG_SUN_LDOMS, a cpu that did go down is then stopped through
 + * sun4v_cpu_stop(), retried up to 100 times; on HV_EOK it is removed
 + * from cpu_present_map, and an error is logged if every try fails.
 + */
 +void __cpu_die(unsigned int cpu)
 +{
 +      int i;
 +
 +      for (i = 0; i < 100; i++) {
 +              smp_rmb();
 +              if (!cpu_isset(cpu, smp_commenced_mask))
 +                      break;
 +              msleep(100);
 +      }
 +      if (cpu_isset(cpu, smp_commenced_mask)) {
 +              printk(KERN_ERR "CPU %u didn't die...\n", cpu);
 +      } else {
 +#if defined(CONFIG_SUN_LDOMS)
 +              unsigned long hv_err;
 +              int limit = 100;
 +
 +              do {
 +                      hv_err = sun4v_cpu_stop(cpu);
 +                      if (hv_err == HV_EOK) {
 +                              cpu_clear(cpu, cpu_present_map);
 +                              break;
 +                      }
 +              } while (--limit > 0);
 +              if (limit <= 0) {
 +                      printk(KERN_ERR "sun4v_cpu_stop() fails err=%lu\n",
 +                             hv_err);
 +              }
 +#endif
 +      }
 +}
 +#endif
 +/* Empty: no work is done here and 'max_cpus' is ignored. */
 +void __init smp_cpus_done(unsigned int max_cpus)
 +{
 +}
 +/* Deliver the xcall_receive_signal cross call, with zeroed data
 + * words, to 'cpu' alone.
 + */
 +void smp_send_reschedule(int cpu)
 +{
 +      xcall_deliver((u64) &xcall_receive_signal, 0, 0,
 +                    &cpumask_of_cpu(cpu));
 +}
 +/* Softint handler that only acknowledges the softint bit for 'irq';
 + * 'regs' is not used.
 + */
 +void smp_receive_signal_client(int irq, struct pt_regs *regs)
 +{
 +      clear_softint(1 << irq);
 +}
 +
 +/* Deliberately empty: this is a nop because we capture all other
 + * cpus anyway when making the PROM active.
 + */
 +void smp_send_stop(void)
 +{
 +}
 +
 +unsigned long __per_cpu_base __read_mostly;
 +unsigned long __per_cpu_shift __read_mostly;
 +
 +EXPORT_SYMBOL(__per_cpu_base);
 +EXPORT_SYMBOL(__per_cpu_shift);
 +/* Allocate and populate the per-cpu data areas at boot.  The per-cpu
 + * unit size is PAGE_SIZE doubled until it reaches PERCPU_ENOUGH_ROOM,
 + * with __per_cpu_shift set to the matching shift.  NR_CPUS units are
 + * allocated in one lmb_alloc() call; on failure the PROM is used to
 + * print a message and halt.  Each unit receives a copy of the
 + * __per_cpu_start..__per_cpu_end section.
 + */
 +void __init real_setup_per_cpu_areas(void)
 +{
 +      unsigned long paddr, goal, size, i;
 +      char *ptr;
 +
 +      /* Copy section for each CPU (we discard the original) */
 +      goal = PERCPU_ENOUGH_ROOM;
 +
 +      __per_cpu_shift = PAGE_SHIFT;
 +      for (size = PAGE_SIZE; size < goal; size <<= 1UL)
 +              __per_cpu_shift++;
 +
 +      paddr = lmb_alloc(size * NR_CPUS, PAGE_SIZE);
 +      if (!paddr) {
 +              prom_printf("Cannot allocate per-cpu memory.\n");
 +              prom_halt();
 +      }
 +
 +      ptr = __va(paddr);
 +      __per_cpu_base = ptr - __per_cpu_start;
 +      /* One copy of the per-cpu section per possible cpu slot, 'size' bytes apart. */
 +      for (i = 0; i < NR_CPUS; i++, ptr += size)
 +              memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start);
 +
 +      /* Setup %g5 for the boot cpu.  */
 +      __local_per_cpu_offset = __per_cpu_offset(smp_processor_id());
 +}
index a4d45fc,0000000..e1e9763
mode 100644,000000..100644
--- /dev/null
@@@ -1,257 -1,0 +1,253 @@@
 +/*
 + * arch/sparc/kernel/ksyms.c: Sparc specific ksyms support.
 + *
 + * Copyright (C) 1996 David S. Miller (davem@caip.rutgers.edu)
 + * Copyright (C) 1996 Eddie C. Dost (ecd@skynet.be)
 + */
 +
 +/* Tell string.h we don't want memcpy etc. as cpp defines */
 +#define EXPORT_SYMTAB_STROPS
 +#define PROMLIB_INTERNAL
 +
 +#include <linux/module.h>
 +#include <linux/init.h>
 +#include <linux/smp.h>
 +#include <linux/types.h>
 +#include <linux/string.h>
 +#include <linux/sched.h>
 +#include <linux/interrupt.h>
 +#include <linux/in6.h>
 +#include <linux/spinlock.h>
 +#include <linux/mm.h>
 +#include <linux/syscalls.h>
 +#ifdef CONFIG_PCI
 +#include <linux/pci.h>
 +#endif
 +#include <linux/pm.h>
 +#ifdef CONFIG_HIGHMEM
 +#include <linux/highmem.h>
 +#endif
 +
 +#include <asm/oplib.h>
 +#include <asm/delay.h>
 +#include <asm/system.h>
 +#include <asm/auxio.h>
 +#include <asm/pgtable.h>
 +#include <asm/io.h>
 +#include <asm/irq.h>
 +#include <asm/idprom.h>
 +#include <asm/head.h>
 +#include <asm/smp.h>
 +#include <asm/ptrace.h>
 +#include <asm/uaccess.h>
 +#include <asm/checksum.h>
 +#ifdef CONFIG_SBUS
 +#include <asm/dma.h>
 +#endif
 +#include <asm/io-unit.h>
 +#include <asm/bug.h>
 +
 +extern spinlock_t rtc_lock;
 +/* Local record with a descriptor and two event fields.
 + * NOTE(review): no user of this struct is visible in this part of
 + * the file - confirm it is still needed.
 + */
 +struct poll {
 +      int fd;
 +      short events;
 +      short revents;
 +};
 +
 +extern void (*__copy_1page)(void *, const void *);
 +extern void __memmove(void *, const void *, __kernel_size_t);
 +extern void (*bzero_1page)(void *);
 +extern void *__bzero(void *, size_t);
 +extern void *__memscan_zero(void *, size_t);
 +extern void *__memscan_generic(void *, int, size_t);
 +extern int __strncmp(const char *, const char *, __kernel_size_t);
 +/* Arithmetic helpers exported further down.  NOTE(review): the (int, int) prototypes presumably only name the symbols for EXPORT_SYMBOL and are not the real signatures - confirm. */
 +extern int __ashrdi3(int, int);
 +extern int __ashldi3(int, int);
 +extern int __lshrdi3(int, int);
 +extern int __muldi3(int, int);
 +extern int __divdi3(int, int);
 +
 +/* Private functions with odd calling conventions. */
 +extern void ___atomic24_add(void);
 +extern void ___atomic24_sub(void);
 +extern void ___rw_read_enter(void);
 +extern void ___rw_read_try(void);
 +extern void ___rw_read_exit(void);
 +extern void ___rw_write_enter(void);
 +
 +/* Alias functions whose names begin with "." and export the aliases.
 + * The module references will be fixed up by module_frob_arch_sections.
 + */
 +extern int _Div(int, int);
 +extern int _Mul(int, int);
 +extern int _Rem(int, int);
 +extern unsigned _Udiv(unsigned, unsigned);
 +extern unsigned _Umul(unsigned, unsigned);
 +extern unsigned _Urem(unsigned, unsigned);
 +
 +/* used by various drivers */
 +EXPORT_SYMBOL(sparc_cpu_model);
 +EXPORT_SYMBOL(kernel_thread);
 +#ifdef CONFIG_SMP
 +// XXX find what uses (or used) these.   AV: see asm/spinlock.h
 +EXPORT_SYMBOL(___rw_read_enter);
 +EXPORT_SYMBOL(___rw_read_try);
 +EXPORT_SYMBOL(___rw_read_exit);
 +EXPORT_SYMBOL(___rw_write_enter);
 +#endif
 +
 +EXPORT_SYMBOL(sparc_valid_addr_bitmap);
 +EXPORT_SYMBOL(phys_base);
 +EXPORT_SYMBOL(pfn_base);
 +
 +/* Atomic operations. */
 +EXPORT_SYMBOL(___atomic24_add);
 +EXPORT_SYMBOL(___atomic24_sub);
 +
 +/* Per-CPU information table */
 +EXPORT_PER_CPU_SYMBOL(__cpu_data);
 +
 +#ifdef CONFIG_SMP
 +/* IRQ implementation. */
 +EXPORT_SYMBOL(synchronize_irq);
- /* CPU online map and active count. */
- EXPORT_SYMBOL(cpu_online_map);
- EXPORT_SYMBOL(phys_cpu_present_map);
 +#endif
 +
 +EXPORT_SYMBOL(__udelay);
 +EXPORT_SYMBOL(__ndelay);
 +EXPORT_SYMBOL(rtc_lock);
 +EXPORT_SYMBOL(set_auxio);
 +EXPORT_SYMBOL(get_auxio);
 +EXPORT_SYMBOL(io_remap_pfn_range);
 +
 +#ifndef CONFIG_SMP
 +EXPORT_SYMBOL(BTFIXUP_CALL(___xchg32));
 +#else
 +EXPORT_SYMBOL(BTFIXUP_CALL(__hard_smp_processor_id));
 +#endif
 +EXPORT_SYMBOL(BTFIXUP_CALL(mmu_unlockarea));
 +EXPORT_SYMBOL(BTFIXUP_CALL(mmu_lockarea));
 +EXPORT_SYMBOL(BTFIXUP_CALL(mmu_get_scsi_sgl));
 +EXPORT_SYMBOL(BTFIXUP_CALL(mmu_get_scsi_one));
 +EXPORT_SYMBOL(BTFIXUP_CALL(mmu_release_scsi_sgl));
 +EXPORT_SYMBOL(BTFIXUP_CALL(mmu_release_scsi_one));
 +
 +EXPORT_SYMBOL(BTFIXUP_CALL(pgprot_noncached));
 +
 +#ifdef CONFIG_SBUS
 +EXPORT_SYMBOL(sbus_set_sbus64);
 +#endif
 +#ifdef CONFIG_PCI
 +EXPORT_SYMBOL(insb);
 +EXPORT_SYMBOL(outsb);
 +EXPORT_SYMBOL(insw);
 +EXPORT_SYMBOL(outsw);
 +EXPORT_SYMBOL(insl);
 +EXPORT_SYMBOL(outsl);
 +EXPORT_SYMBOL(pci_alloc_consistent);
 +EXPORT_SYMBOL(pci_free_consistent);
 +EXPORT_SYMBOL(pci_map_single);
 +EXPORT_SYMBOL(pci_unmap_single);
 +EXPORT_SYMBOL(pci_dma_sync_single_for_cpu);
 +EXPORT_SYMBOL(pci_dma_sync_single_for_device);
 +EXPORT_SYMBOL(pci_dma_sync_sg_for_cpu);
 +EXPORT_SYMBOL(pci_dma_sync_sg_for_device);
 +EXPORT_SYMBOL(pci_map_sg);
 +EXPORT_SYMBOL(pci_unmap_sg);
 +EXPORT_SYMBOL(pci_map_page);
 +EXPORT_SYMBOL(pci_unmap_page);
 +/* Actually, ioremap/iounmap are not PCI specific. But it is ok for drivers. */
 +EXPORT_SYMBOL(ioremap);
 +EXPORT_SYMBOL(iounmap);
 +#endif
 +
 +/* in arch/sparc/mm/highmem.c */
 +#ifdef CONFIG_HIGHMEM
 +EXPORT_SYMBOL(kmap_atomic);
 +EXPORT_SYMBOL(kunmap_atomic);
 +#endif
 +
 +/* prom symbols */
 +EXPORT_SYMBOL(idprom);
 +EXPORT_SYMBOL(prom_root_node);
 +EXPORT_SYMBOL(prom_getchild);
 +EXPORT_SYMBOL(prom_getsibling);
 +EXPORT_SYMBOL(prom_searchsiblings);
 +EXPORT_SYMBOL(prom_firstprop);
 +EXPORT_SYMBOL(prom_nextprop);
 +EXPORT_SYMBOL(prom_getproplen);
 +EXPORT_SYMBOL(prom_getproperty);
 +EXPORT_SYMBOL(prom_node_has_property);
 +EXPORT_SYMBOL(prom_setprop);
 +EXPORT_SYMBOL(saved_command_line);
 +EXPORT_SYMBOL(prom_apply_obio_ranges);
 +EXPORT_SYMBOL(prom_feval);
 +EXPORT_SYMBOL(prom_getbool);
 +EXPORT_SYMBOL(prom_getstring);
 +EXPORT_SYMBOL(prom_getint);
 +EXPORT_SYMBOL(prom_getintdefault);
 +EXPORT_SYMBOL(prom_finddevice);
 +EXPORT_SYMBOL(romvec);
 +EXPORT_SYMBOL(__prom_getchild);
 +EXPORT_SYMBOL(__prom_getsibling);
 +
 +/* sparc library symbols */
 +EXPORT_SYMBOL(memscan);
 +EXPORT_SYMBOL(strlen);
 +EXPORT_SYMBOL(strncmp);
 +EXPORT_SYMBOL(page_kernel);
 +
 +/* Special internal versions of library functions. */
 +EXPORT_SYMBOL(__copy_1page);
 +EXPORT_SYMBOL(__memcpy);
 +EXPORT_SYMBOL(__memset);
 +EXPORT_SYMBOL(bzero_1page);
 +EXPORT_SYMBOL(__bzero);
 +EXPORT_SYMBOL(__memscan_zero);
 +EXPORT_SYMBOL(__memscan_generic);
 +EXPORT_SYMBOL(__strncmp);
 +EXPORT_SYMBOL(__memmove);
 +
 +/* Moving data to/from userspace. */
 +EXPORT_SYMBOL(__copy_user);
 +EXPORT_SYMBOL(__strncpy_from_user);
 +EXPORT_SYMBOL(__strnlen_user);
 +
 +/* Networking helper routines. */
 +EXPORT_SYMBOL(__csum_partial_copy_sparc_generic);
 +EXPORT_SYMBOL(csum_partial);
 +
 +/* Cache flushing.  */
 +EXPORT_SYMBOL(sparc_flush_page_to_ram);
 +
 +/* For when serial stuff is built as modules. */
 +EXPORT_SYMBOL(sun_do_break);
 +
 +EXPORT_SYMBOL(__ret_efault);
 +
 +EXPORT_SYMBOL(memcmp);
 +EXPORT_SYMBOL(memcpy);
 +EXPORT_SYMBOL(memset);
 +EXPORT_SYMBOL(memmove);
 +EXPORT_SYMBOL(__ashrdi3);
 +EXPORT_SYMBOL(__ashldi3);
 +EXPORT_SYMBOL(__lshrdi3);
 +EXPORT_SYMBOL(__muldi3);
 +EXPORT_SYMBOL(__divdi3);
 +
 +EXPORT_SYMBOL(_Rem);
 +EXPORT_SYMBOL(_Urem);
 +EXPORT_SYMBOL(_Mul);
 +EXPORT_SYMBOL(_Umul);
 +EXPORT_SYMBOL(_Div);
 +EXPORT_SYMBOL(_Udiv);
 +
 +#ifdef CONFIG_DEBUG_BUGVERBOSE
 +EXPORT_SYMBOL(do_BUG);
 +#endif
 +
 +/* Sun Power Management Idle Handler */
 +EXPORT_SYMBOL(pm_idle);
 +
 +EXPORT_SYMBOL(empty_zero_page);
Simple merge
Simple merge
@@@ -31,9 -31,13 +31,9 @@@ static inline int irq_canonicalize(int 
  # endif
  #endif
  
 -#ifdef CONFIG_IRQBALANCE
 -extern int irqbalance_disable(char *str);
 -#endif
 -
  #ifdef CONFIG_HOTPLUG_CPU
  #include <linux/cpumask.h>
- extern void fixup_irqs(cpumask_t map);
+ extern void fixup_irqs(void);
  #endif
  
  extern unsigned int do_IRQ(struct pt_regs *regs);
Simple merge
Simple merge
Simple merge
Simple merge
@@@ -152,25 -152,25 +152,25 @@@ static struct irq_cfg irq_cfgx[] = 
  #else
  static struct irq_cfg irq_cfgx[NR_IRQS] = {
  #endif
-       [0]  = { .domain = CPU_MASK_ALL, .vector = IRQ0_VECTOR,  },
-       [1]  = { .domain = CPU_MASK_ALL, .vector = IRQ1_VECTOR,  },
-       [2]  = { .domain = CPU_MASK_ALL, .vector = IRQ2_VECTOR,  },
-       [3]  = { .domain = CPU_MASK_ALL, .vector = IRQ3_VECTOR,  },
-       [4]  = { .domain = CPU_MASK_ALL, .vector = IRQ4_VECTOR,  },
-       [5]  = { .domain = CPU_MASK_ALL, .vector = IRQ5_VECTOR,  },
-       [6]  = { .domain = CPU_MASK_ALL, .vector = IRQ6_VECTOR,  },
-       [7]  = { .domain = CPU_MASK_ALL, .vector = IRQ7_VECTOR,  },
-       [8]  = { .domain = CPU_MASK_ALL, .vector = IRQ8_VECTOR,  },
-       [9]  = { .domain = CPU_MASK_ALL, .vector = IRQ9_VECTOR,  },
-       [10] = { .domain = CPU_MASK_ALL, .vector = IRQ10_VECTOR, },
-       [11] = { .domain = CPU_MASK_ALL, .vector = IRQ11_VECTOR, },
-       [12] = { .domain = CPU_MASK_ALL, .vector = IRQ12_VECTOR, },
-       [13] = { .domain = CPU_MASK_ALL, .vector = IRQ13_VECTOR, },
-       [14] = { .domain = CPU_MASK_ALL, .vector = IRQ14_VECTOR, },
-       [15] = { .domain = CPU_MASK_ALL, .vector = IRQ15_VECTOR, },
+       [0]  = { .vector = IRQ0_VECTOR,  },
+       [1]  = { .vector = IRQ1_VECTOR,  },
+       [2]  = { .vector = IRQ2_VECTOR,  },
+       [3]  = { .vector = IRQ3_VECTOR,  },
+       [4]  = { .vector = IRQ4_VECTOR,  },
+       [5]  = { .vector = IRQ5_VECTOR,  },
+       [6]  = { .vector = IRQ6_VECTOR,  },
+       [7]  = { .vector = IRQ7_VECTOR,  },
+       [8]  = { .vector = IRQ8_VECTOR,  },
+       [9]  = { .vector = IRQ9_VECTOR,  },
+       [10] = { .vector = IRQ10_VECTOR, },
+       [11] = { .vector = IRQ11_VECTOR, },
+       [12] = { .vector = IRQ12_VECTOR, },
+       [13] = { .vector = IRQ13_VECTOR, },
+       [14] = { .vector = IRQ14_VECTOR, },
+       [15] = { .vector = IRQ15_VECTOR, },
  };
  
 -void __init arch_early_irq_init(void)
 +int __init arch_early_irq_init(void)
  {
        struct irq_cfg *cfg;
        struct irq_desc *desc;
        for (i = 0; i < count; i++) {
                desc = irq_to_desc(i);
                desc->chip_data = &cfg[i];
+               alloc_bootmem_cpumask_var(&cfg[i].domain);
+               alloc_bootmem_cpumask_var(&cfg[i].old_domain);
+               if (i < NR_IRQS_LEGACY)
+                       cpumask_setall(cfg[i].domain);
        }
 +
 +      return 0;
  }
  
  #ifdef CONFIG_SPARSE_IRQ
@@@ -1349,8 -1400,10 +1404,8 @@@ void __setup_vector_irq(int cpu
  
        /* Mark the inuse vectors */
        for_each_irq_desc(irq, desc) {
 -              if (!desc)
 -                      continue;
                cfg = desc->chip_data;
-               if (!cpu_isset(cpu, cfg->domain))
+               if (!cpumask_test_cpu(cpu, cfg->domain))
                        continue;
                vector = cfg->vector;
                per_cpu(vector_irq, cpu)[vector] = irq;
Simple merge
Simple merge
Simple merge
Simple merge
Simple merge
Simple merge
Simple merge
Simple merge
Simple merge
Simple merge
Simple merge
Simple merge
Simple merge
Simple merge
Simple merge
diff --cc init/Kconfig
Simple merge
Simple merge
Simple merge
diff --cc kernel/sched.c
@@@ -5426,8 -5477,17 +5495,16 @@@ long sched_setaffinity(pid_t pid, cons
        get_task_struct(p);
        read_unlock(&tasklist_lock);
  
+       if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) {
+               retval = -ENOMEM;
+               goto out_put_task;
+       }
+       if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) {
+               retval = -ENOMEM;
+               goto out_free_cpus_allowed;
+       }
        retval = -EPERM;
 -      if ((current->euid != p->euid) && (current->euid != p->uid) &&
 -                      !capable(CAP_SYS_NICE))
 +      if (!check_same_owner(p) && !capable(CAP_SYS_NICE))
                goto out_unlock;
  
        retval = security_task_setscheduler(p, 0, NULL);
Simple merge
Simple merge
Simple merge
@@@ -282,31 -282,8 +282,31 @@@ void tick_nohz_stop_sched_tick(int inid
        /* Schedule the tick, if we are at least one jiffie off */
        if ((long)delta_jiffies >= 1) {
  
 +              /*
 +              * calculate the expiry time for the next timer wheel
 +              * timer
 +              */
 +              expires = ktime_add_ns(last_update, tick_period.tv64 *
 +                                 delta_jiffies);
 +
 +              /*
 +               * If this cpu is the one which updates jiffies, then
 +               * give up the assignment and let it be taken by the
 +               * cpu which runs the tick timer next, which might be
 +               * this cpu as well. If we don't drop this here the
 +               * jiffies might be stale and do_timer() never
 +               * invoked.
 +               */
 +              if (cpu == tick_do_timer_cpu)
 +                      tick_do_timer_cpu = TICK_DO_TIMER_NONE;
 +
                if (delta_jiffies > 1)
-                       cpu_set(cpu, nohz_cpu_mask);
+                       cpumask_set_cpu(cpu, nohz_cpu_mask);
 +
 +              /* Skip reprogram of event if its not changed */
 +              if (ts->tick_stopped && ktime_equal(expires, dev->next_event))
 +                      goto out;
 +
                /*
                 * nohz_stop_sched_tick can be called several times before
                 * the nohz_restart_sched_tick is called. This happens when
Simple merge
diff --cc lib/Kconfig
Simple merge
diff --cc mm/slub.c
Simple merge