Merge branch 'merge' of git://git.kernel.org/pub/scm/linux/kernel/git/benh/powerpc
Linus Torvalds [Fri, 25 Jul 2008 18:08:17 +0000 (11:08 -0700)]
* 'merge' of git://git.kernel.org/pub/scm/linux/kernel/git/benh/powerpc: (34 commits)
  powerpc: Wireup new syscalls
  Move update_mmu_cache() declaration from tlbflush.h to pgtable.h
  powerpc/pseries: Remove kmalloc call in handling writes to lparcfg
  powerpc/pseries: Update arch vector to indicate support for CMO
  ibmvfc: Add support for collaborative memory overcommit
  ibmvscsi: driver enablement for CMO
  ibmveth: enable driver for CMO
  ibmveth: Automatically enable larger rx buffer pools for larger mtu
  powerpc/pseries: Verify CMO memory entitlement updates with virtual I/O
  powerpc/pseries: vio bus support for CMO
  powerpc/pseries: iommu enablement for CMO
  powerpc/pseries: Add CMO paging statistics
  powerpc/pseries: Add collaborative memory manager
  powerpc/pseries: Utilities to set firmware page state
  powerpc/pseries: Enable CMO feature during platform setup
  powerpc/pseries: Split retrieval of processor entitlement data into a helper routine
  powerpc/pseries: Add memory entitlement capabilities to /proc/ppc64/lparcfg
  powerpc/pseries: Split processor entitlement retrieval and gathering to helper routines
  powerpc/pseries: Remove extraneous error reporting for hcall failures in lparcfg
  powerpc: Fix compile error with binutils 2.15
  ...

Fixed up conflict in arch/powerpc/platforms/52xx/Kconfig manually.

49 files changed:
Documentation/powerpc/booting-without-of.txt
arch/powerpc/kernel/cputable.c
arch/powerpc/kernel/entry_32.S
arch/powerpc/kernel/iommu.c
arch/powerpc/kernel/lparcfg.c
arch/powerpc/kernel/process.c
arch/powerpc/kernel/prom_init.c
arch/powerpc/kernel/ptrace.c
arch/powerpc/kernel/signal.c
arch/powerpc/kernel/sysfs.c
arch/powerpc/kernel/traps.c
arch/powerpc/kernel/vio.c
arch/powerpc/kernel/vmlinux.lds.S
arch/powerpc/mm/fault.c
arch/powerpc/platforms/52xx/Kconfig
arch/powerpc/platforms/cell/iommu.c
arch/powerpc/platforms/cell/spufs/sched.c
arch/powerpc/platforms/cell/spufs/sputrace.c
arch/powerpc/platforms/iseries/iommu.c
arch/powerpc/platforms/pasemi/iommu.c
arch/powerpc/platforms/pseries/Kconfig
arch/powerpc/platforms/pseries/Makefile
arch/powerpc/platforms/pseries/cmm.c [new file with mode: 0644]
arch/powerpc/platforms/pseries/iommu.c
arch/powerpc/platforms/pseries/plpar_wrappers.h
arch/powerpc/platforms/pseries/setup.c
arch/powerpc/sysdev/dart_iommu.c
drivers/net/ibmveth.c
drivers/net/ibmveth.h
drivers/of/of_i2c.c
drivers/scsi/ibmvscsi/ibmvfc.c
drivers/scsi/ibmvscsi/ibmvscsi.c
drivers/scsi/ibmvscsi/ibmvscsi.h
fs/binfmt_elf.c
include/asm-powerpc/cputable.h
include/asm-powerpc/elf.h
include/asm-powerpc/firmware.h
include/asm-powerpc/hvcall.h
include/asm-powerpc/lppaca.h
include/asm-powerpc/machdep.h
include/asm-powerpc/mpc52xx_psc.h
include/asm-powerpc/pgtable.h
include/asm-powerpc/syscalls.h
include/asm-powerpc/systbl.h
include/asm-powerpc/system.h
include/asm-powerpc/tlbflush.h
include/asm-powerpc/unistd.h
include/asm-powerpc/vio.h
include/linux/auxvec.h

diff --git a/Documentation/powerpc/booting-without-of.txt b/Documentation/powerpc/booting-without-of.txt
index ea1b70b..99514ce 100644
@@ -59,6 +59,7 @@ Table of Contents
       p) Freescale Synchronous Serial Interface
          q) USB EHCI controllers
       r) MDIO on GPIOs
+      s) SPI busses
 
   VII - Marvell Discovery mv64[345]6x System Controller chips
     1) The /system-controller node
@@ -1883,6 +1884,62 @@ platforms are moved over to use the flattened-device-tree model.
                         &qe_pio_c 6>;
        };
 
+    s) SPI (Serial Peripheral Interface) busses
+
+    SPI busses can be described with a node for the SPI master device
+    and a child node for each SPI slave on the bus.  For this
+    discussion, it is assumed that the system's SPI controller is in
+    SPI master mode.  This binding does not describe SPI controllers
+    in slave mode.
+
+    The SPI master node requires the following properties:
+    - #address-cells  - number of cells required to define a chip select
+                       address on the SPI bus.
+    - #size-cells     - should be zero.
+    - compatible      - name of SPI bus controller following generic names
+                       recommended practice.
+    No other properties are required in the SPI bus node.  It is assumed
+    that a driver for an SPI bus device will understand that it is an SPI bus.
+    However, the binding does not attempt to define the specific method for
+    assigning chip select numbers.  Since SPI chip select configuration is
+    flexible and non-standardized, it is left out of this binding with the
+    assumption that board-specific platform code will be used to manage
+    chip selects.  Individual drivers can define additional properties to
+    support describing the chip select layout.
+
+    SPI slave nodes must be children of the SPI master node and can
+    contain the following properties.
+    - reg             - (required) chip select address of device.
+    - compatible      - (required) name of SPI device following generic names
+                       recommended practice
+    - spi-max-frequency - (required) Maximum SPI clocking speed of device in Hz
+    - spi-cpol        - (optional) Empty property indicating device requires
+                       inverse clock polarity (CPOL) mode
+    - spi-cpha        - (optional) Empty property indicating device requires
+                       shifted clock phase (CPHA) mode
+
+    SPI example for an MPC5200 SPI bus:
+               spi@f00 {
+                       #address-cells = <1>;
+                       #size-cells = <0>;
+                       compatible = "fsl,mpc5200b-spi","fsl,mpc5200-spi";
+                       reg = <0xf00 0x20>;
+                       interrupts = <2 13 0 2 14 0>;
+                       interrupt-parent = <&mpc5200_pic>;
+
+                       ethernet-switch@0 {
+                               compatible = "micrel,ks8995m";
+                               spi-max-frequency = <1000000>;
+                               reg = <0>;
+                       };
+
+                       codec@1 {
+                               compatible = "ti,tlv320aic26";
+                               spi-max-frequency = <100000>;
+                               reg = <1>;
+                       };
+               };
+
 VII - Marvell Discovery mv64[345]6x System Controller chips
 ===========================================================
 
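As a rough illustration of consuming the binding above: platform or driver
code can read the required "spi-max-frequency" cell of a slave node with the
existing of_get_property() helper.  A minimal sketch, assuming slave_node
(the child device_node) and spi (the struct spi_device being set up) are
supplied by the caller:

	const u32 *freq;	/* property cells are big-endian u32 */
	int len;

	freq = of_get_property(slave_node, "spi-max-frequency", &len);
	if (freq && len >= (int)sizeof(*freq))
		spi->max_speed_hz = *freq;
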
diff --git a/arch/powerpc/kernel/cputable.c b/arch/powerpc/kernel/cputable.c
index b936a1d..25a052c 100644
@@ -23,6 +23,9 @@
 struct cpu_spec* cur_cpu_spec = NULL;
 EXPORT_SYMBOL(cur_cpu_spec);
 
+/* The platform string corresponding to the real PVR */
+const char *powerpc_base_platform;
+
 /* NOTE:
  * Unlike ppc32, ppc64 will only call this once for the boot CPU, it's
  * the responsibility of the appropriate CPU save/restore functions to
@@ -1652,6 +1655,14 @@ struct cpu_spec * __init identify_cpu(unsigned long offset, unsigned int pvr)
                        } else
                                *t = *s;
                        *PTRRELOC(&cur_cpu_spec) = &the_cpu_spec;
+
+                       /*
+                        * Set the base platform string once; assumes
+                        * we're called with real pvr first.
+                        */
+                       if (powerpc_base_platform == NULL)
+                               powerpc_base_platform = t->platform;
+
 #if defined(CONFIG_PPC64) || defined(CONFIG_BOOKE)
                        /* ppc64 and booke expect identify_cpu to also call
                         * setup_cpu for that processor. I will consolidate
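The new powerpc_base_platform string is what this series exposes to userspace
through the AT_BASE_PLATFORM auxiliary vector entry (see include/asm-powerpc/elf.h
and include/linux/auxvec.h in the diffstat).  An illustrative userspace reader;
the fallback define covers a libc whose <elf.h> predates the new entry:

	#include <elf.h>
	#include <stdio.h>

	#ifndef AT_BASE_PLATFORM
	#define AT_BASE_PLATFORM 24	/* include/linux/auxvec.h in this merge */
	#endif

	extern char **environ;

	int main(void)
	{
		char **p = environ;
		Elf64_auxv_t *aux;

		while (*p)	/* the aux vector follows envp's NULL */
			p++;
		for (aux = (Elf64_auxv_t *)(p + 1); aux->a_type != AT_NULL; aux++)
			if (aux->a_type == AT_BASE_PLATFORM)
				printf("base platform: %s\n",
				       (char *)aux->a_un.a_val);
		return 0;
	}
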
diff --git a/arch/powerpc/kernel/entry_32.S b/arch/powerpc/kernel/entry_32.S
index da52269..81c8324 100644
@@ -148,7 +148,7 @@ transfer_to_handler:
        /* Check to see if the dbcr0 register is set up to debug.  Use the
           internal debug mode bit to do this. */
        lwz     r12,THREAD_DBCR0(r12)
-       andis.  r12,r12,DBCR0_IDM@h
+       andis.  r12,r12,(DBCR0_IDM | DBSR_DAC1R | DBSR_DAC1W)@h
        beq+    3f
        /* From user and task is ptraced - load up global dbcr0 */
        li      r12,-1                  /* clear all pending debug events */
@@ -292,7 +292,7 @@ syscall_exit_cont:
        /* If the process has its own DBCR0 value, load it up.  The internal
           debug mode bit tells us that dbcr0 should be loaded. */
        lwz     r0,THREAD+THREAD_DBCR0(r2)
-       andis.  r10,r0,DBCR0_IDM@h
+       andis.  r10,r0,(DBCR0_IDM | DBSR_DAC1R | DBSR_DAC1W)@h
        bnel-   load_dbcr0
 #endif
 #ifdef CONFIG_44x
@@ -720,7 +720,7 @@ restore_user:
        /* Check whether this process has its own DBCR0 value.  The internal
           debug mode bit tells us that dbcr0 should be loaded. */
        lwz     r0,THREAD+THREAD_DBCR0(r2)
-       andis.  r10,r0,DBCR0_IDM@h
+       andis.  r10,r0,(DBCR0_IDM | DBSR_DAC1R | DBSR_DAC1W)@h
        bnel-   load_dbcr0
 #endif
 
diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
index 2385f68..550a193 100644
@@ -49,6 +49,8 @@ static int novmerge = 1;
 
 static int protect4gb = 1;
 
+static void __iommu_free(struct iommu_table *, dma_addr_t, unsigned int);
+
 static inline unsigned long iommu_num_pages(unsigned long vaddr,
                                            unsigned long slen)
 {
@@ -191,6 +193,7 @@ static dma_addr_t iommu_alloc(struct device *dev, struct iommu_table *tbl,
 {
        unsigned long entry, flags;
        dma_addr_t ret = DMA_ERROR_CODE;
+       int build_fail;
 
        spin_lock_irqsave(&(tbl->it_lock), flags);
 
@@ -205,9 +208,21 @@ static dma_addr_t iommu_alloc(struct device *dev, struct iommu_table *tbl,
        ret = entry << IOMMU_PAGE_SHIFT;        /* Set the return dma address */
 
        /* Put the TCEs in the HW table */
-       ppc_md.tce_build(tbl, entry, npages, (unsigned long)page & IOMMU_PAGE_MASK,
-                        direction, attrs);
+       build_fail = ppc_md.tce_build(tbl, entry, npages,
+                                     (unsigned long)page & IOMMU_PAGE_MASK,
+                                     direction, attrs);
+
+       /* ppc_md.tce_build() only returns non-zero for transient errors.
+        * Clean up the table bitmap in this case and return
+        * DMA_ERROR_CODE. For all other errors the functionality is
+        * not altered.
+        */
+       if (unlikely(build_fail)) {
+               __iommu_free(tbl, ret, npages);
 
+               spin_unlock_irqrestore(&(tbl->it_lock), flags);
+               return DMA_ERROR_CODE;
+       }
 
        /* Flush/invalidate TLB caches if necessary */
        if (ppc_md.tce_flush)
@@ -276,7 +291,7 @@ int iommu_map_sg(struct device *dev, struct iommu_table *tbl,
        dma_addr_t dma_next = 0, dma_addr;
        unsigned long flags;
        struct scatterlist *s, *outs, *segstart;
-       int outcount, incount, i;
+       int outcount, incount, i, build_fail = 0;
        unsigned int align;
        unsigned long handle;
        unsigned int max_seg_size;
@@ -337,8 +352,11 @@ int iommu_map_sg(struct device *dev, struct iommu_table *tbl,
                            npages, entry, dma_addr);
 
                /* Insert into HW table */
-               ppc_md.tce_build(tbl, entry, npages, vaddr & IOMMU_PAGE_MASK,
-                                direction, attrs);
+               build_fail = ppc_md.tce_build(tbl, entry, npages,
+                                             vaddr & IOMMU_PAGE_MASK,
+                                             direction, attrs);
+               if (unlikely(build_fail))
+                       goto failure;
 
                /* If we are in an open segment, try merging */
                if (segstart != s) {
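
The iommu_alloc() and iommu_map_sg() changes above depend on ppc_md.tce_build()
now returning an int, non-zero only for transient errors.  A minimal sketch of
a conforming hook, loosely patterned on the pseries tce_build path
(plpar_tce_put(), the TCE_* constants and virt_to_abs() are existing powerpc
definitions; treating every hcall failure as transient is an assumption of
this sketch, not something the patch mandates):

	static int example_tce_build(struct iommu_table *tbl, long index,
				     long npages, unsigned long uaddr,
				     enum dma_data_direction direction,
				     struct dma_attrs *attrs)
	{
		u64 proto_tce = TCE_PCI_READ, tce;	/* reads always allowed */
		long rc = 0;

		if (direction != DMA_TO_DEVICE)
			proto_tce |= TCE_PCI_WRITE;

		while (npages-- && !rc) {
			tce = proto_tce | ((virt_to_abs(uaddr) >> TCE_SHIFT) &
					   TCE_RPN_MASK) << TCE_RPN_SHIFT;
			rc = plpar_tce_put((u64)tbl->it_index,
					   (u64)index << IOMMU_PAGE_SHIFT, tce);
			index++;
			uaddr += IOMMU_PAGE_SIZE;
		}
		return rc ? -EIO : 0;	/* non-zero: caller unwinds the bitmap */
	}
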
diff --git a/arch/powerpc/kernel/lparcfg.c b/arch/powerpc/kernel/lparcfg.c
index 827a572..9f856a0 100644
@@ -34,8 +34,9 @@
 #include <asm/time.h>
 #include <asm/prom.h>
 #include <asm/vdso_datapage.h>
+#include <asm/vio.h>
 
-#define MODULE_VERS "1.7"
+#define MODULE_VERS "1.8"
 #define MODULE_NAME "lparcfg"
 
 /* #define LPARCFG_DEBUG */
@@ -129,32 +130,46 @@ static int iseries_lparcfg_data(struct seq_file *m, void *v)
 /*
  * Methods used to fetch LPAR data when running on a pSeries platform.
  */
-static void log_plpar_hcall_return(unsigned long rc, char *tag)
+/**
+ * h_get_mpp
+ * H_GET_MPP hcall returns info in 7 parms
+ */
+int h_get_mpp(struct hvcall_mpp_data *mpp_data)
 {
-       switch(rc) {
-       case 0:
-               return;
-       case H_HARDWARE:
-               printk(KERN_INFO "plpar-hcall (%s) "
-                               "Hardware fault\n", tag);
-               return;
-       case H_FUNCTION:
-               printk(KERN_INFO "plpar-hcall (%s) "
-                               "Function not allowed\n", tag);
-               return;
-       case H_AUTHORITY:
-               printk(KERN_INFO "plpar-hcall (%s) "
-                               "Not authorized to this function\n", tag);
-               return;
-       case H_PARAMETER:
-               printk(KERN_INFO "plpar-hcall (%s) "
-                               "Bad parameter(s)\n",tag);
-               return;
-       default:
-               printk(KERN_INFO "plpar-hcall (%s) "
-                               "Unexpected rc(0x%lx)\n", tag, rc);
-       }
+       int rc;
+       unsigned long retbuf[PLPAR_HCALL9_BUFSIZE];
+
+       rc = plpar_hcall9(H_GET_MPP, retbuf);
+
+       mpp_data->entitled_mem = retbuf[0];
+       mpp_data->mapped_mem = retbuf[1];
+
+       mpp_data->group_num = (retbuf[2] >> 2 * 8) & 0xffff;
+       mpp_data->pool_num = retbuf[2] & 0xffff;
+
+       mpp_data->mem_weight = (retbuf[3] >> 7 * 8) & 0xff;
+       mpp_data->unallocated_mem_weight = (retbuf[3] >> 6 * 8) & 0xff;
+       mpp_data->unallocated_entitlement = retbuf[3] & 0xffffffffffff;
+
+       mpp_data->pool_size = retbuf[4];
+       mpp_data->loan_request = retbuf[5];
+       mpp_data->backing_mem = retbuf[6];
+
+       return rc;
 }
+EXPORT_SYMBOL(h_get_mpp);
+
+struct hvcall_ppp_data {
+       u64     entitlement;
+       u64     unallocated_entitlement;
+       u16     group_num;
+       u16     pool_num;
+       u8      capped;
+       u8      weight;
+       u8      unallocated_weight;
+       u16     active_procs_in_pool;
+       u16     active_system_procs;
+};
 
 /*
  * H_GET_PPP hcall returns info in 4 parms.
@@ -176,27 +191,30 @@ static void log_plpar_hcall_return(unsigned long rc, char *tag)
  *              XXXX - Active processors in Physical Processor Pool.
  *                  XXXX  - Processors active on platform.
  */
-static unsigned int h_get_ppp(unsigned long *entitled,
-                             unsigned long *unallocated,
-                             unsigned long *aggregation,
-                             unsigned long *resource)
+static unsigned int h_get_ppp(struct hvcall_ppp_data *ppp_data)
 {
        unsigned long rc;
        unsigned long retbuf[PLPAR_HCALL_BUFSIZE];
 
        rc = plpar_hcall(H_GET_PPP, retbuf);
 
-       *entitled = retbuf[0];
-       *unallocated = retbuf[1];
-       *aggregation = retbuf[2];
-       *resource = retbuf[3];
+       ppp_data->entitlement = retbuf[0];
+       ppp_data->unallocated_entitlement = retbuf[1];
+
+       ppp_data->group_num = (retbuf[2] >> 2 * 8) & 0xffff;
+       ppp_data->pool_num = retbuf[2] & 0xffff;
 
-       log_plpar_hcall_return(rc, "H_GET_PPP");
+       ppp_data->capped = (retbuf[3] >> 6 * 8) & 0x01;
+       ppp_data->weight = (retbuf[3] >> 5 * 8) & 0xff;
+       ppp_data->unallocated_weight = (retbuf[3] >> 4 * 8) & 0xff;
+       ppp_data->active_procs_in_pool = (retbuf[3] >> 2 * 8) & 0xffff;
+       ppp_data->active_system_procs = retbuf[3] & 0xffff;
 
        return rc;
 }
 
-static void h_pic(unsigned long *pool_idle_time, unsigned long *num_procs)
+static unsigned h_pic(unsigned long *pool_idle_time,
+                     unsigned long *num_procs)
 {
        unsigned long rc;
        unsigned long retbuf[PLPAR_HCALL_BUFSIZE];
@@ -206,8 +224,87 @@ static void h_pic(unsigned long *pool_idle_time, unsigned long *num_procs)
        *pool_idle_time = retbuf[0];
        *num_procs = retbuf[1];
 
-       if (rc != H_AUTHORITY)
-               log_plpar_hcall_return(rc, "H_PIC");
+       return rc;
+}
+
+/*
+ * parse_ppp_data
+ * Parse out the data returned from h_get_ppp and h_pic
+ */
+static void parse_ppp_data(struct seq_file *m)
+{
+       struct hvcall_ppp_data ppp_data;
+       int rc;
+
+       rc = h_get_ppp(&ppp_data);
+       if (rc)
+               return;
+
+       seq_printf(m, "partition_entitled_capacity=%ld\n",
+                  ppp_data.entitlement);
+       seq_printf(m, "group=%d\n", ppp_data.group_num);
+       seq_printf(m, "system_active_processors=%d\n",
+                  ppp_data.active_system_procs);
+
+       /* pool-related entries are appropriate for shared configs */
+       if (lppaca[0].shared_proc) {
+               unsigned long pool_idle_time, pool_procs;
+
+               seq_printf(m, "pool=%d\n", ppp_data.pool_num);
+
+               /* report pool_capacity in percentage */
+               seq_printf(m, "pool_capacity=%d\n",
+                          ppp_data.active_procs_in_pool * 100);
+
+               h_pic(&pool_idle_time, &pool_procs);
+               seq_printf(m, "pool_idle_time=%ld\n", pool_idle_time);
+               seq_printf(m, "pool_num_procs=%ld\n", pool_procs);
+       }
+
+       seq_printf(m, "unallocated_capacity_weight=%d\n",
+                  ppp_data.unallocated_weight);
+       seq_printf(m, "capacity_weight=%d\n", ppp_data.weight);
+       seq_printf(m, "capped=%d\n", ppp_data.capped);
+       seq_printf(m, "unallocated_capacity=%ld\n",
+                  ppp_data.unallocated_entitlement);
+}
+
+/**
+ * parse_mpp_data
+ * Parse out data returned from h_get_mpp
+ */
+static void parse_mpp_data(struct seq_file *m)
+{
+       struct hvcall_mpp_data mpp_data;
+       int rc;
+
+       rc = h_get_mpp(&mpp_data);
+       if (rc)
+               return;
+
+       seq_printf(m, "entitled_memory=%ld\n", mpp_data.entitled_mem);
+
+       if (mpp_data.mapped_mem != -1)
+               seq_printf(m, "mapped_entitled_memory=%ld\n",
+                          mpp_data.mapped_mem);
+
+       seq_printf(m, "entitled_memory_group_number=%d\n", mpp_data.group_num);
+       seq_printf(m, "entitled_memory_pool_number=%d\n", mpp_data.pool_num);
+
+       seq_printf(m, "entitled_memory_weight=%d\n", mpp_data.mem_weight);
+       seq_printf(m, "unallocated_entitled_memory_weight=%d\n",
+                  mpp_data.unallocated_mem_weight);
+       seq_printf(m, "unallocated_io_mapping_entitlement=%ld\n",
+                  mpp_data.unallocated_entitlement);
+
+       if (mpp_data.pool_size != -1)
+               seq_printf(m, "entitled_memory_pool_size=%ld bytes\n",
+                          mpp_data.pool_size);
+
+       seq_printf(m, "entitled_memory_loan_request=%ld\n",
+                  mpp_data.loan_request);
+
+       seq_printf(m, "backing_memory=%ld bytes\n", mpp_data.backing_mem);
 }
 
 #define SPLPAR_CHARACTERISTICS_TOKEN 20
@@ -313,6 +410,25 @@ static int lparcfg_count_active_processors(void)
        return count;
 }
 
+static void pseries_cmo_data(struct seq_file *m)
+{
+       int cpu;
+       unsigned long cmo_faults = 0;
+       unsigned long cmo_fault_time = 0;
+
+       if (!firmware_has_feature(FW_FEATURE_CMO))
+               return;
+
+       for_each_possible_cpu(cpu) {
+               cmo_faults += lppaca[cpu].cmo_faults;
+               cmo_fault_time += lppaca[cpu].cmo_fault_time;
+       }
+
+       seq_printf(m, "cmo_faults=%lu\n", cmo_faults);
+       seq_printf(m, "cmo_fault_time_usec=%lu\n",
+                  cmo_fault_time / tb_ticks_per_usec);
+}
+
 static int pseries_lparcfg_data(struct seq_file *m, void *v)
 {
        int partition_potential_processors;
@@ -334,60 +450,13 @@ static int pseries_lparcfg_data(struct seq_file *m, void *v)
        partition_active_processors = lparcfg_count_active_processors();
 
        if (firmware_has_feature(FW_FEATURE_SPLPAR)) {
-               unsigned long h_entitled, h_unallocated;
-               unsigned long h_aggregation, h_resource;
-               unsigned long pool_idle_time, pool_procs;
-               unsigned long purr;
-
-               h_get_ppp(&h_entitled, &h_unallocated, &h_aggregation,
-                         &h_resource);
-
-               seq_printf(m, "R4=0x%lx\n", h_entitled);
-               seq_printf(m, "R5=0x%lx\n", h_unallocated);
-               seq_printf(m, "R6=0x%lx\n", h_aggregation);
-               seq_printf(m, "R7=0x%lx\n", h_resource);
-
-               purr = get_purr();
-
                /* this call handles the ibm,get-system-parameter contents */
                parse_system_parameter_string(m);
+               parse_ppp_data(m);
+               parse_mpp_data(m);
+               pseries_cmo_data(m);
 
-               seq_printf(m, "partition_entitled_capacity=%ld\n", h_entitled);
-
-               seq_printf(m, "group=%ld\n", (h_aggregation >> 2 * 8) & 0xffff);
-
-               seq_printf(m, "system_active_processors=%ld\n",
-                          (h_resource >> 0 * 8) & 0xffff);
-
-               /* pool related entries are apropriate for shared configs */
-               if (lppaca[0].shared_proc) {
-
-                       h_pic(&pool_idle_time, &pool_procs);
-
-                       seq_printf(m, "pool=%ld\n",
-                                  (h_aggregation >> 0 * 8) & 0xffff);
-
-                       /* report pool_capacity in percentage */
-                       seq_printf(m, "pool_capacity=%ld\n",
-                                  ((h_resource >> 2 * 8) & 0xffff) * 100);
-
-                       seq_printf(m, "pool_idle_time=%ld\n", pool_idle_time);
-
-                       seq_printf(m, "pool_num_procs=%ld\n", pool_procs);
-               }
-
-               seq_printf(m, "unallocated_capacity_weight=%ld\n",
-                          (h_resource >> 4 * 8) & 0xFF);
-
-               seq_printf(m, "capacity_weight=%ld\n",
-                          (h_resource >> 5 * 8) & 0xFF);
-
-               seq_printf(m, "capped=%ld\n", (h_resource >> 6 * 8) & 0x01);
-
-               seq_printf(m, "unallocated_capacity=%ld\n", h_unallocated);
-
-               seq_printf(m, "purr=%ld\n", purr);
-
+               seq_printf(m, "purr=%ld\n", get_purr());
        } else {                /* non SPLPAR case */
 
                seq_printf(m, "system_active_processors=%d\n",
@@ -414,6 +483,83 @@ static int pseries_lparcfg_data(struct seq_file *m, void *v)
        return 0;
 }
 
+static ssize_t update_ppp(u64 *entitlement, u8 *weight)
+{
+       struct hvcall_ppp_data ppp_data;
+       u8 new_weight;
+       u64 new_entitled;
+       ssize_t retval;
+
+       /* Get our current parameters */
+       retval = h_get_ppp(&ppp_data);
+       if (retval)
+               return retval;
+
+       if (entitlement) {
+               new_weight = ppp_data.weight;
+               new_entitled = *entitlement;
+       } else if (weight) {
+               new_weight = *weight;
+               new_entitled = ppp_data.entitlement;
+       } else
+               return -EINVAL;
+
+       pr_debug("%s: current_entitled = %lu, current_weight = %u\n",
+                __func__, ppp_data.entitlement, ppp_data.weight);
+
+       pr_debug("%s: new_entitled = %lu, new_weight = %u\n",
+                __func__, new_entitled, new_weight);
+
+       retval = plpar_hcall_norets(H_SET_PPP, new_entitled, new_weight);
+       return retval;
+}
+
+/**
+ * update_mpp
+ *
+ * Update the memory entitlement and weight for the partition.  The caller
+ * must specify either a new entitlement or a new weight, not both; since
+ * the H_SET_MPP hcall takes both parameters, the current value of whichever
+ * one is unchanged is fetched via h_get_mpp().
+ */
+static ssize_t update_mpp(u64 *entitlement, u8 *weight)
+{
+       struct hvcall_mpp_data mpp_data;
+       u64 new_entitled;
+       u8 new_weight;
+       ssize_t rc;
+
+       if (entitlement) {
+               /* Check with vio to ensure the new memory entitlement
+                * can be handled.
+                */
+               rc = vio_cmo_entitlement_update(*entitlement);
+               if (rc)
+                       return rc;
+       }
+
+       rc = h_get_mpp(&mpp_data);
+       if (rc)
+               return rc;
+
+       if (entitlement) {
+               new_weight = mpp_data.mem_weight;
+               new_entitled = *entitlement;
+       } else if (weight) {
+               new_weight = *weight;
+               new_entitled = mpp_data.entitled_mem;
+       } else
+               return -EINVAL;
+
+       pr_debug("%s: current_entitled = %lu, current_weight = %u\n",
+                __func__, mpp_data.entitled_mem, mpp_data.mem_weight);
+
+       pr_debug("%s: new_entitled = %lu, new_weight = %u\n",
+                __func__, new_entitled, new_weight);
+
+       rc = plpar_hcall_norets(H_SET_MPP, new_entitled, new_weight);
+       return rc;
+}
+
 /*
  * Interface for changing system parameters (variable capacity weight
  * and entitled capacity).  Format of input is "param_name=value";
@@ -427,35 +573,27 @@ static int pseries_lparcfg_data(struct seq_file *m, void *v)
 static ssize_t lparcfg_write(struct file *file, const char __user * buf,
                             size_t count, loff_t * off)
 {
-       char *kbuf;
+       int kbuf_sz = 64;
+       char kbuf[kbuf_sz];
        char *tmp;
        u64 new_entitled, *new_entitled_ptr = &new_entitled;
        u8 new_weight, *new_weight_ptr = &new_weight;
-
-       unsigned long current_entitled; /* parameters for h_get_ppp */
-       unsigned long dummy;
-       unsigned long resource;
-       u8 current_weight;
-
-       ssize_t retval = -ENOMEM;
+       ssize_t retval;
 
        if (!firmware_has_feature(FW_FEATURE_SPLPAR) ||
                        firmware_has_feature(FW_FEATURE_ISERIES))
                return -EINVAL;
 
-       kbuf = kmalloc(count, GFP_KERNEL);
-       if (!kbuf)
-               goto out;
+       if (count > kbuf_sz)
+               return -EINVAL;
 
-       retval = -EFAULT;
        if (copy_from_user(kbuf, buf, count))
-               goto out;
+               return -EFAULT;
 
-       retval = -EINVAL;
        kbuf[count - 1] = '\0';
        tmp = strchr(kbuf, '=');
        if (!tmp)
-               goto out;
+               return -EINVAL;
 
        *tmp++ = '\0';
 
@@ -463,34 +601,32 @@ static ssize_t lparcfg_write(struct file *file, const char __user * buf,
                char *endp;
                *new_entitled_ptr = (u64) simple_strtoul(tmp, &endp, 10);
                if (endp == tmp)
-                       goto out;
-               new_weight_ptr = &current_weight;
+                       return -EINVAL;
+
+               retval = update_ppp(new_entitled_ptr, NULL);
        } else if (!strcmp(kbuf, "capacity_weight")) {
                char *endp;
                *new_weight_ptr = (u8) simple_strtoul(tmp, &endp, 10);
                if (endp == tmp)
-                       goto out;
-               new_entitled_ptr = &current_entitled;
-       } else
-               goto out;
-
-       /* Get our current parameters */
-       retval = h_get_ppp(&current_entitled, &dummy, &dummy, &resource);
-       if (retval) {
-               retval = -EIO;
-               goto out;
-       }
-
-       current_weight = (resource >> 5 * 8) & 0xFF;
+                       return -EINVAL;
 
-       pr_debug("%s: current_entitled = %lu, current_weight = %u\n",
-                __func__, current_entitled, current_weight);
+               retval = update_ppp(NULL, new_weight_ptr);
+       } else if (!strcmp(kbuf, "entitled_memory")) {
+               char *endp;
+               *new_entitled_ptr = (u64) simple_strtoul(tmp, &endp, 10);
+               if (endp == tmp)
+                       return -EINVAL;
 
-       pr_debug("%s: new_entitled = %lu, new_weight = %u\n",
-                __func__, *new_entitled_ptr, *new_weight_ptr);
+               retval = update_mpp(new_entitled_ptr, NULL);
+       } else if (!strcmp(kbuf, "entitled_memory_weight")) {
+               char *endp;
+               *new_weight_ptr = (u8) simple_strtoul(tmp, &endp, 10);
+               if (endp == tmp)
+                       return -EINVAL;
 
-       retval = plpar_hcall_norets(H_SET_PPP, *new_entitled_ptr,
-                                   *new_weight_ptr);
+               retval = update_mpp(NULL, new_weight_ptr);
+       } else
+               return -EINVAL;
 
        if (retval == H_SUCCESS || retval == H_CONSTRAINED) {
                retval = count;
@@ -506,8 +642,6 @@ static ssize_t lparcfg_write(struct file *file, const char __user * buf,
                retval = -EIO;
        }
 
-out:
-       kfree(kbuf);
        return retval;
 }
 
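For reference, the reworked write path accepts "name=value" strings on
/proc/ppc64/lparcfg; "entitled_memory" and "entitled_memory_weight" are the
new CMO-only parameters.  An illustrative userspace snippet (the 2GB figure
is arbitrary, and the write succeeds only on CMO-capable firmware):

	#include <stdio.h>

	int main(void)
	{
		FILE *f = fopen("/proc/ppc64/lparcfg", "w");

		if (!f)
			return 1;
		/* parsed by lparcfg_write() -> update_mpp(); value in bytes */
		fprintf(f, "entitled_memory=2147483648\n");
		return fclose(f) != 0;
	}
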
diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c
index 219f363..db2497c 100644
@@ -47,6 +47,8 @@
 #ifdef CONFIG_PPC64
 #include <asm/firmware.h>
 #endif
+#include <linux/kprobes.h>
+#include <linux/kdebug.h>
 
 extern unsigned long _get_SP(void);
 
@@ -239,6 +241,35 @@ void discard_lazy_cpu_state(void)
 }
 #endif /* CONFIG_SMP */
 
+void do_dabr(struct pt_regs *regs, unsigned long address,
+                   unsigned long error_code)
+{
+       siginfo_t info;
+
+       if (notify_die(DIE_DABR_MATCH, "dabr_match", regs, error_code,
+                       11, SIGSEGV) == NOTIFY_STOP)
+               return;
+
+       if (debugger_dabr_match(regs))
+               return;
+
+       /* Clear the DAC and struct entries.  One shot trigger */
+#if (defined(CONFIG_44x) || defined(CONFIG_BOOKE))
+       mtspr(SPRN_DBCR0, mfspr(SPRN_DBCR0) & ~(DBSR_DAC1R | DBSR_DAC1W
+                                                       | DBCR0_IDM));
+#endif
+
+       /* Clear the DABR */
+       set_dabr(0);
+
+       /* Deliver the signal to userspace */
+       info.si_signo = SIGTRAP;
+       info.si_errno = 0;
+       info.si_code = TRAP_HWBKPT;
+       info.si_addr = (void __user *)address;
+       force_sig_info(SIGTRAP, &info, current);
+}
+
 static DEFINE_PER_CPU(unsigned long, current_dabr);
 
 int set_dabr(unsigned long dabr)
@@ -254,6 +285,11 @@ int set_dabr(unsigned long dabr)
 #if defined(CONFIG_PPC64) || defined(CONFIG_6xx)
        mtspr(SPRN_DABR, dabr);
 #endif
+
+#if defined(CONFIG_44x) || defined(CONFIG_BOOKE)
+       mtspr(SPRN_DAC1, dabr);
+#endif
+
        return 0;
 }
 
@@ -337,6 +373,12 @@ struct task_struct *__switch_to(struct task_struct *prev,
        if (unlikely(__get_cpu_var(current_dabr) != new->thread.dabr))
                set_dabr(new->thread.dabr);
 
+#if defined(CONFIG_44x) || defined(CONFIG_BOOKE)
+       /* Reload the DAC (HW breakpoint) register if the new thread has one */
+       if (new->thread.dabr)
+               set_dabr(new->thread.dabr);
+#endif
+
        new_thread = &new->thread;
        old_thread = &current->thread;
 
@@ -525,6 +567,10 @@ void flush_thread(void)
        if (current->thread.dabr) {
                current->thread.dabr = 0;
                set_dabr(0);
+
+#if defined(CONFIG_44x) || defined(CONFIG_BOOKE)
+               current->thread.dbcr0 &= ~(DBSR_DAC1R | DBSR_DAC1W);
+#endif
        }
 }
 
diff --git a/arch/powerpc/kernel/prom_init.c b/arch/powerpc/kernel/prom_init.c
index 1ea8c8d..c4ab219 100644
@@ -643,6 +643,11 @@ static void __init early_cmdline_parse(void)
 #else
 #define OV5_MSI                        0x00
 #endif /* CONFIG_PCI_MSI */
+#ifdef CONFIG_PPC_SMLPAR
+#define OV5_CMO                        0x80    /* Cooperative Memory Overcommitment */
+#else
+#define OV5_CMO                        0x00
+#endif
 
 /*
  * The architecture vector has an array of PVR mask/value pairs,
@@ -687,10 +692,12 @@ static unsigned char ibm_architecture_vec[] = {
        0,                              /* don't halt */
 
        /* option vector 5: PAPR/OF options */
-       3 - 2,                          /* length */
+       5 - 2,                          /* length */
        0,                              /* don't ignore, don't halt */
        OV5_LPAR | OV5_SPLPAR | OV5_LARGE_PAGES | OV5_DRCONF_MEMORY |
        OV5_DONATE_DEDICATE_CPU | OV5_MSI,
+       0,
+       OV5_CMO,
 };
 
 /* Old method - ELF header with PT_NOTE sections */
diff --git a/arch/powerpc/kernel/ptrace.c b/arch/powerpc/kernel/ptrace.c
index 8feb93e..a5d0e78 100644
@@ -703,7 +703,7 @@ void user_enable_single_step(struct task_struct *task)
 
        if (regs != NULL) {
 #if defined(CONFIG_40x) || defined(CONFIG_BOOKE)
-               task->thread.dbcr0 = DBCR0_IDM | DBCR0_IC;
+               task->thread.dbcr0 |= DBCR0_IDM | DBCR0_IC;
                regs->msr |= MSR_DE;
 #else
                regs->msr |= MSR_SE;
@@ -716,9 +716,16 @@ void user_disable_single_step(struct task_struct *task)
 {
        struct pt_regs *regs = task->thread.regs;
 
+
+#if defined(CONFIG_44x) || defined(CONFIG_BOOKE)
+       /* If a DAC (HW watchpoint) is set, leave the debug state alone */
+       if (task->thread.dabr)
+               return;
+#endif
+
        if (regs != NULL) {
 #if defined(CONFIG_40x) || defined(CONFIG_BOOKE)
-               task->thread.dbcr0 = 0;
+               task->thread.dbcr0 &= ~(DBCR0_IC | DBCR0_IDM);
                regs->msr &= ~MSR_DE;
 #else
                regs->msr &= ~MSR_SE;
@@ -727,22 +734,75 @@ void user_disable_single_step(struct task_struct *task)
        clear_tsk_thread_flag(task, TIF_SINGLESTEP);
 }
 
-static int ptrace_set_debugreg(struct task_struct *task, unsigned long addr,
+int ptrace_set_debugreg(struct task_struct *task, unsigned long addr,
                               unsigned long data)
 {
-       /* We only support one DABR and no IABRS at the moment */
+       /* For ppc64 we support one DABR and no IABRs at the moment.
+        *  For embedded processors we support one DAC and no IACs at the
+        *  moment.
+        */
        if (addr > 0)
                return -EINVAL;
 
-       /* The bottom 3 bits are flags */
        if ((data & ~0x7UL) >= TASK_SIZE)
                return -EIO;
 
-       /* Ensure translation is on */
+#ifdef CONFIG_PPC64
+
+       /* For processors using DABR (i.e. 970), the bottom 3 bits are flags.
+        *  It was assumed, on previous implementations, that 3 bits were
+        *  passed together with the data address, fitting the design of the
+        *  DABR register, as follows:
+        *
+        *  bit 0: Read flag
+        *  bit 1: Write flag
+        *  bit 2: Breakpoint translation
+        *
+        *  Thus, we use them accordingly here.
+        */
+
+       /* Ensure breakpoint translation bit is set */
        if (data && !(data & DABR_TRANSLATION))
                return -EIO;
 
+       /* Save the breakpoint; it is loaded into DABR at context switch */
        task->thread.dabr = data;
+
+#endif
+#if defined(CONFIG_44x) || defined(CONFIG_BOOKE)
+
+       /* As described above, it was assumed 3 bits were passed with the data
+        *  address, but we will assume only the mode bits will be passed
+        *  so as not to impose alignment restrictions on DAC-based processors.
+        */
+
+       /* DACs hold the whole address without any mode flags */
+       task->thread.dabr = data & ~0x3UL;
+
+       if (task->thread.dabr == 0) {
+               task->thread.dbcr0 &= ~(DBSR_DAC1R | DBSR_DAC1W | DBCR0_IDM);
+               task->thread.regs->msr &= ~MSR_DE;
+               return 0;
+       }
+
+       /* Read or Write bits must be set */
+
+       if (!(data & 0x3UL))
+               return -EINVAL;
+
+       /* Set the Internal Debugging flag (IDM bit 1) for the DBCR0
+          register */
+       task->thread.dbcr0 = DBCR0_IDM;
+
+       /* Check for write and read flags and set DBCR0
+          accordingly */
+       if (data & 0x1UL)
+               task->thread.dbcr0 |= DBSR_DAC1R;
+       if (data & 0x2UL)
+               task->thread.dbcr0 |= DBSR_DAC1W;
+
+       task->thread.regs->msr |= MSR_DE;
+#endif
        return 0;
 }
 
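Tracer-side usage of the extended request might look like the sketch below.
PTRACE_SET_DEBUGREG is the existing powerpc request (26 in asm-powerpc/ptrace.h,
guarded here in case the libc headers lack it); bits 0 and 1 select read and
write triggers as parsed above, and bit 2 (DABR_TRANSLATION) is required by
the ppc64 DABR path, while the 4xx/BookE path wants only bits 0-1:

	#include <sys/ptrace.h>
	#include <sys/types.h>

	#ifndef PTRACE_SET_DEBUGREG
	#define PTRACE_SET_DEBUGREG 26
	#endif

	/* minimal sketch: arm a one-shot hardware watchpoint on a DABR-based
	 * ppc64 target; drop the 0x4 translation bit for a BookE target */
	static long set_hw_watchpoint(pid_t child, unsigned long addr)
	{
		unsigned long data = (addr & ~7UL) | 0x1 | 0x2 | 0x4;

		return ptrace(PTRACE_SET_DEBUGREG, child, 0UL, data);
	}
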
diff --git a/arch/powerpc/kernel/signal.c b/arch/powerpc/kernel/signal.c
index ad55488..7aada78 100644
@@ -145,8 +145,12 @@ int do_signal(sigset_t *oldset, struct pt_regs *regs)
         * user space. The DABR will have been cleared if it
         * triggered inside the kernel.
         */
-       if (current->thread.dabr)
+       if (current->thread.dabr) {
                set_dabr(current->thread.dabr);
+#if defined(CONFIG_44x) || defined(CONFIG_BOOKE)
+               mtspr(SPRN_DBCR0, current->thread.dbcr0);
+#endif
+       }
 
        if (is32) {
                if (ka.sa.sa_flags & SA_SIGINFO)
diff --git a/arch/powerpc/kernel/sysfs.c b/arch/powerpc/kernel/sysfs.c
index aba0ba9..800e5e9 100644
@@ -529,7 +529,8 @@ static void register_nodes(void)
 #endif
 
 /* Only valid if CPU is present. */
-static ssize_t show_physical_id(struct sys_device *dev, char *buf)
+static ssize_t show_physical_id(struct sys_device *dev,
+                               struct sysdev_attribute *attr, char *buf)
 {
        struct cpu *cpu = container_of(dev, struct cpu, sysdev);
 
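The one-line prototype change here is fallout from the sysdev core now passing
the attribute pointer into show/store handlers.  For illustration, a minimal
read-only attribute under the new prototype (assumes linux/sysdev.h; the name
and payload are made up):

	static ssize_t show_example(struct sys_device *dev,
				    struct sysdev_attribute *attr, char *buf)
	{
		return sprintf(buf, "%d\n", 42);
	}
	static SYSDEV_ATTR(example, 0444, show_example, NULL);
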
diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c
index 878fbdd..81ccb8d 100644
@@ -1067,6 +1067,22 @@ void __kprobes DebugException(struct pt_regs *regs, unsigned long debug_status)
                }
 
                _exception(SIGTRAP, regs, TRAP_TRACE, regs->nip);
+       } else if (debug_status & (DBSR_DAC1R | DBSR_DAC1W)) {
+               regs->msr &= ~MSR_DE;
+
+               if (user_mode(regs)) {
+                       current->thread.dbcr0 &= ~(DBSR_DAC1R | DBSR_DAC1W |
+                                                               DBCR0_IDM);
+               } else {
+                       /* Disable DAC interrupts */
+                       mtspr(SPRN_DBCR0, mfspr(SPRN_DBCR0) & ~(DBSR_DAC1R |
+                                               DBSR_DAC1W | DBCR0_IDM));
+
+                       /* Clear the DAC event */
+                       mtspr(SPRN_DBSR, (DBSR_DAC1R | DBSR_DAC1W));
+               }
+               /* Setup and send the trap to the handler */
+               do_dabr(regs, mfspr(SPRN_DAC1), debug_status);
        }
 }
 #endif /* CONFIG_4xx || CONFIG_BOOKE */
diff --git a/arch/powerpc/kernel/vio.c b/arch/powerpc/kernel/vio.c
index b77f8af..ade8aea 100644
@@ -1,11 +1,12 @@
 /*
  * IBM PowerPC Virtual I/O Infrastructure Support.
  *
- *    Copyright (c) 2003-2005 IBM Corp.
+ *    Copyright (c) 2003,2008 IBM Corp.
  *     Dave Engebretsen engebret@us.ibm.com
  *     Santiago Leon santil@us.ibm.com
  *     Hollis Blanchard <hollisb@us.ibm.com>
  *     Stephen Rothwell
+ *     Robert Jennings <rcjenn@us.ibm.com>
  *
  *      This program is free software; you can redistribute it and/or
  *      modify it under the terms of the GNU General Public License
@@ -46,6 +47,996 @@ static struct vio_dev vio_bus_device  = { /* fake "parent" device */
        .dev.bus = &vio_bus_type,
 };
 
+#ifdef CONFIG_PPC_SMLPAR
+/**
+ * vio_cmo_pool - A pool of IO memory for CMO use
+ *
+ * @size: The size of the pool in bytes
+ * @free: The amount of free memory in the pool
+ */
+struct vio_cmo_pool {
+       size_t size;
+       size_t free;
+};
+
+/* How many ms to delay queued balance work */
+#define VIO_CMO_BALANCE_DELAY 100
+
+/* Portion out IO memory to CMO devices by this chunk size */
+#define VIO_CMO_BALANCE_CHUNK 131072
+
+/**
+ * vio_cmo_dev_entry - A device that is CMO-enabled and requires entitlement
+ *
+ * @viodev: struct vio_dev pointer
+ * @list: pointer to other devices on bus that are being tracked
+ */
+struct vio_cmo_dev_entry {
+       struct vio_dev *viodev;
+       struct list_head list;
+};
+
+/**
+ * vio_cmo - VIO bus accounting structure for CMO entitlement
+ *
+ * @lock: spinlock for entire structure
+ * @balance_q: work queue for balancing system entitlement
+ * @device_list: list of CMO-enabled devices requiring entitlement
+ * @entitled: total system entitlement in bytes
+ * @reserve: pool of memory from which devices reserve entitlement, incl. spare
+ * @excess: pool of excess entitlement not needed for device reserves or spare
+ * @spare: IO memory for device hotplug functionality
+ * @min: minimum necessary for system operation
+ * @desired: desired memory for system operation
+ * @curr: bytes currently allocated
+ * @high: high water mark for IO data usage
+ */
+struct vio_cmo {
+       spinlock_t lock;
+       struct delayed_work balance_q;
+       struct list_head device_list;
+       size_t entitled;
+       struct vio_cmo_pool reserve;
+       struct vio_cmo_pool excess;
+       size_t spare;
+       size_t min;
+       size_t desired;
+       size_t curr;
+       size_t high;
+} vio_cmo;
+
+/**
+ * vio_cmo_num_OF_devs - Count the number of OF devices that have DMA windows
+ */
+static int vio_cmo_num_OF_devs(void)
+{
+       struct device_node *node_vroot;
+       int count = 0;
+
+       /*
+        * Count the number of vdevice entries with an
+        * ibm,my-dma-window OF property
+        */
+       node_vroot = of_find_node_by_name(NULL, "vdevice");
+       if (node_vroot) {
+               struct device_node *of_node;
+               struct property *prop;
+
+               for_each_child_of_node(node_vroot, of_node) {
+                       prop = of_find_property(of_node, "ibm,my-dma-window",
+                                              NULL);
+                       if (prop)
+                               count++;
+               }
+       }
+       of_node_put(node_vroot);
+       return count;
+}
+
+/**
+ * vio_cmo_alloc - allocate IO memory for CMO-enabled devices
+ *
+ * @viodev: VIO device requesting IO memory
+ * @size: size of allocation requested
+ *
+ * Allocations come from memory reserved for the devices and any excess
+ * IO memory available to all devices.  The spare pool used to service
+ * hotplug must be equal to %VIO_CMO_MIN_ENT for the excess pool to be
+ * made available.
+ *
+ * Return codes:
+ *  0 for successful allocation and -ENOMEM for a failure
+ */
+static inline int vio_cmo_alloc(struct vio_dev *viodev, size_t size)
+{
+       unsigned long flags;
+       size_t reserve_free = 0;
+       size_t excess_free = 0;
+       int ret = -ENOMEM;
+
+       spin_lock_irqsave(&vio_cmo.lock, flags);
+
+       /* Determine the amount of free entitlement available in reserve */
+       if (viodev->cmo.entitled > viodev->cmo.allocated)
+               reserve_free = viodev->cmo.entitled - viodev->cmo.allocated;
+
+       /* If spare is not fulfilled, the excess pool cannot be used. */
+       if (vio_cmo.spare >= VIO_CMO_MIN_ENT)
+               excess_free = vio_cmo.excess.free;
+
+       /* The request can be satisfied */
+       if ((reserve_free + excess_free) >= size) {
+               vio_cmo.curr += size;
+               if (vio_cmo.curr > vio_cmo.high)
+                       vio_cmo.high = vio_cmo.curr;
+               viodev->cmo.allocated += size;
+               size -= min(reserve_free, size);
+               vio_cmo.excess.free -= size;
+               ret = 0;
+       }
+
+       spin_unlock_irqrestore(&vio_cmo.lock, flags);
+       return ret;
+}
+
+/**
+ * vio_cmo_dealloc - deallocate IO memory from CMO-enabled devices
+ * @viodev: VIO device freeing IO memory
+ * @size: size of deallocation
+ *
+ * IO memory is freed by the device back to the correct memory pools.
+ * The spare pool is replenished first from either memory pool, then
+ * the reserve pool is used to reduce device entitlement, the excess
+ * pool is used to increase the reserve pool toward the desired entitlement
+ * target, and then the remaining memory is returned to the pools.
+ *
+ */
+static inline void vio_cmo_dealloc(struct vio_dev *viodev, size_t size)
+{
+       unsigned long flags;
+       size_t spare_needed = 0;
+       size_t excess_freed = 0;
+       size_t reserve_freed = size;
+       size_t tmp;
+       int balance = 0;
+
+       spin_lock_irqsave(&vio_cmo.lock, flags);
+       vio_cmo.curr -= size;
+
+       /* Amount of memory freed from the excess pool */
+       if (viodev->cmo.allocated > viodev->cmo.entitled) {
+               excess_freed = min(reserve_freed, (viodev->cmo.allocated -
+                                                  viodev->cmo.entitled));
+               reserve_freed -= excess_freed;
+       }
+
+       /* Remove allocation from device */
+       viodev->cmo.allocated -= (reserve_freed + excess_freed);
+
+       /* Spare is a subset of the reserve pool, replenish it first. */
+       spare_needed = VIO_CMO_MIN_ENT - vio_cmo.spare;
+
+       /*
+        * Replenish the spare in the reserve pool from the excess pool.
+        * This moves entitlement into the reserve pool.
+        */
+       if (spare_needed && excess_freed) {
+               tmp = min(excess_freed, spare_needed);
+               vio_cmo.excess.size -= tmp;
+               vio_cmo.reserve.size += tmp;
+               vio_cmo.spare += tmp;
+               excess_freed -= tmp;
+               spare_needed -= tmp;
+               balance = 1;
+       }
+
+       /*
+        * Replenish the spare in the reserve pool from the reserve pool.
+        * This removes entitlement from the device down to VIO_CMO_MIN_ENT,
+        * if needed, and gives it to the spare pool. The amount of used
+        * memory in this pool does not change.
+        */
+       if (spare_needed && reserve_freed) {
+               tmp = min(spare_needed, min(reserve_freed,
+                                           (viodev->cmo.entitled -
+                                            VIO_CMO_MIN_ENT)));
+
+               vio_cmo.spare += tmp;
+               viodev->cmo.entitled -= tmp;
+               reserve_freed -= tmp;
+               spare_needed -= tmp;
+               balance = 1;
+       }
+
+       /*
+        * Increase the reserve pool until the desired allocation is met.
+        * Move an allocation freed from the excess pool into the reserve
+        * pool and schedule a balance operation.
+        */
+       if (excess_freed && (vio_cmo.desired > vio_cmo.reserve.size)) {
+               tmp = min(excess_freed, (vio_cmo.desired - vio_cmo.reserve.size));
+
+               vio_cmo.excess.size -= tmp;
+               vio_cmo.reserve.size += tmp;
+               excess_freed -= tmp;
+               balance = 1;
+       }
+
+       /* Return whatever is left to the excess pool */
+       if (excess_freed)
+               vio_cmo.excess.free += excess_freed;
+
+       if (balance)
+               schedule_delayed_work(&vio_cmo.balance_q, VIO_CMO_BALANCE_DELAY);
+       spin_unlock_irqrestore(&vio_cmo.lock, flags);
+}
+
+/**
+ * vio_cmo_entitlement_update - Manage system entitlement changes
+ *
+ * @new_entitlement: new system entitlement to attempt to accommodate
+ *
+ * Increases in entitlement will be used to fulfill the spare entitlement
+ * and the rest is given to the excess pool.  Decreases, if they are
+ * possible, come from the excess pool and from unused device entitlement
+ *
+ * Returns: 0 on success, -ENOMEM when the change cannot be made
+ */
+int vio_cmo_entitlement_update(size_t new_entitlement)
+{
+       struct vio_dev *viodev;
+       struct vio_cmo_dev_entry *dev_ent;
+       unsigned long flags;
+       size_t avail, delta, tmp;
+
+       spin_lock_irqsave(&vio_cmo.lock, flags);
+
+       /* Entitlement increases */
+       if (new_entitlement > vio_cmo.entitled) {
+               delta = new_entitlement - vio_cmo.entitled;
+
+               /* Fulfill spare allocation */
+               if (vio_cmo.spare < VIO_CMO_MIN_ENT) {
+                       tmp = min(delta, (VIO_CMO_MIN_ENT - vio_cmo.spare));
+                       vio_cmo.spare += tmp;
+                       vio_cmo.reserve.size += tmp;
+                       delta -= tmp;
+               }
+
+               /* Remaining new allocation goes to the excess pool */
+               vio_cmo.entitled += delta;
+               vio_cmo.excess.size += delta;
+               vio_cmo.excess.free += delta;
+
+               goto out;
+       }
+
+       /* Entitlement decreases */
+       delta = vio_cmo.entitled - new_entitlement;
+       avail = vio_cmo.excess.free;
+
+       /*
+        * Need to check how much unused entitlement each device can
+        * sacrifice to fulfill entitlement change.
+        */
+       list_for_each_entry(dev_ent, &vio_cmo.device_list, list) {
+               if (avail >= delta)
+                       break;
+
+               viodev = dev_ent->viodev;
+               if ((viodev->cmo.entitled > viodev->cmo.allocated) &&
+                   (viodev->cmo.entitled > VIO_CMO_MIN_ENT))
+                               avail += viodev->cmo.entitled -
+                                        max_t(size_t, viodev->cmo.allocated,
+                                              VIO_CMO_MIN_ENT);
+       }
+
+       if (delta <= avail) {
+               vio_cmo.entitled -= delta;
+
+               /* Take entitlement from the excess pool first */
+               tmp = min(vio_cmo.excess.free, delta);
+               vio_cmo.excess.size -= tmp;
+               vio_cmo.excess.free -= tmp;
+               delta -= tmp;
+
+               /*
+                * Remove all but VIO_CMO_MIN_ENT bytes from devices
+                * until entitlement change is served
+                */
+               list_for_each_entry(dev_ent, &vio_cmo.device_list, list) {
+                       if (!delta)
+                               break;
+
+                       viodev = dev_ent->viodev;
+                       tmp = 0;
+                       if ((viodev->cmo.entitled > viodev->cmo.allocated) &&
+                           (viodev->cmo.entitled > VIO_CMO_MIN_ENT))
+                               tmp = viodev->cmo.entitled -
+                                     max_t(size_t, viodev->cmo.allocated,
+                                           VIO_CMO_MIN_ENT);
+                       viodev->cmo.entitled -= min(tmp, delta);
+                       delta -= min(tmp, delta);
+               }
+       } else {
+               spin_unlock_irqrestore(&vio_cmo.lock, flags);
+               return -ENOMEM;
+       }
+
+out:
+       schedule_delayed_work(&vio_cmo.balance_q, 0);
+       spin_unlock_irqrestore(&vio_cmo.lock, flags);
+       return 0;
+}
+
+/**
+ * vio_cmo_balance - Balance entitlement among devices
+ *
+ * @work: work queue structure for this operation
+ *
+ * Any system entitlement above the minimum needed for devices, or
+ * already allocated to devices, can be distributed to the devices.
+ * The list of devices is iterated through to recalculate the desired
+ * entitlement level and to determine how much entitlement above the
+ * minimum entitlement is allocated to devices.
+ *
+ * Small chunks of the available entitlement are given to devices until
+ * their requirements are fulfilled or there is no entitlement left to give.
+ * Upon completion sizes of the reserve and excess pools are calculated.
+ *
+ * The system minimum entitlement level is also recalculated here.
+ * Entitlement will be reserved for devices even after vio_bus_remove to
+ * accommodate reloading the driver.  The OF tree is walked to count the
+ * number of devices present and this will remove entitlement for devices
+ * that have actually left the system after having vio_bus_remove called.
+ */
+static void vio_cmo_balance(struct work_struct *work)
+{
+       struct vio_cmo *cmo;
+       struct vio_dev *viodev;
+       struct vio_cmo_dev_entry *dev_ent;
+       unsigned long flags;
+       size_t avail = 0, level, chunk, need;
+       int devcount = 0, fulfilled;
+
+       cmo = container_of(work, struct vio_cmo, balance_q.work);
+
+       spin_lock_irqsave(&vio_cmo.lock, flags);
+
+       /* Calculate minimum entitlement and fulfill spare */
+       cmo->min = vio_cmo_num_OF_devs() * VIO_CMO_MIN_ENT;
+       BUG_ON(cmo->min > cmo->entitled);
+       cmo->spare = min_t(size_t, VIO_CMO_MIN_ENT, (cmo->entitled - cmo->min));
+       cmo->min += cmo->spare;
+       cmo->desired = cmo->min;
+
+       /*
+        * Determine how much entitlement is available and reset device
+        * entitlements
+        */
+       avail = cmo->entitled - cmo->spare;
+       list_for_each_entry(dev_ent, &vio_cmo.device_list, list) {
+               viodev = dev_ent->viodev;
+               devcount++;
+               viodev->cmo.entitled = VIO_CMO_MIN_ENT;
+               cmo->desired += (viodev->cmo.desired - VIO_CMO_MIN_ENT);
+               avail -= max_t(size_t, viodev->cmo.allocated, VIO_CMO_MIN_ENT);
+       }
+
+       /*
+        * Having provided each device with the minimum entitlement, loop
+        * over the devices portioning out the remaining entitlement
+        * until there is nothing left.
+        */
+       level = VIO_CMO_MIN_ENT;
+       while (avail) {
+               fulfilled = 0;
+               list_for_each_entry(dev_ent, &vio_cmo.device_list, list) {
+                       viodev = dev_ent->viodev;
+
+                       if (viodev->cmo.desired <= level) {
+                               fulfilled++;
+                               continue;
+                       }
+
+                       /*
+                        * Give the device up to VIO_CMO_BALANCE_CHUNK
+                        * bytes of entitlement, but do not exceed the
+                        * desired level of entitlement for the device.
+                        */
+                       chunk = min_t(size_t, avail, VIO_CMO_BALANCE_CHUNK);
+                       chunk = min(chunk, (viodev->cmo.desired -
+                                           viodev->cmo.entitled));
+                       viodev->cmo.entitled += chunk;
+
+                       /*
+                        * If the memory for this entitlement increase was
+                        * already allocated to the device it does not come
+                        * from the available pool being portioned out.
+                        */
+                       need = max(viodev->cmo.allocated, viodev->cmo.entitled) -
+                              max(viodev->cmo.allocated, level);
+                       avail -= need;
+
+               }
+               if (fulfilled == devcount)
+                       break;
+               level += VIO_CMO_BALANCE_CHUNK;
+       }
+
+       /* Calculate new reserve and excess pool sizes */
+       cmo->reserve.size = cmo->min;
+       cmo->excess.free = 0;
+       cmo->excess.size = 0;
+       need = 0;
+       list_for_each_entry(dev_ent, &vio_cmo.device_list, list) {
+               viodev = dev_ent->viodev;
+               /* Calculated reserve size above the minimum entitlement */
+               if (viodev->cmo.entitled)
+                       cmo->reserve.size += (viodev->cmo.entitled -
+                                             VIO_CMO_MIN_ENT);
+               /* Calculated used excess entitlement */
+               if (viodev->cmo.allocated > viodev->cmo.entitled)
+                       need += viodev->cmo.allocated - viodev->cmo.entitled;
+       }
+       cmo->excess.size = cmo->entitled - cmo->reserve.size;
+       cmo->excess.free = cmo->excess.size - need;
+
+       cancel_delayed_work(container_of(work, struct delayed_work, work));
+       spin_unlock_irqrestore(&vio_cmo.lock, flags);
+}
+
+static void *vio_dma_iommu_alloc_coherent(struct device *dev, size_t size,
+                                          dma_addr_t *dma_handle, gfp_t flag)
+{
+       struct vio_dev *viodev = to_vio_dev(dev);
+       void *ret;
+
+       if (vio_cmo_alloc(viodev, roundup(size, IOMMU_PAGE_SIZE))) {
+               atomic_inc(&viodev->cmo.allocs_failed);
+               return NULL;
+       }
+
+       ret = dma_iommu_ops.alloc_coherent(dev, size, dma_handle, flag);
+       if (unlikely(ret == NULL)) {
+               vio_cmo_dealloc(viodev, roundup(size, IOMMU_PAGE_SIZE));
+               atomic_inc(&viodev->cmo.allocs_failed);
+       }
+
+       return ret;
+}
+
+static void vio_dma_iommu_free_coherent(struct device *dev, size_t size,
+                                        void *vaddr, dma_addr_t dma_handle)
+{
+       struct vio_dev *viodev = to_vio_dev(dev);
+
+       dma_iommu_ops.free_coherent(dev, size, vaddr, dma_handle);
+
+       vio_cmo_dealloc(viodev, roundup(size, IOMMU_PAGE_SIZE));
+}
+
+static dma_addr_t vio_dma_iommu_map_single(struct device *dev, void *vaddr,
+                                           size_t size,
+                                           enum dma_data_direction direction,
+                                           struct dma_attrs *attrs)
+{
+       struct vio_dev *viodev = to_vio_dev(dev);
+       dma_addr_t ret = DMA_ERROR_CODE;
+
+       if (vio_cmo_alloc(viodev, roundup(size, IOMMU_PAGE_SIZE))) {
+               atomic_inc(&viodev->cmo.allocs_failed);
+               return ret;
+       }
+
+       ret = dma_iommu_ops.map_single(dev, vaddr, size, direction, attrs);
+       if (unlikely(dma_mapping_error(ret))) {
+               vio_cmo_dealloc(viodev, roundup(size, IOMMU_PAGE_SIZE));
+               atomic_inc(&viodev->cmo.allocs_failed);
+       }
+
+       return ret;
+}
+
+static void vio_dma_iommu_unmap_single(struct device *dev,
+               dma_addr_t dma_handle, size_t size,
+               enum dma_data_direction direction,
+               struct dma_attrs *attrs)
+{
+       struct vio_dev *viodev = to_vio_dev(dev);
+
+       dma_iommu_ops.unmap_single(dev, dma_handle, size, direction, attrs);
+
+       vio_cmo_dealloc(viodev, roundup(size, IOMMU_PAGE_SIZE));
+}
+
+static int vio_dma_iommu_map_sg(struct device *dev, struct scatterlist *sglist,
+                                int nelems, enum dma_data_direction direction,
+                                struct dma_attrs *attrs)
+{
+       struct vio_dev *viodev = to_vio_dev(dev);
+       struct scatterlist *sgl;
+       int ret, count = 0;
+       size_t alloc_size = 0;
+
+       for (sgl = sglist; count < nelems; count++, sgl++)
+               alloc_size += roundup(sgl->length, IOMMU_PAGE_SIZE);
+
+       if (vio_cmo_alloc(viodev, alloc_size)) {
+               atomic_inc(&viodev->cmo.allocs_failed);
+               return 0;
+       }
+
+       ret = dma_iommu_ops.map_sg(dev, sglist, nelems, direction, attrs);
+
+       if (unlikely(!ret)) {
+               vio_cmo_dealloc(viodev, alloc_size);
+               atomic_inc(&viodev->cmo.allocs_failed);
+       }
+
+       for (sgl = sglist, count = 0; count < ret; count++, sgl++)
+               alloc_size -= roundup(sgl->dma_length, IOMMU_PAGE_SIZE);
+       if (alloc_size)
+               vio_cmo_dealloc(viodev, alloc_size);
+
+       return ret;
+}
+
+static void vio_dma_iommu_unmap_sg(struct device *dev,
+               struct scatterlist *sglist, int nelems,
+               enum dma_data_direction direction,
+               struct dma_attrs *attrs)
+{
+       struct vio_dev *viodev = to_vio_dev(dev);
+       struct scatterlist *sgl;
+       size_t alloc_size = 0;
+       int count = 0;
+
+       for (sgl = sglist; count < nelems; count++, sgl++)
+               alloc_size += roundup(sgl->dma_length, IOMMU_PAGE_SIZE);
+
+       dma_iommu_ops.unmap_sg(dev, sglist, nelems, direction, attrs);
+
+       vio_cmo_dealloc(viodev, alloc_size);
+}
+
+struct dma_mapping_ops vio_dma_mapping_ops = {
+       .alloc_coherent = vio_dma_iommu_alloc_coherent,
+       .free_coherent  = vio_dma_iommu_free_coherent,
+       .map_single     = vio_dma_iommu_map_single,
+       .unmap_single   = vio_dma_iommu_unmap_single,
+       .map_sg         = vio_dma_iommu_map_sg,
+       .unmap_sg       = vio_dma_iommu_unmap_sg,
+};
+
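Every entry in vio_dma_mapping_ops follows the same charge-then-delegate shape: reserve IO memory against the device's CMO entitlement, call the underlying dma_iommu_ops routine, and refund the charge (bumping allocs_failed) if the backend fails. A minimal userspace sketch of that pattern; cmo_charge, cmo_refund and backend_map are illustrative stand-ins, not kernel APIs:

    #include <stdio.h>
    #include <stddef.h>

    static size_t entitled = 1 << 20;  /* bytes this device may have mapped */
    static size_t allocated;           /* bytes currently charged           */
    static unsigned allocs_failed;     /* mirrors cmo.allocs_failed         */

    static int cmo_charge(size_t size)
    {
            if (allocated + size > entitled)
                    return -1;         /* over entitlement: fail fast */
            allocated += size;
            return 0;
    }

    static void cmo_refund(size_t size)
    {
            allocated -= size;
    }

    static int backend_map(size_t size)
    {
            return size ? 0 : -1;      /* stand-in for dma_iommu_ops */
    }

    static int map_with_accounting(size_t size)
    {
            if (cmo_charge(size)) {
                    allocs_failed++;
                    return -1;
            }
            if (backend_map(size)) {
                    cmo_refund(size);  /* roll back on backend failure */
                    allocs_failed++;
                    return -1;
            }
            return 0;
    }

    int main(void)
    {
            printf("map 4096 -> %d, allocated=%zu, failed=%u\n",
                   map_with_accounting(4096), allocated, allocs_failed);
            return 0;
    }
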
+/**
+ * vio_cmo_set_dev_desired - Set desired entitlement for a device
+ *
+ * @viodev: struct vio_dev for device to alter
+ * @desired: new desired entitlement level in bytes
+ *
+ * For use by devices to request a change to their entitlement at runtime or
+ * through sysfs.  The desired entitlement level is changed and a balancing
+ * of system resources is scheduled to run in the future.
+ */
+void vio_cmo_set_dev_desired(struct vio_dev *viodev, size_t desired)
+{
+       unsigned long flags;
+       struct vio_cmo_dev_entry *dev_ent;
+       int found = 0;
+
+       if (!firmware_has_feature(FW_FEATURE_CMO))
+               return;
+
+       spin_lock_irqsave(&vio_cmo.lock, flags);
+       if (desired < VIO_CMO_MIN_ENT)
+               desired = VIO_CMO_MIN_ENT;
+
+       /*
+        * Changes will not be made for devices not in the device list.
+        * A device that is absent from the list has no driver loaded,
+        * so it cannot receive entitlement.
+        */
+       list_for_each_entry(dev_ent, &vio_cmo.device_list, list)
+               if (viodev == dev_ent->viodev) {
+                       found = 1;
+                       break;
+               }
+       if (!found) {
+               spin_unlock_irqrestore(&vio_cmo.lock, flags);
+               return;
+       }
+
+       /* Increase/decrease in desired device entitlement */
+       if (desired >= viodev->cmo.desired) {
+               /* Just bump the bus and device values prior to a balance */
+               vio_cmo.desired += desired - viodev->cmo.desired;
+               viodev->cmo.desired = desired;
+       } else {
+               /* Decrease bus and device values for desired entitlement */
+               vio_cmo.desired -= viodev->cmo.desired - desired;
+               viodev->cmo.desired = desired;
+               /*
+                * If less entitlement is desired than current entitlement, move
+                * any reserve memory in the change region to the excess pool.
+                */
+               if (viodev->cmo.entitled > desired) {
+                       vio_cmo.reserve.size -= viodev->cmo.entitled - desired;
+                       vio_cmo.excess.size += viodev->cmo.entitled - desired;
+                       /*
+                        * If entitlement moving from the reserve pool to the
+                        * excess pool is currently unused, add to the excess
+                        * free counter.
+                        */
+                       if (viodev->cmo.allocated < viodev->cmo.entitled)
+                               vio_cmo.excess.free += viodev->cmo.entitled -
+                                                      max(viodev->cmo.allocated, desired);
+                       viodev->cmo.entitled = desired;
+               }
+       }
+       schedule_delayed_work(&vio_cmo.balance_q, 0);
+       spin_unlock_irqrestore(&vio_cmo.lock, flags);
+}
+
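When the desired entitlement drops below the current entitlement, the difference migrates from the reserve pool to the excess pool, and whatever part of it is not allocated becomes immediately free. A runnable arithmetic sketch of that decrease path, with made-up byte counts:

    #include <stdio.h>

    static unsigned long max_ul(unsigned long a, unsigned long b)
    {
            return a > b ? a : b;
    }

    int main(void)
    {
            unsigned long entitled = 30000, allocated = 12000, desired = 20000;
            unsigned long reserve = 50000, excess_size = 10000, excess_free = 4000;

            reserve -= entitled - desired;     /* reserve shrinks ...     */
            excess_size += entitled - desired; /* ... excess grows by it  */
            if (allocated < entitled)          /* unused part is free now */
                    excess_free += entitled - max_ul(allocated, desired);
            entitled = desired;

            printf("reserve=%lu excess.size=%lu excess.free=%lu entitled=%lu\n",
                   reserve, excess_size, excess_free, entitled);
            return 0;
    }
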
+/**
+ * vio_cmo_bus_probe - Handle CMO specific bus probe activities
+ *
+ * @viodev - Pointer to struct vio_dev for device
+ *
+ * Determine the device's IO memory entitlement needs, attempting
+ * to satisfy the system minimum entitlement at first and scheduling
+ * a balance operation to take care of the rest at a later time.
+ *
+ * Returns: 0 on success, -EINVAL when device doesn't support CMO, and
+ *          -ENOMEM when entitlement is not available for device or
+ *          device entry.
+ *
+ */
+static int vio_cmo_bus_probe(struct vio_dev *viodev)
+{
+       struct vio_cmo_dev_entry *dev_ent;
+       struct device *dev = &viodev->dev;
+       struct vio_driver *viodrv = to_vio_driver(dev->driver);
+       unsigned long flags;
+       size_t size;
+
+       /*
+        * Check to see that device has a DMA window and configure
+        * entitlement for the device.
+        */
+       if (of_get_property(viodev->dev.archdata.of_node,
+                           "ibm,my-dma-window", NULL)) {
+               /* Check that the driver is CMO enabled and get desired DMA */
+               if (!viodrv->get_desired_dma) {
+                       dev_err(dev, "%s: device driver does not support CMO\n",
+                               __func__);
+                       return -EINVAL;
+               }
+
+               viodev->cmo.desired = IOMMU_PAGE_ALIGN(viodrv->get_desired_dma(viodev));
+               if (viodev->cmo.desired < VIO_CMO_MIN_ENT)
+                       viodev->cmo.desired = VIO_CMO_MIN_ENT;
+               size = VIO_CMO_MIN_ENT;
+
+               dev_ent = kmalloc(sizeof(struct vio_cmo_dev_entry),
+                                 GFP_KERNEL);
+               if (!dev_ent)
+                       return -ENOMEM;
+
+               dev_ent->viodev = viodev;
+               spin_lock_irqsave(&vio_cmo.lock, flags);
+               list_add(&dev_ent->list, &vio_cmo.device_list);
+       } else {
+               viodev->cmo.desired = 0;
+               size = 0;
+               spin_lock_irqsave(&vio_cmo.lock, flags);
+       }
+
+       /*
+        * If the needs for vio_cmo.min have not changed since they
+        * were last set, the number of devices in the OF tree has
+        * been constant and the IO memory for this is already in
+        * the reserve pool.
+        */
+       if (vio_cmo.min == ((vio_cmo_num_OF_devs() + 1) *
+                           VIO_CMO_MIN_ENT)) {
+               /* Update desired entitlement if the device requires it */
+               if (size)
+                       vio_cmo.desired += (viodev->cmo.desired -
+                                       VIO_CMO_MIN_ENT);
+       } else {
+               size_t tmp;
+
+               tmp = vio_cmo.spare + vio_cmo.excess.free;
+               if (tmp < size) {
+                       dev_err(dev, "%s: insufficient free "
+                               "entitlement to add device. "
+                               "Need %lu, have %lu\n", __func__,
+                               size, tmp);
+                       spin_unlock_irqrestore(&vio_cmo.lock, flags);
+                       return -ENOMEM;
+               }
+
+               /* Use excess pool first to fulfill request */
+               tmp = min(size, vio_cmo.excess.free);
+               vio_cmo.excess.free -= tmp;
+               vio_cmo.excess.size -= tmp;
+               vio_cmo.reserve.size += tmp;
+
+               /* Use spare if excess pool was insufficient */
+               vio_cmo.spare -= size - tmp;
+
+               /* Update bus accounting */
+               vio_cmo.min += size;
+               vio_cmo.desired += viodev->cmo.desired;
+       }
+       spin_unlock_irqrestore(&vio_cmo.lock, flags);
+       return 0;
+}
+
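At probe time the new device's minimum entitlement is funded from the excess pool first and from the spare second, failing with -ENOMEM only when both together fall short. Sketch with illustrative numbers (1562 again standing in for VIO_CMO_MIN_ENT):

    #include <stdio.h>

    int main(void)
    {
            unsigned long size = 1562;           /* assumed VIO_CMO_MIN_ENT */
            unsigned long excess_free = 1000, excess_size = 1000;
            unsigned long reserve = 4686, spare = 1562;
            unsigned long tmp;

            if (spare + excess_free < size) {
                    printf("-ENOMEM\n");         /* cannot fund the device */
                    return 1;
            }

            tmp = size < excess_free ? size : excess_free;
            excess_free -= tmp;                  /* excess pool pays first */
            excess_size -= tmp;
            reserve += tmp;
            spare -= size - tmp;                 /* spare covers the rest  */

            printf("reserve=%lu excess=%lu spare=%lu\n",
                   reserve, excess_size, spare);
            return 0;
    }
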
+/**
+ * vio_cmo_bus_remove - Handle CMO specific bus removal activities
+ *
+ * @viodev - Pointer to struct vio_dev for device
+ *
+ * Remove the device from the cmo device list.  The minimum entitlement
+ * will be reserved for the device as long as it is in the system.  The
+ * rest of the entitlement the device had been allocated will be returned
+ * to the system.
+ */
+static void vio_cmo_bus_remove(struct vio_dev *viodev)
+{
+       struct vio_cmo_dev_entry *dev_ent;
+       unsigned long flags;
+       size_t tmp;
+
+       spin_lock_irqsave(&vio_cmo.lock, flags);
+       if (viodev->cmo.allocated) {
+               dev_err(&viodev->dev, "%s: device had %lu bytes of IO "
+                       "allocated after remove operation.\n",
+                       __func__, viodev->cmo.allocated);
+               BUG();
+       }
+
+       /*
+        * Remove the device from the device list being maintained for
+        * CMO enabled devices.
+        */
+       list_for_each_entry(dev_ent, &vio_cmo.device_list, list)
+               if (viodev == dev_ent->viodev) {
+                       list_del(&dev_ent->list);
+                       kfree(dev_ent);
+                       break;
+               }
+
+       /*
+        * Devices may not require any entitlement and they do not need
+        * to be processed.  Otherwise, return the device's entitlement
+        * back to the pools.
+        */
+       if (viodev->cmo.entitled) {
+               /*
+                * This device has not yet left the OF tree; its
+                * minimum entitlement remains in vio_cmo.min and
+                * vio_cmo.desired.
+                */
+               vio_cmo.desired -= (viodev->cmo.desired - VIO_CMO_MIN_ENT);
+
+               /*
+                * Save min allocation for device in reserve as long
+                * as it exists in OF tree as determined by later
+                * balance operation
+                */
+               viodev->cmo.entitled -= VIO_CMO_MIN_ENT;
+
+               /* Replenish spare from freed reserve pool */
+               if (viodev->cmo.entitled && (vio_cmo.spare < VIO_CMO_MIN_ENT)) {
+                       tmp = min(viodev->cmo.entitled, (VIO_CMO_MIN_ENT -
+                                                        vio_cmo.spare));
+                       vio_cmo.spare += tmp;
+                       viodev->cmo.entitled -= tmp;
+               }
+
+               /* Remaining reserve goes to excess pool */
+               vio_cmo.excess.size += viodev->cmo.entitled;
+               vio_cmo.excess.free += viodev->cmo.entitled;
+               vio_cmo.reserve.size -= viodev->cmo.entitled;
+
+               /*
+                * Until the device is removed it will keep a
+                * minimum entitlement; this guarantees that
+                * a module unload/load cycle will succeed.
+                */
+               viodev->cmo.entitled = VIO_CMO_MIN_ENT;
+               viodev->cmo.desired = VIO_CMO_MIN_ENT;
+               atomic_set(&viodev->cmo.allocs_failed, 0);
+       }
+
+       spin_unlock_irqrestore(&vio_cmo.lock, flags);
+}
+
+static void vio_cmo_set_dma_ops(struct vio_dev *viodev)
+{
+       vio_dma_mapping_ops.dma_supported = dma_iommu_ops.dma_supported;
+       viodev->dev.archdata.dma_ops = &vio_dma_mapping_ops;
+}
+
+/**
+ * vio_cmo_bus_init - CMO entitlement initialization at bus init time
+ *
+ * Set up the reserve and excess entitlement pools based on available
+ * system entitlement and the number of devices in the OF tree that
+ * require entitlement in the reserve pool.
+ */
+static void vio_cmo_bus_init(void)
+{
+       struct hvcall_mpp_data mpp_data;
+       int err;
+
+       memset(&vio_cmo, 0, sizeof(struct vio_cmo));
+       spin_lock_init(&vio_cmo.lock);
+       INIT_LIST_HEAD(&vio_cmo.device_list);
+       INIT_DELAYED_WORK(&vio_cmo.balance_q, vio_cmo_balance);
+
+       /* Get current system entitlement */
+       err = h_get_mpp(&mpp_data);
+
+       /*
+        * On failure, continue with entitlement set to 0; we will panic()
+        * later when the spare is reserved.
+        */
+       if (err != H_SUCCESS) {
+               printk(KERN_ERR "%s: unable to determine system IO "\
+                      "entitlement. (%d)\n", __func__, err);
+               vio_cmo.entitled = 0;
+       } else {
+               vio_cmo.entitled = mpp_data.entitled_mem;
+       }
+
+       /* Set reservation and check against entitlement */
+       vio_cmo.spare = VIO_CMO_MIN_ENT;
+       vio_cmo.reserve.size = vio_cmo.spare;
+       vio_cmo.reserve.size += (vio_cmo_num_OF_devs() *
+                                VIO_CMO_MIN_ENT);
+       if (vio_cmo.reserve.size > vio_cmo.entitled) {
+               printk(KERN_ERR "%s: insufficient system entitlement\n",
+                      __func__);
+               panic("%s: Insufficient system entitlement", __func__);
+       }
+
+       /* Set the remaining accounting variables */
+       vio_cmo.excess.size = vio_cmo.entitled - vio_cmo.reserve.size;
+       vio_cmo.excess.free = vio_cmo.excess.size;
+       vio_cmo.min = vio_cmo.reserve.size;
+       vio_cmo.desired = vio_cmo.reserve.size;
+}
+
+/* sysfs device functions and data structures for CMO */
+
+#define viodev_cmo_rd_attr(name)                                        \
+static ssize_t viodev_cmo_##name##_show(struct device *dev,             \
+                                        struct device_attribute *attr,  \
+                                         char *buf)                     \
+{                                                                       \
+       return sprintf(buf, "%lu\n", to_vio_dev(dev)->cmo.name);        \
+}
+
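For reference, viodev_cmo_rd_attr(desired), as invoked a few lines below, expands to roughly:

    static ssize_t viodev_cmo_desired_show(struct device *dev,
                                           struct device_attribute *attr,
                                           char *buf)
    {
            return sprintf(buf, "%lu\n", to_vio_dev(dev)->cmo.desired);
    }

which is then wired up as the cmo_desired sysfs attribute in vio_cmo_dev_attrs.
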
+static ssize_t viodev_cmo_allocs_failed_show(struct device *dev,
+               struct device_attribute *attr, char *buf)
+{
+       struct vio_dev *viodev = to_vio_dev(dev);
+       return sprintf(buf, "%d\n", atomic_read(&viodev->cmo.allocs_failed));
+}
+
+static ssize_t viodev_cmo_allocs_failed_reset(struct device *dev,
+               struct device_attribute *attr, const char *buf, size_t count)
+{
+       struct vio_dev *viodev = to_vio_dev(dev);
+       atomic_set(&viodev->cmo.allocs_failed, 0);
+       return count;
+}
+
+static ssize_t viodev_cmo_desired_set(struct device *dev,
+               struct device_attribute *attr, const char *buf, size_t count)
+{
+       struct vio_dev *viodev = to_vio_dev(dev);
+       size_t new_desired;
+       int ret;
+
+       ret = strict_strtoul(buf, 10, &new_desired);
+       if (ret)
+               return ret;
+
+       vio_cmo_set_dev_desired(viodev, new_desired);
+       return count;
+}
+
+viodev_cmo_rd_attr(desired);
+viodev_cmo_rd_attr(entitled);
+viodev_cmo_rd_attr(allocated);
+
+static ssize_t name_show(struct device *, struct device_attribute *, char *);
+static ssize_t devspec_show(struct device *, struct device_attribute *, char *);
+static struct device_attribute vio_cmo_dev_attrs[] = {
+       __ATTR_RO(name),
+       __ATTR_RO(devspec),
+       __ATTR(cmo_desired,       S_IWUSR|S_IRUSR|S_IWGRP|S_IRGRP|S_IROTH,
+              viodev_cmo_desired_show, viodev_cmo_desired_set),
+       __ATTR(cmo_entitled,      S_IRUGO, viodev_cmo_entitled_show,      NULL),
+       __ATTR(cmo_allocated,     S_IRUGO, viodev_cmo_allocated_show,     NULL),
+       __ATTR(cmo_allocs_failed, S_IWUSR|S_IRUSR|S_IWGRP|S_IRGRP|S_IROTH,
+              viodev_cmo_allocs_failed_show, viodev_cmo_allocs_failed_reset),
+       __ATTR_NULL
+};
+
+/* sysfs bus functions and data structures for CMO */
+
+#define viobus_cmo_rd_attr(name)                                        \
+static ssize_t                                                          \
+viobus_cmo_##name##_show(struct bus_type *bt, char *buf)                \
+{                                                                       \
+       return sprintf(buf, "%lu\n", vio_cmo.name);                     \
+}
+
+#define viobus_cmo_pool_rd_attr(name, var)                              \
+static ssize_t                                                          \
+viobus_cmo_##name##_pool_show_##var(struct bus_type *bt, char *buf)     \
+{                                                                       \
+       return sprintf(buf, "%lu\n", vio_cmo.name.var);                 \
+}
+
+static ssize_t viobus_cmo_high_reset(struct bus_type *bt, const char *buf,
+                                     size_t count)
+{
+       unsigned long flags;
+
+       spin_lock_irqsave(&vio_cmo.lock, flags);
+       vio_cmo.high = vio_cmo.curr;
+       spin_unlock_irqrestore(&vio_cmo.lock, flags);
+
+       return count;
+}
+
+viobus_cmo_rd_attr(entitled);
+viobus_cmo_pool_rd_attr(reserve, size);
+viobus_cmo_pool_rd_attr(excess, size);
+viobus_cmo_pool_rd_attr(excess, free);
+viobus_cmo_rd_attr(spare);
+viobus_cmo_rd_attr(min);
+viobus_cmo_rd_attr(desired);
+viobus_cmo_rd_attr(curr);
+viobus_cmo_rd_attr(high);
+
+static struct bus_attribute vio_cmo_bus_attrs[] = {
+       __ATTR(cmo_entitled, S_IRUGO, viobus_cmo_entitled_show, NULL),
+       __ATTR(cmo_reserve_size, S_IRUGO, viobus_cmo_reserve_pool_show_size, NULL),
+       __ATTR(cmo_excess_size, S_IRUGO, viobus_cmo_excess_pool_show_size, NULL),
+       __ATTR(cmo_excess_free, S_IRUGO, viobus_cmo_excess_pool_show_free, NULL),
+       __ATTR(cmo_spare,   S_IRUGO, viobus_cmo_spare_show,   NULL),
+       __ATTR(cmo_min,     S_IRUGO, viobus_cmo_min_show,     NULL),
+       __ATTR(cmo_desired, S_IRUGO, viobus_cmo_desired_show, NULL),
+       __ATTR(cmo_curr,    S_IRUGO, viobus_cmo_curr_show,    NULL),
+       __ATTR(cmo_high,    S_IWUSR|S_IRUSR|S_IWGRP|S_IRGRP|S_IROTH,
+              viobus_cmo_high_show, viobus_cmo_high_reset),
+       __ATTR_NULL
+};
+
+static void vio_cmo_sysfs_init(void)
+{
+       vio_bus_type.dev_attrs = vio_cmo_dev_attrs;
+       vio_bus_type.bus_attrs = vio_cmo_bus_attrs;
+}
+#else /* CONFIG_PPC_SMLPAR */
+/* Dummy functions for builds without CMO support (e.g. iSeries) */
+int vio_cmo_entitlement_update(size_t new_entitlement) { return 0; }
+void vio_cmo_set_dev_desired(struct vio_dev *viodev, size_t desired) {}
+static int vio_cmo_bus_probe(struct vio_dev *viodev) { return 0; }
+static void vio_cmo_bus_remove(struct vio_dev *viodev) {}
+static void vio_cmo_set_dma_ops(struct vio_dev *viodev) {}
+static void vio_cmo_bus_init(void) {}
+static void vio_cmo_sysfs_init(void) {}
+#endif /* CONFIG_PPC_SMLPAR */
+EXPORT_SYMBOL(vio_cmo_entitlement_update);
+EXPORT_SYMBOL(vio_cmo_set_dev_desired);
+
 static struct iommu_table *vio_build_iommu_table(struct vio_dev *dev)
 {
        const unsigned char *dma_window;
@@ -114,8 +1105,17 @@ static int vio_bus_probe(struct device *dev)
                return error;
 
        id = vio_match_device(viodrv->id_table, viodev);
-       if (id)
+       if (id) {
+               memset(&viodev->cmo, 0, sizeof(viodev->cmo));
+               if (firmware_has_feature(FW_FEATURE_CMO)) {
+                       error = vio_cmo_bus_probe(viodev);
+                       if (error)
+                               return error;
+               }
                error = viodrv->probe(viodev, id);
+               if (error)
+                       vio_cmo_bus_remove(viodev);
+       }
 
        return error;
 }
@@ -125,12 +1125,23 @@ static int vio_bus_remove(struct device *dev)
 {
        struct vio_dev *viodev = to_vio_dev(dev);
        struct vio_driver *viodrv = to_vio_driver(dev->driver);
+       struct device *devptr;
+       int ret = 1;
+
+       /*
+        * Hold a reference to the device after the remove function is called
+        * to allow for CMO accounting cleanup for the device.
+        */
+       devptr = get_device(dev);
 
        if (viodrv->remove)
-               return viodrv->remove(viodev);
+               ret = viodrv->remove(viodev);
+
+       if (!ret && firmware_has_feature(FW_FEATURE_CMO))
+               vio_cmo_bus_remove(viodev);
 
-       /* driver can't remove */
-       return 1;
+       put_device(devptr);
+       return ret;
 }
 
 /**
@@ -215,7 +1226,11 @@ struct vio_dev *vio_register_device_node(struct device_node *of_node)
                        viodev->unit_address = *unit_address;
        }
        viodev->dev.archdata.of_node = of_node_get(of_node);
-       viodev->dev.archdata.dma_ops = &dma_iommu_ops;
+
+       if (firmware_has_feature(FW_FEATURE_CMO))
+               vio_cmo_set_dma_ops(viodev);
+       else
+               viodev->dev.archdata.dma_ops = &dma_iommu_ops;
        viodev->dev.archdata.dma_data = vio_build_iommu_table(viodev);
        viodev->dev.archdata.numa_node = of_node_to_nid(of_node);
 
@@ -245,6 +1260,9 @@ static int __init vio_bus_init(void)
        int err;
        struct device_node *node_vroot;
 
+       if (firmware_has_feature(FW_FEATURE_CMO))
+               vio_cmo_sysfs_init();
+
        err = bus_register(&vio_bus_type);
        if (err) {
                printk(KERN_ERR "failed to register VIO bus\n");
@@ -262,6 +1280,9 @@ static int __init vio_bus_init(void)
                return err;
        }
 
+       if (firmware_has_feature(FW_FEATURE_CMO))
+               vio_cmo_bus_init();
+
        node_vroot = of_find_node_by_name(NULL, "vdevice");
        if (node_vroot) {
                struct device_node *of_node;
index a914411..4a8ce62 100644 (file)
@@ -85,7 +85,7 @@ SECTIONS
 
        /* The dummy segment contents for the bug workaround mentioned above
           near PHDRS.  */
-       .dummy : {
+       .dummy : AT(ADDR(.dummy) - LOAD_OFFSET) {
                LONG(0xf177)
        } :kernel :dummy
 
index 1707d00..565b7a2 100644 (file)
@@ -100,31 +100,6 @@ static int store_updates_sp(struct pt_regs *regs)
        return 0;
 }
 
-#if !(defined(CONFIG_4xx) || defined(CONFIG_BOOKE))
-static void do_dabr(struct pt_regs *regs, unsigned long address,
-                   unsigned long error_code)
-{
-       siginfo_t info;
-
-       if (notify_die(DIE_DABR_MATCH, "dabr_match", regs, error_code,
-                       11, SIGSEGV) == NOTIFY_STOP)
-               return;
-
-       if (debugger_dabr_match(regs))
-               return;
-
-       /* Clear the DABR */
-       set_dabr(0);
-
-       /* Deliver the signal to userspace */
-       info.si_signo = SIGTRAP;
-       info.si_errno = 0;
-       info.si_code = TRAP_HWBKPT;
-       info.si_addr = (void __user *)address;
-       force_sig_info(SIGTRAP, &info, current);
-}
-#endif /* !(CONFIG_4xx || CONFIG_BOOKE)*/
-
 /*
  * For 600- and 800-family processors, the error_code parameter is DSISR
  * for a data fault, SRR1 for an instruction fault. For 400-family processors
index ccbd495..696a5ee 100644 (file)
@@ -1,7 +1,6 @@
 config PPC_MPC52xx
        bool "52xx-based boards"
        depends on PPC_MULTIPLATFORM && PPC32
-       select FSL_SOC
        select PPC_CLOCK
        select PPC_PCI_CHOICE
 
@@ -49,5 +48,6 @@ config PPC_MPC5200_GPIO
        bool "MPC5200 GPIO support"
        depends on PPC_MPC52xx
        select ARCH_REQUIRE_GPIOLIB
+       select GENERIC_GPIO
        help
          Enable gpiolib support for mpc5200 based boards
index 208005c..e06420a 100644 (file)
@@ -172,7 +172,7 @@ static void invalidate_tce_cache(struct cbe_iommu *iommu, unsigned long *pte,
        }
 }
 
-static void tce_build_cell(struct iommu_table *tbl, long index, long npages,
+static int tce_build_cell(struct iommu_table *tbl, long index, long npages,
                unsigned long uaddr, enum dma_data_direction direction,
                struct dma_attrs *attrs)
 {
@@ -213,6 +213,7 @@ static void tce_build_cell(struct iommu_table *tbl, long index, long npages,
 
        pr_debug("tce_build_cell(index=%lx,n=%lx,dir=%d,base_pte=%lx)\n",
                 index, npages, direction, base_pte);
+       return 0;
 }
 
 static void tce_free_cell(struct iommu_table *tbl, long index, long npages)
@@ -1150,12 +1151,23 @@ static int iommu_fixed_disabled;
 
 static int __init setup_iommu_fixed(char *str)
 {
+       struct device_node *pciep;
+
        if (strcmp(str, "off") == 0)
                iommu_fixed_disabled = 1;
 
-       else if (strcmp(str, "weak") == 0)
+       /* If we can find a pcie-endpoint in the device tree, assume
+        * we're on a triblade or a CAB, so by default the fixed mapping
+        * should be weakly ordered; but only if the boot option wasn't
+        * set for strong ordering.
+        */
+       pciep = of_find_node_by_type(NULL, "pcie-endpoint");
+
+       if (strcmp(str, "weak") == 0 || (pciep && strcmp(str, "strong") != 0))
                iommu_fixed_is_weak = 1;
 
+       of_node_put(pciep);
+
        return 1;
 }
 __setup("iommu_fixed=", setup_iommu_fixed);
index 3465474..2deeeba 100644 (file)
@@ -312,11 +312,28 @@ static struct spu *aff_ref_location(struct spu_context *ctx, int mem_aff,
         */
        node = cpu_to_node(raw_smp_processor_id());
        for (n = 0; n < MAX_NUMNODES; n++, node++) {
+               int available_spus;
+
                node = (node < MAX_NUMNODES) ? node : 0;
                if (!node_allowed(ctx, node))
                        continue;
+
+               available_spus = 0;
                mutex_lock(&cbe_spu_info[node].list_mutex);
                list_for_each_entry(spu, &cbe_spu_info[node].spus, cbe_list) {
+                       if (spu->ctx && spu->ctx->gang
+                                       && spu->ctx->aff_offset == 0)
+                               available_spus -=
+                                       (spu->ctx->gang->contexts - 1);
+                       else
+                               available_spus++;
+               }
+               if (available_spus < ctx->gang->contexts) {
+                       mutex_unlock(&cbe_spu_info[node].list_mutex);
+                       continue;
+               }
+
+               list_for_each_entry(spu, &cbe_spu_info[node].spus, cbe_list) {
                        if ((!mem_aff || spu->has_mem_affinity) &&
                                                        sched_spu(spu)) {
                                mutex_unlock(&cbe_spu_info[node].list_mutex);
@@ -389,6 +406,9 @@ static int has_affinity(struct spu_context *ctx)
        if (list_empty(&ctx->aff_list))
                return 0;
 
+       if (atomic_read(&ctx->gang->aff_sched_count) == 0)
+               ctx->gang->aff_ref_spu = NULL;
+
        if (!gang->aff_ref_spu) {
                if (!(gang->aff_flags & AFF_MERGED))
                        aff_merge_remaining_ctxs(gang);
@@ -416,14 +436,8 @@ static void spu_unbind_context(struct spu *spu, struct spu_context *ctx)
        if (spu->ctx->flags & SPU_CREATE_NOSCHED)
                atomic_dec(&cbe_spu_info[spu->node].reserved_spus);
 
-       if (ctx->gang){
-               mutex_lock(&ctx->gang->aff_mutex);
-               if (has_affinity(ctx)) {
-                       if (atomic_dec_and_test(&ctx->gang->aff_sched_count))
-                               ctx->gang->aff_ref_spu = NULL;
-               }
-               mutex_unlock(&ctx->gang->aff_mutex);
-       }
+       if (ctx->gang)
+               atomic_dec_if_positive(&ctx->gang->aff_sched_count);
 
        spu_switch_notify(spu, NULL);
        spu_unmap_mappings(ctx);
@@ -562,10 +576,7 @@ static struct spu *spu_get_idle(struct spu_context *ctx)
                                goto found;
                        mutex_unlock(&cbe_spu_info[node].list_mutex);
 
-                       mutex_lock(&ctx->gang->aff_mutex);
-                       if (atomic_dec_and_test(&ctx->gang->aff_sched_count))
-                               ctx->gang->aff_ref_spu = NULL;
-                       mutex_unlock(&ctx->gang->aff_mutex);
+                       atomic_dec(&ctx->gang->aff_sched_count);
                        goto not_found;
                }
                mutex_unlock(&ctx->gang->aff_mutex);
index 8c0e957..92d20e9 100644 (file)
@@ -196,8 +196,7 @@ static int __init sputrace_init(void)
        struct proc_dir_entry *entry;
        int i, error = -ENOMEM;
 
-       sputrace_log = kcalloc(sizeof(struct sputrace),
-                               bufsize, GFP_KERNEL);
+       sputrace_log = kcalloc(bufsize, sizeof(struct sputrace), GFP_KERNEL);
        if (!sputrace_log)
                goto out;
 
index bc818e4..bb464d1 100644 (file)
@@ -41,7 +41,7 @@
 #include <asm/iseries/hv_call_event.h>
 #include <asm/iseries/iommu.h>
 
-static void tce_build_iSeries(struct iommu_table *tbl, long index, long npages,
+static int tce_build_iSeries(struct iommu_table *tbl, long index, long npages,
                unsigned long uaddr, enum dma_data_direction direction,
                struct dma_attrs *attrs)
 {
@@ -71,6 +71,7 @@ static void tce_build_iSeries(struct iommu_table *tbl, long index, long npages,
                index++;
                uaddr += TCE_PAGE_SIZE;
        }
+       return 0;
 }
 
 static void tce_free_iSeries(struct iommu_table *tbl, long index, long npages)
index 70541b7..a0ff03a 100644 (file)
@@ -83,7 +83,7 @@ static u32 *iob_l2_base;
 static struct iommu_table iommu_table_iobmap;
 static int iommu_table_iobmap_inited;
 
-static void iobmap_build(struct iommu_table *tbl, long index,
+static int iobmap_build(struct iommu_table *tbl, long index,
                         long npages, unsigned long uaddr,
                         enum dma_data_direction direction,
                         struct dma_attrs *attrs)
@@ -108,6 +108,7 @@ static void iobmap_build(struct iommu_table *tbl, long index,
                uaddr += IOBMAP_PAGE_SIZE;
                bus_addr += IOBMAP_PAGE_SIZE;
        }
+       return 0;
 }
 
 
index 757c029..97619fd 100644 (file)
@@ -40,3 +40,26 @@ config PPC_PSERIES_DEBUG
        depends on PPC_PSERIES && PPC_EARLY_DEBUG
        bool "Enable extra debug logging in platforms/pseries"
        default y
+
+config PPC_SMLPAR
+       bool "Support for shared-memory logical partitions"
+       depends on PPC_PSERIES
+       select LPARCFG
+       default n
+       help
+         Select this option to enable shared memory partition support.
+         With this option a system running in an LPAR can be given more
+         memory than is physically available, allowing firmware to
+         balance memory across many LPARs.
+
+config CMM
+       tristate "Collaborative memory management"
+       depends on PPC_SMLPAR
+       default y
+       help
+         Select this option if you want to enable the kernel interface
+         to reduce the memory size of the system. This is accomplished
+         by allocating pages of memory and putting them "on hold". This
+         only makes sense for a system running in an LPAR where the
+         unused pages will be reused for other LPARs. The interface
+         allows firmware to balance memory across many LPARs.
index 554c6e4..dfe574a 100644 (file)
@@ -24,3 +24,4 @@ obj-$(CONFIG_HVC_CONSOLE)     += hvconsole.o
 obj-$(CONFIG_HVCS)             += hvcserver.o
 obj-$(CONFIG_HCALL_STATS)      += hvCall_inst.o
 obj-$(CONFIG_PHYP_DUMP)        += phyp_dump.o
+obj-$(CONFIG_CMM)              += cmm.o
diff --git a/arch/powerpc/platforms/pseries/cmm.c b/arch/powerpc/platforms/pseries/cmm.c
new file mode 100644 (file)
index 0000000..c6b3be0
--- /dev/null
@@ -0,0 +1,468 @@
+/*
+ * Collaborative memory management interface.
+ *
+ * Copyright (C) 2008 IBM Corporation
+ * Author(s): Brian King (brking@linux.vnet.ibm.com),
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ *
+ */
+
+#include <linux/ctype.h>
+#include <linux/delay.h>
+#include <linux/errno.h>
+#include <linux/fs.h>
+#include <linux/init.h>
+#include <linux/kthread.h>
+#include <linux/module.h>
+#include <linux/oom.h>
+#include <linux/sched.h>
+#include <linux/stringify.h>
+#include <linux/swap.h>
+#include <linux/sysdev.h>
+#include <asm/firmware.h>
+#include <asm/hvcall.h>
+#include <asm/mmu.h>
+#include <asm/pgalloc.h>
+#include <asm/uaccess.h>
+
+#include "plpar_wrappers.h"
+
+#define CMM_DRIVER_VERSION     "1.0.0"
+#define CMM_DEFAULT_DELAY      1
+#define CMM_DEBUG                      0
+#define CMM_DISABLE            0
+#define CMM_OOM_KB             1024
+#define CMM_MIN_MEM_MB         256
+#define KB2PAGES(_p)           ((_p)>>(PAGE_SHIFT-10))
+#define PAGES2KB(_p)           ((_p)<<(PAGE_SHIFT-10))
+
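A quick sanity check of the unit-conversion macros above, assuming 4K pages (PAGE_SHIFT = 12); standalone userspace C:

    #include <stdio.h>

    #define PAGE_SHIFT 12                       /* assumed 4K pages */
    #define KB2PAGES(_p) ((_p) >> (PAGE_SHIFT - 10))
    #define PAGES2KB(_p) ((_p) << (PAGE_SHIFT - 10))

    int main(void)
    {
            printf("%d pages\n", KB2PAGES(1024)); /* 1024 KB -> 256 pages */
            printf("%d KB\n", PAGES2KB(256));     /* 256 pages -> 1024 KB */
            return 0;
    }
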
+static unsigned int delay = CMM_DEFAULT_DELAY;
+static unsigned int oom_kb = CMM_OOM_KB;
+static unsigned int cmm_debug = CMM_DEBUG;
+static unsigned int cmm_disabled = CMM_DISABLE;
+static unsigned long min_mem_mb = CMM_MIN_MEM_MB;
+static struct sys_device cmm_sysdev;
+
+MODULE_AUTHOR("Brian King <brking@linux.vnet.ibm.com>");
+MODULE_DESCRIPTION("IBM System p Collaborative Memory Manager");
+MODULE_LICENSE("GPL");
+MODULE_VERSION(CMM_DRIVER_VERSION);
+
+module_param_named(delay, delay, uint, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(delay, "Delay (in seconds) between polls to query hypervisor paging requests. "
+                "[Default=" __stringify(CMM_DEFAULT_DELAY) "]");
+module_param_named(oom_kb, oom_kb, uint, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(oom_kb, "Amount of memory in kb to free on OOM. "
+                "[Default=" __stringify(CMM_OOM_KB) "]");
+module_param_named(min_mem_mb, min_mem_mb, ulong, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(min_mem_mb, "Minimum amount of memory (in MB) to not balloon. "
+                "[Default=" __stringify(CMM_MIN_MEM_MB) "]");
+module_param_named(debug, cmm_debug, uint, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(debug, "Enable module debugging logging. Set to 1 to enable. "
+                "[Default=" __stringify(CMM_DEBUG) "]");
+
+#define CMM_NR_PAGES ((PAGE_SIZE - sizeof(void *) - sizeof(unsigned long)) / sizeof(unsigned long))
+
+#define cmm_dbg(...) if (cmm_debug) { printk(KERN_INFO "cmm: "__VA_ARGS__); }
+
+struct cmm_page_array {
+       struct cmm_page_array *next;
+       unsigned long index;
+       unsigned long page[CMM_NR_PAGES];
+};
+
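CMM_NR_PAGES is sized so that one tracking structure (next pointer, index, and the page array) fills exactly one page. Assuming a 64-bit build with 4K pages, that works out to 510 slots per tracking page:

    #include <stdio.h>

    int main(void)
    {
            unsigned long page_size = 4096;   /* assumed PAGE_SIZE */
            unsigned long hdr = sizeof(void *) + sizeof(unsigned long);

            printf("%lu slots per tracking page\n",
                   (unsigned long)((page_size - hdr) / sizeof(unsigned long)));
            return 0;
    }
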
+static unsigned long loaned_pages;
+static unsigned long loaned_pages_target;
+static unsigned long oom_freed_pages;
+
+static struct cmm_page_array *cmm_page_list;
+static DEFINE_SPINLOCK(cmm_lock);
+
+static struct task_struct *cmm_thread_ptr;
+
+/**
+ * cmm_alloc_pages - Allocate pages and mark them as loaned
+ * @nr:        number of pages to allocate
+ *
+ * Return value:
+ *     number of pages requested to be allocated which were not
+ **/
+static long cmm_alloc_pages(long nr)
+{
+       struct cmm_page_array *pa, *npa;
+       unsigned long addr;
+       long rc;
+
+       cmm_dbg("Begin request for %ld pages\n", nr);
+
+       while (nr) {
+               addr = __get_free_page(GFP_NOIO | __GFP_NOWARN |
+                                      __GFP_NORETRY | __GFP_NOMEMALLOC);
+               if (!addr)
+                       break;
+               spin_lock(&cmm_lock);
+               pa = cmm_page_list;
+               if (!pa || pa->index >= CMM_NR_PAGES) {
+                       /* Need a new page for the page list. */
+                       spin_unlock(&cmm_lock);
+                       npa = (struct cmm_page_array *)__get_free_page(GFP_NOIO | __GFP_NOWARN |
+                                                                      __GFP_NORETRY | __GFP_NOMEMALLOC);
+                       if (!npa) {
+                               pr_info("%s: Can not allocate new page list\n", __FUNCTION__);
+                               free_page(addr);
+                               break;
+                       }
+                       spin_lock(&cmm_lock);
+                       pa = cmm_page_list;
+
+                       if (!pa || pa->index >= CMM_NR_PAGES) {
+                               npa->next = pa;
+                               npa->index = 0;
+                               pa = npa;
+                               cmm_page_list = pa;
+                       } else
+                               free_page((unsigned long) npa);
+               }
+
+               if ((rc = plpar_page_set_loaned(__pa(addr)))) {
+                       pr_err("%s: Can not set page to loaned. rc=%ld\n", __FUNCTION__, rc);
+                       spin_unlock(&cmm_lock);
+                       free_page(addr);
+                       break;
+               }
+
+               pa->page[pa->index++] = addr;
+               loaned_pages++;
+               totalram_pages--;
+               spin_unlock(&cmm_lock);
+               nr--;
+       }
+
+       cmm_dbg("End request with %ld pages unfulfilled\n", nr);
+       return nr;
+}
+
+/**
+ * cmm_free_pages - Free pages and mark them as active
+ * @nr:        number of pages to free
+ *
+ * Return value:
+ *     the number of requested pages that could not be freed
+ **/
+static long cmm_free_pages(long nr)
+{
+       struct cmm_page_array *pa;
+       unsigned long addr;
+
+       cmm_dbg("Begin free of %ld pages.\n", nr);
+       spin_lock(&cmm_lock);
+       pa = cmm_page_list;
+       while (nr) {
+               if (!pa || pa->index <= 0)
+                       break;
+               addr = pa->page[--pa->index];
+
+               if (pa->index == 0) {
+                       pa = pa->next;
+                       free_page((unsigned long) cmm_page_list);
+                       cmm_page_list = pa;
+               }
+
+               plpar_page_set_active(__pa(addr));
+               free_page(addr);
+               loaned_pages--;
+               nr--;
+               totalram_pages++;
+       }
+       spin_unlock(&cmm_lock);
+       cmm_dbg("End request with %ld pages unfulfilled\n", nr);
+       return nr;
+}
+
+/**
+ * cmm_oom_notify - OOM notifier
+ * @self:      notifier block struct
+ * @dummy:     not used
+ * @parm:      returned - number of pages freed
+ *
+ * Return value:
+ *     NOTIFY_OK
+ **/
+static int cmm_oom_notify(struct notifier_block *self,
+                         unsigned long dummy, void *parm)
+{
+       unsigned long *freed = parm;
+       long nr = KB2PAGES(oom_kb);
+
+       cmm_dbg("OOM processing started\n");
+       nr = cmm_free_pages(nr);
+       loaned_pages_target = loaned_pages;
+       *freed += KB2PAGES(oom_kb) - nr;
+       oom_freed_pages += KB2PAGES(oom_kb) - nr;
+       cmm_dbg("OOM processing complete\n");
+       return NOTIFY_OK;
+}
+
+/**
+ * cmm_get_mpp - Read memory performance parameters
+ *
+ * Makes hcall to query the current page loan request from the hypervisor.
+ *
+ * Return value:
+ *     nothing
+ **/
+static void cmm_get_mpp(void)
+{
+       int rc;
+       struct hvcall_mpp_data mpp_data;
+       unsigned long active_pages_target;
+       signed long page_loan_request;
+
+       rc = h_get_mpp(&mpp_data);
+
+       if (rc != H_SUCCESS)
+               return;
+
+       page_loan_request = div_s64((s64)mpp_data.loan_request, PAGE_SIZE);
+       loaned_pages_target = page_loan_request + loaned_pages;
+       if (loaned_pages_target > oom_freed_pages)
+               loaned_pages_target -= oom_freed_pages;
+       else
+               loaned_pages_target = 0;
+
+       active_pages_target = totalram_pages + loaned_pages - loaned_pages_target;
+
+       if ((min_mem_mb * 1024 * 1024) > (active_pages_target * PAGE_SIZE))
+               loaned_pages_target = totalram_pages + loaned_pages -
+                       ((min_mem_mb * 1024 * 1024) / PAGE_SIZE);
+
+       cmm_dbg("delta = %ld, loaned = %lu, target = %lu, oom = %lu, totalram = %lu\n",
+               page_loan_request, loaned_pages, loaned_pages_target,
+               oom_freed_pages, totalram_pages);
+}
+
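A worked example of the loan-target arithmetic above, with illustrative numbers (4K pages, 1 GB of RAM): an 8 MB hypervisor loan request on top of 1000 already-loaned pages yields a target of 3048 pages, and the min_mem_mb floor only bites if that would push active memory below 256 MB. Standalone C:

    #include <stdio.h>

    int main(void)
    {
            unsigned long page_size = 4096;
            long loan_request = 8 * 1024 * 1024; /* hypervisor asks for 8 MB */
            unsigned long loaned = 1000, oom_freed = 0, totalram = 262144;
            unsigned long min_mem_mb = 256, target, active_target;

            target = loan_request / page_size + loaned;          /* 3048 */
            target = target > oom_freed ? target - oom_freed : 0;

            /* never balloon below the min_mem_mb floor */
            active_target = totalram + loaned - target;
            if (min_mem_mb * 1024 * 1024 > active_target * page_size)
                    target = totalram + loaned -
                             min_mem_mb * 1024 * 1024 / page_size;

            printf("loan target = %lu pages\n", target);
            return 0;
    }
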
+static struct notifier_block cmm_oom_nb = {
+       .notifier_call = cmm_oom_notify
+};
+
+/**
+ * cmm_thread - CMM task thread
+ * @dummy:     not used
+ *
+ * Return value:
+ *     0
+ **/
+static int cmm_thread(void *dummy)
+{
+       unsigned long timeleft;
+
+       while (1) {
+               timeleft = msleep_interruptible(delay * 1000);
+
+               if (kthread_should_stop() || timeleft) {
+                       loaned_pages_target = loaned_pages;
+                       break;
+               }
+
+               cmm_get_mpp();
+
+               if (loaned_pages_target > loaned_pages) {
+                       if (cmm_alloc_pages(loaned_pages_target - loaned_pages))
+                               loaned_pages_target = loaned_pages;
+               } else if (loaned_pages_target < loaned_pages)
+                       cmm_free_pages(loaned_pages - loaned_pages_target);
+       }
+       return 0;
+}
+
+#define CMM_SHOW(name, format, args...)                        \
+       static ssize_t show_##name(struct sys_device *dev, char *buf)   \
+       {                                                       \
+               return sprintf(buf, format, ##args);            \
+       }                                                       \
+       static SYSDEV_ATTR(name, S_IRUGO, show_##name, NULL)
+
+CMM_SHOW(loaned_kb, "%lu\n", PAGES2KB(loaned_pages));
+CMM_SHOW(loaned_target_kb, "%lu\n", PAGES2KB(loaned_pages_target));
+
+static ssize_t show_oom_pages(struct sys_device *dev, char *buf)
+{
+       return sprintf(buf, "%lu\n", PAGES2KB(oom_freed_pages));
+}
+
+static ssize_t store_oom_pages(struct sys_device *dev,
+                              const char *buf, size_t count)
+{
+       unsigned long val = simple_strtoul(buf, NULL, 10);
+
+       if (!capable(CAP_SYS_ADMIN))
+               return -EPERM;
+       if (val != 0)
+               return -EBADMSG;
+
+       oom_freed_pages = 0;
+       return count;
+}
+
+static SYSDEV_ATTR(oom_freed_kb, S_IWUSR | S_IRUGO,
+                  show_oom_pages, store_oom_pages);
+
+static struct sysdev_attribute *cmm_attrs[] = {
+       &attr_loaned_kb,
+       &attr_loaned_target_kb,
+       &attr_oom_freed_kb,
+};
+
+static struct sysdev_class cmm_sysdev_class = {
+       .name = "cmm",
+};
+
+/**
+ * cmm_sysfs_register - Register with sysfs
+ *
+ * Return value:
+ *     0 on success / other on failure
+ **/
+static int cmm_sysfs_register(struct sys_device *sysdev)
+{
+       int i, rc;
+
+       if ((rc = sysdev_class_register(&cmm_sysdev_class)))
+               return rc;
+
+       sysdev->id = 0;
+       sysdev->cls = &cmm_sysdev_class;
+
+       if ((rc = sysdev_register(sysdev)))
+               goto class_unregister;
+
+       for (i = 0; i < ARRAY_SIZE(cmm_attrs); i++) {
+               if ((rc = sysdev_create_file(sysdev, cmm_attrs[i])))
+                       goto fail;
+       }
+
+       return 0;
+
+fail:
+       while (--i >= 0)
+               sysdev_remove_file(sysdev, cmm_attrs[i]);
+       sysdev_unregister(sysdev);
+class_unregister:
+       sysdev_class_unregister(&cmm_sysdev_class);
+       return rc;
+}
+
+/**
+ * cmm_unregister_sysfs - Unregister from sysfs
+ *
+ **/
+static void cmm_unregister_sysfs(struct sys_device *sysdev)
+{
+       int i;
+
+       for (i = 0; i < ARRAY_SIZE(cmm_attrs); i++)
+               sysdev_remove_file(sysdev, cmm_attrs[i]);
+       sysdev_unregister(sysdev);
+       sysdev_class_unregister(&cmm_sysdev_class);
+}
+
+/**
+ * cmm_init - Module initialization
+ *
+ * Return value:
+ *     0 on success / other on failure
+ **/
+static int cmm_init(void)
+{
+       int rc = -ENOMEM;
+
+       if (!firmware_has_feature(FW_FEATURE_CMO))
+               return -EOPNOTSUPP;
+
+       if ((rc = register_oom_notifier(&cmm_oom_nb)) < 0)
+               return rc;
+
+       if ((rc = cmm_sysfs_register(&cmm_sysdev)))
+               goto out_oom_notifier;
+
+       if (cmm_disabled)
+               return rc;
+
+       cmm_thread_ptr = kthread_run(cmm_thread, NULL, "cmmthread");
+       if (IS_ERR(cmm_thread_ptr)) {
+               rc = PTR_ERR(cmm_thread_ptr);
+               goto out_unregister_sysfs;
+       }
+
+       return rc;
+
+out_unregister_sysfs:
+       cmm_unregister_sysfs(&cmm_sysdev);
+out_oom_notifier:
+       unregister_oom_notifier(&cmm_oom_nb);
+       return rc;
+}
+
+/**
+ * cmm_exit - Module exit
+ *
+ * Return value:
+ *     nothing
+ **/
+static void cmm_exit(void)
+{
+       if (cmm_thread_ptr)
+               kthread_stop(cmm_thread_ptr);
+       unregister_oom_notifier(&cmm_oom_nb);
+       cmm_free_pages(loaned_pages);
+       cmm_unregister_sysfs(&cmm_sysdev);
+}
+
+/**
+ * cmm_set_disable - Disable/Enable CMM
+ *
+ * Return value:
+ *     0 on success / other on failure
+ **/
+static int cmm_set_disable(const char *val, struct kernel_param *kp)
+{
+       int disable = simple_strtoul(val, NULL, 10);
+
+       if (disable != 0 && disable != 1)
+               return -EINVAL;
+
+       if (disable && !cmm_disabled) {
+               if (cmm_thread_ptr)
+                       kthread_stop(cmm_thread_ptr);
+               cmm_thread_ptr = NULL;
+               cmm_free_pages(loaned_pages);
+       } else if (!disable && cmm_disabled) {
+               cmm_thread_ptr = kthread_run(cmm_thread, NULL, "cmmthread");
+               if (IS_ERR(cmm_thread_ptr))
+                       return PTR_ERR(cmm_thread_ptr);
+       }
+
+       cmm_disabled = disable;
+       return 0;
+}
+
+module_param_call(disable, cmm_set_disable, param_get_uint,
+                 &cmm_disabled, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(disable, "Disable CMM. Set to 1 to disable. "
+                "[Default=" __stringify(CMM_DISABLE) "]");
+
+module_init(cmm_init);
+module_exit(cmm_exit);
index 5377dd4..a8c4466 100644 (file)
@@ -48,7 +48,7 @@
 #include "plpar_wrappers.h"
 
 
-static void tce_build_pSeries(struct iommu_table *tbl, long index,
+static int tce_build_pSeries(struct iommu_table *tbl, long index,
                              long npages, unsigned long uaddr,
                              enum dma_data_direction direction,
                              struct dma_attrs *attrs)
@@ -72,6 +72,7 @@ static void tce_build_pSeries(struct iommu_table *tbl, long index,
                uaddr += TCE_PAGE_SIZE;
                tcep++;
        }
+       return 0;
 }
 
 
@@ -94,14 +95,19 @@ static unsigned long tce_get_pseries(struct iommu_table *tbl, long index)
        return *tcep;
 }
 
-static void tce_build_pSeriesLP(struct iommu_table *tbl, long tcenum,
+static void tce_free_pSeriesLP(struct iommu_table*, long, long);
+static void tce_freemulti_pSeriesLP(struct iommu_table*, long, long);
+
+static int tce_build_pSeriesLP(struct iommu_table *tbl, long tcenum,
                                long npages, unsigned long uaddr,
                                enum dma_data_direction direction,
                                struct dma_attrs *attrs)
 {
-       u64 rc;
+       u64 rc = 0;
        u64 proto_tce, tce;
        u64 rpn;
+       int ret = 0;
+       long tcenum_start = tcenum, npages_start = npages;
 
        rpn = (virt_to_abs(uaddr)) >> TCE_SHIFT;
        proto_tce = TCE_PCI_READ;
@@ -112,6 +118,13 @@ static void tce_build_pSeriesLP(struct iommu_table *tbl, long tcenum,
                tce = proto_tce | (rpn & TCE_RPN_MASK) << TCE_RPN_SHIFT;
                rc = plpar_tce_put((u64)tbl->it_index, (u64)tcenum << 12, tce);
 
+               if (unlikely(rc == H_NOT_ENOUGH_RESOURCES)) {
+                       ret = (int)rc;
+                       tce_free_pSeriesLP(tbl, tcenum_start,
+                                          (npages_start - (npages + 1)));
+                       break;
+               }
+
                if (rc && printk_ratelimit()) {
                        printk("tce_build_pSeriesLP: plpar_tce_put failed. rc=%ld\n", rc);
                        printk("\tindex   = 0x%lx\n", (u64)tbl->it_index);
@@ -123,25 +136,27 @@ static void tce_build_pSeriesLP(struct iommu_table *tbl, long tcenum,
                tcenum++;
                rpn++;
        }
+       return ret;
 }
 
 static DEFINE_PER_CPU(u64 *, tce_page) = NULL;
 
-static void tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum,
+static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum,
                                     long npages, unsigned long uaddr,
                                     enum dma_data_direction direction,
                                     struct dma_attrs *attrs)
 {
-       u64 rc;
+       u64 rc = 0;
        u64 proto_tce;
        u64 *tcep;
        u64 rpn;
        long l, limit;
+       long tcenum_start = tcenum, npages_start = npages;
+       int ret = 0;
 
        if (npages == 1) {
-               tce_build_pSeriesLP(tbl, tcenum, npages, uaddr,
-                                   direction, attrs);
-               return;
+               return tce_build_pSeriesLP(tbl, tcenum, npages, uaddr,
+                                          direction, attrs);
        }
 
        tcep = __get_cpu_var(tce_page);
@@ -153,9 +168,8 @@ static void tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum,
                tcep = (u64 *)__get_free_page(GFP_ATOMIC);
                /* If allocation fails, fall back to the loop implementation */
                if (!tcep) {
-                       tce_build_pSeriesLP(tbl, tcenum, npages, uaddr,
+                       return tce_build_pSeriesLP(tbl, tcenum, npages, uaddr,
                                            direction, attrs);
-                       return;
                }
                __get_cpu_var(tce_page) = tcep;
        }
@@ -187,6 +201,13 @@ static void tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum,
                tcenum += limit;
        } while (npages > 0 && !rc);
 
+       if (unlikely(rc == H_NOT_ENOUGH_RESOURCES)) {
+               ret = (int)rc;
+               tce_freemulti_pSeriesLP(tbl, tcenum_start,
+                                       (npages_start - (npages + limit)));
+               return ret;
+       }
+
        if (rc && printk_ratelimit()) {
                printk("tce_buildmulti_pSeriesLP: plpar_tce_put failed. rc=%ld\n", rc);
                printk("\tindex   = 0x%lx\n", (u64)tbl->it_index);
@@ -194,6 +215,7 @@ static void tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum,
                printk("\ttce[0] val = 0x%lx\n", tcep[0]);
                show_stack(current, (unsigned long *)__get_SP());
        }
+       return ret;
 }
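The H_NOT_ENOUGH_RESOURCES handling added here gives the tce_build_* paths transactional behavior: if firmware refuses a TCE partway through, everything installed so far is torn down and the error is propagated, so the table is left exactly as it was found. A standalone sketch of that rollback shape (plain C; put_tce, free_tces and the 3-page "budget" are illustrative, not hypervisor calls):

    #include <stdio.h>

    #define H_SUCCESS               0
    #define H_NOT_ENOUGH_RESOURCES  1

    static int budget = 3;             /* pretend firmware can back 3 pages */

    static long put_tce(long tcenum)
    {
            (void)tcenum;
            return budget-- > 0 ? H_SUCCESS : H_NOT_ENOUGH_RESOURCES;
    }

    static void free_tces(long start, long n)
    {
            printf("rolling back %ld TCEs starting at %ld\n", n, start);
    }

    static int build(long tcenum, long npages)
    {
            long start = tcenum, left = npages;

            while (left > 0) {
                    long rc = put_tce(tcenum);
                    if (rc == H_NOT_ENOUGH_RESOURCES) {
                            free_tces(start, npages - left); /* undo partial work */
                            return (int)rc;
                    }
                    left--;
                    tcenum++;
            }
            return 0;
    }

    int main(void)
    {
            printf("build -> %d\n", build(0, 5)); /* fails after 3 pages */
            return 0;
    }
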
 
 static void tce_free_pSeriesLP(struct iommu_table *tbl, long tcenum, long npages)
index d8680b5..a437267 100644 (file)
@@ -42,6 +42,16 @@ static inline long register_slb_shadow(unsigned long cpu, unsigned long vpa)
        return vpa_call(0x3, cpu, vpa);
 }
 
+static inline long plpar_page_set_loaned(unsigned long vpa)
+{
+       return plpar_hcall_norets(H_PAGE_INIT, H_PAGE_SET_LOANED, vpa, 0);
+}
+
+static inline long plpar_page_set_active(unsigned long vpa)
+{
+       return plpar_hcall_norets(H_PAGE_INIT, H_PAGE_SET_ACTIVE, vpa, 0);
+}
+
 extern void vpa_init(int cpu);
 
 static inline long plpar_pte_enter(unsigned long flags,
index 90beb44..063a0d2 100644 (file)
@@ -314,6 +314,76 @@ static int pseries_set_xdabr(unsigned long dabr)
                        H_DABRX_KERNEL | H_DABRX_USER);
 }
 
+#define CMO_CHARACTERISTICS_TOKEN 44
+#define CMO_MAXLENGTH 1026
+
+/**
+ * pSeries_cmo_feature_init - FW_FEATURE_CMO is not stored in ibm,hypertas-functions;
+ * handle that here. (Stolen from parse_system_parameter_string)
+ */
+void pSeries_cmo_feature_init(void)
+{
+       char *ptr, *key, *value, *end;
+       int call_status;
+       int PrPSP = -1;
+       int SecPSP = -1;
+
+       pr_debug(" -> fw_cmo_feature_init()\n");
+       spin_lock(&rtas_data_buf_lock);
+       memset(rtas_data_buf, 0, RTAS_DATA_BUF_SIZE);
+       call_status = rtas_call(rtas_token("ibm,get-system-parameter"), 3, 1,
+                               NULL,
+                               CMO_CHARACTERISTICS_TOKEN,
+                               __pa(rtas_data_buf),
+                               RTAS_DATA_BUF_SIZE);
+
+       if (call_status != 0) {
+               spin_unlock(&rtas_data_buf_lock);
+               pr_debug("CMO not available\n");
+               pr_debug(" <- fw_cmo_feature_init()\n");
+               return;
+       }
+
+       end = rtas_data_buf + CMO_MAXLENGTH - 2;
+       ptr = rtas_data_buf + 2;        /* step over strlen value */
+       key = value = ptr;
+
+       while (*ptr && (ptr <= end)) {
+               /* Separate the key and value by replacing '=' with '\0' and
+                * point the value at the string after the '='
+                */
+               if (ptr[0] == '=') {
+                       ptr[0] = '\0';
+                       value = ptr + 1;
+               } else if (ptr[0] == '\0' || ptr[0] == ',') {
+                       /* Terminate the string containing the key/value pair */
+                       ptr[0] = '\0';
+
+                       if (key == value) {
+                               pr_debug("Malformed key/value pair\n");
+                               /* Never found a '=', end processing */
+                               break;
+                       }
+
+                       if (0 == strcmp(key, "PrPSP"))
+                               PrPSP = simple_strtol(value, NULL, 10);
+                       else if (0 == strcmp(key, "SecPSP"))
+                               SecPSP = simple_strtol(value, NULL, 10);
+                       value = key = ptr + 1;
+               }
+               ptr++;
+       }
+
+       if (PrPSP != -1 || SecPSP != -1) {
+               pr_info("CMO enabled\n");
+               pr_debug("CMO enabled, PrPSP=%d, SecPSP=%d\n", PrPSP, SecPSP);
+               powerpc_firmware_features |= FW_FEATURE_CMO;
+       } else
+               pr_debug("CMO not enabled, PrPSP=%d, SecPSP=%d\n", PrPSP, SecPSP);
+       spin_unlock(&rtas_data_buf_lock);
+       pr_debug(" <- fw_cmo_feature_init()\n");
+}
+
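The scan above walks the characteristics string in place, splitting keys from values on '=' and terminating pairs on ','. A standalone rerun of the same logic over an example string (the rtas buffer's 2-byte length prefix is omitted, and the final pair is handled after the loop for brevity):

    #include <stdio.h>
    #include <string.h>
    #include <stdlib.h>

    int main(void)
    {
            char buf[] = "PrPSP=50,SecPSP=10";
            char *ptr = buf, *key = buf, *value = buf;
            int PrPSP = -1, SecPSP = -1;

            while (*ptr) {
                    if (ptr[0] == '=') {
                            ptr[0] = '\0';            /* terminate the key   */
                            value = ptr + 1;
                    } else if (ptr[0] == ',') {
                            ptr[0] = '\0';            /* terminate the value */
                            if (strcmp(key, "PrPSP") == 0)
                                    PrPSP = atoi(value);
                            else if (strcmp(key, "SecPSP") == 0)
                                    SecPSP = atoi(value);
                            key = value = ptr + 1;
                    }
                    ptr++;
            }
            /* the final pair ends at the NUL, handle it after the loop */
            if (strcmp(key, "PrPSP") == 0)
                    PrPSP = atoi(value);
            else if (strcmp(key, "SecPSP") == 0)
                    SecPSP = atoi(value);

            printf("PrPSP=%d SecPSP=%d\n", PrPSP, SecPSP);
            return 0;
    }
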
 /*
  * Early initialization.  Relocation is on but do not reference unbolted pages
  */
@@ -329,6 +399,7 @@ static void __init pSeries_init_early(void)
        else if (firmware_has_feature(FW_FEATURE_XDABR))
                ppc_md.set_dabr = pseries_set_xdabr;
 
+       pSeries_cmo_feature_init();
        iommu_init_early_pSeries();
 
        pr_debug(" <- pSeries_init_early()\n");
index de8c8b5..89639ec 100644 (file)
@@ -147,7 +147,7 @@ static void dart_flush(struct iommu_table *tbl)
        }
 }
 
-static void dart_build(struct iommu_table *tbl, long index,
+static int dart_build(struct iommu_table *tbl, long index,
                       long npages, unsigned long uaddr,
                       enum dma_data_direction direction,
                       struct dma_attrs *attrs)
@@ -184,6 +184,7 @@ static void dart_build(struct iommu_table *tbl, long index,
        } else {
                dart_dirty = 1;
        }
+       return 0;
 }
 
 
index 0052780..e5a6e2e 100644 (file)
@@ -33,6 +33,7 @@
 */
 
 #include <linux/module.h>
+#include <linux/moduleparam.h>
 #include <linux/types.h>
 #include <linux/errno.h>
 #include <linux/ioport.h>
@@ -52,7 +53,9 @@
 #include <asm/hvcall.h>
 #include <asm/atomic.h>
 #include <asm/vio.h>
+#include <asm/iommu.h>
 #include <asm/uaccess.h>
+#include <asm/firmware.h>
 #include <linux/seq_file.h>
 
 #include "ibmveth.h"
@@ -94,8 +97,10 @@ static void ibmveth_proc_register_adapter(struct ibmveth_adapter *adapter);
 static void ibmveth_proc_unregister_adapter(struct ibmveth_adapter *adapter);
 static irqreturn_t ibmveth_interrupt(int irq, void *dev_instance);
 static void ibmveth_rxq_harvest_buffer(struct ibmveth_adapter *adapter);
+static unsigned long ibmveth_get_desired_dma(struct vio_dev *vdev);
 static struct kobj_type ktype_veth_pool;
 
+
 #ifdef CONFIG_PROC_FS
 #define IBMVETH_PROC_DIR "ibmveth"
 static struct proc_dir_entry *ibmveth_proc_dir;
@@ -226,16 +231,16 @@ static void ibmveth_replenish_buffer_pool(struct ibmveth_adapter *adapter, struc
        u32 i;
        u32 count = pool->size - atomic_read(&pool->available);
        u32 buffers_added = 0;
+       struct sk_buff *skb;
+       unsigned int free_index, index;
+       u64 correlator;
+       unsigned long lpar_rc;
+       dma_addr_t dma_addr;
 
        mb();
 
        for(i = 0; i < count; ++i) {
-               struct sk_buff *skb;
-               unsigned int free_index, index;
-               u64 correlator;
                union ibmveth_buf_desc desc;
-               unsigned long lpar_rc;
-               dma_addr_t dma_addr;
 
                skb = alloc_skb(pool->buff_size, GFP_ATOMIC);
 
@@ -255,6 +260,9 @@ static void ibmveth_replenish_buffer_pool(struct ibmveth_adapter *adapter, struc
                dma_addr = dma_map_single(&adapter->vdev->dev, skb->data,
                                pool->buff_size, DMA_FROM_DEVICE);
 
+               if (dma_mapping_error(dma_addr))
+                       goto failure;
+
                pool->free_map[free_index] = IBM_VETH_INVALID_MAP;
                pool->dma_addr[index] = dma_addr;
                pool->skbuff[index] = skb;
@@ -267,20 +275,9 @@ static void ibmveth_replenish_buffer_pool(struct ibmveth_adapter *adapter, struc
 
                lpar_rc = h_add_logical_lan_buffer(adapter->vdev->unit_address, desc.desc);
 
-               if(lpar_rc != H_SUCCESS) {
-                       pool->free_map[free_index] = index;
-                       pool->skbuff[index] = NULL;
-                       if (pool->consumer_index == 0)
-                               pool->consumer_index = pool->size - 1;
-                       else
-                               pool->consumer_index--;
-                       dma_unmap_single(&adapter->vdev->dev,
-                                       pool->dma_addr[index], pool->buff_size,
-                                       DMA_FROM_DEVICE);
-                       dev_kfree_skb_any(skb);
-                       adapter->replenish_add_buff_failure++;
-                       break;
-               } else {
+               if (lpar_rc != H_SUCCESS)
+                       goto failure;
+               else {
                        buffers_added++;
                        adapter->replenish_add_buff_success++;
                }
@@ -288,6 +285,24 @@ static void ibmveth_replenish_buffer_pool(struct ibmveth_adapter *adapter, struc
 
        mb();
        atomic_add(buffers_added, &(pool->available));
+       return;
+
+failure:
+       pool->free_map[free_index] = index;
+       pool->skbuff[index] = NULL;
+       if (pool->consumer_index == 0)
+               pool->consumer_index = pool->size - 1;
+       else
+               pool->consumer_index--;
+       if (!dma_mapping_error(dma_addr))
+               dma_unmap_single(&adapter->vdev->dev,
+                                pool->dma_addr[index], pool->buff_size,
+                                DMA_FROM_DEVICE);
+       dev_kfree_skb_any(skb);
+       adapter->replenish_add_buff_failure++;
+
+       mb();
+       atomic_add(buffers_added, &(pool->available));
 }
 
 /* replenish routine */
@@ -297,7 +312,7 @@ static void ibmveth_replenish_task(struct ibmveth_adapter *adapter)
 
        adapter->replenish_task_cycles++;
 
-       for(i = 0; i < IbmVethNumBufferPools; i++)
+       for (i = (IbmVethNumBufferPools - 1); i >= 0; i--)
                if(adapter->rx_buff_pool[i].active)
                        ibmveth_replenish_buffer_pool(adapter,
                                                     &adapter->rx_buff_pool[i]);
@@ -472,6 +487,18 @@ static void ibmveth_cleanup(struct ibmveth_adapter *adapter)
                if (adapter->rx_buff_pool[i].active)
                        ibmveth_free_buffer_pool(adapter,
                                                 &adapter->rx_buff_pool[i]);
+
+       if (adapter->bounce_buffer != NULL) {
+               if (!dma_mapping_error(adapter->bounce_buffer_dma)) {
+                       dma_unmap_single(&adapter->vdev->dev,
+                                       adapter->bounce_buffer_dma,
+                                       adapter->netdev->mtu + IBMVETH_BUFF_OH,
+                                       DMA_BIDIRECTIONAL);
+                       adapter->bounce_buffer_dma = DMA_ERROR_CODE;
+               }
+               kfree(adapter->bounce_buffer);
+               adapter->bounce_buffer = NULL;
+       }
 }
 
 static int ibmveth_register_logical_lan(struct ibmveth_adapter *adapter,
@@ -607,6 +634,24 @@ static int ibmveth_open(struct net_device *netdev)
                return rc;
        }
 
+       adapter->bounce_buffer =
+           kmalloc(netdev->mtu + IBMVETH_BUFF_OH, GFP_KERNEL);
+       if (!adapter->bounce_buffer) {
+               ibmveth_error_printk("unable to allocate bounce buffer\n");
+               ibmveth_cleanup(adapter);
+               napi_disable(&adapter->napi);
+               return -ENOMEM;
+       }
+       adapter->bounce_buffer_dma =
+           dma_map_single(&adapter->vdev->dev, adapter->bounce_buffer,
+                          netdev->mtu + IBMVETH_BUFF_OH, DMA_BIDIRECTIONAL);
+       if (dma_mapping_error(adapter->bounce_buffer_dma)) {
+               ibmveth_error_printk("unable to map bounce buffer\n");
+               ibmveth_cleanup(adapter);
+               napi_disable(&adapter->napi);
+               return -ENOMEM;
+       }
+
        ibmveth_debug_printk("initial replenish cycle\n");
        ibmveth_interrupt(netdev->irq, netdev);
 
@@ -853,10 +898,12 @@ static int ibmveth_start_xmit(struct sk_buff *skb, struct net_device *netdev)
        unsigned int tx_packets = 0;
        unsigned int tx_send_failed = 0;
        unsigned int tx_map_failed = 0;
+       int used_bounce = 0;
+       unsigned long data_dma_addr;
 
        desc.fields.flags_len = IBMVETH_BUF_VALID | skb->len;
-       desc.fields.address = dma_map_single(&adapter->vdev->dev, skb->data,
-                                            skb->len, DMA_TO_DEVICE);
+       data_dma_addr = dma_map_single(&adapter->vdev->dev, skb->data,
+                                      skb->len, DMA_TO_DEVICE);
 
        if (skb->ip_summed == CHECKSUM_PARTIAL &&
            ip_hdr(skb)->protocol != IPPROTO_TCP && skb_checksum_help(skb)) {
@@ -875,12 +922,16 @@ static int ibmveth_start_xmit(struct sk_buff *skb, struct net_device *netdev)
                buf[1] = 0;
        }
 
-       if (dma_mapping_error(desc.fields.address)) {
-               ibmveth_error_printk("tx: unable to map xmit buffer\n");
+       if (dma_mapping_error(data_dma_addr)) {
+               if (!firmware_has_feature(FW_FEATURE_CMO))
+                       ibmveth_error_printk("tx: unable to map xmit buffer\n");
+               skb_copy_from_linear_data(skb, adapter->bounce_buffer,
+                                         skb->len);
+               desc.fields.address = adapter->bounce_buffer_dma;
                tx_map_failed++;
-               tx_dropped++;
-               goto out;
-       }
+               used_bounce = 1;
+       } else
+               desc.fields.address = data_dma_addr;
 
        /* send the frame. Arbitrarily set retrycount to 1024 */
        correlator = 0;
@@ -904,8 +955,9 @@ static int ibmveth_start_xmit(struct sk_buff *skb, struct net_device *netdev)
                netdev->trans_start = jiffies;
        }
 
-       dma_unmap_single(&adapter->vdev->dev, desc.fields.address,
-                        skb->len, DMA_TO_DEVICE);
+       if (!used_bounce)
+               dma_unmap_single(&adapter->vdev->dev, data_dma_addr,
+                                skb->len, DMA_TO_DEVICE);
 
 out:   spin_lock_irqsave(&adapter->stats_lock, flags);
        netdev->stats.tx_dropped += tx_dropped;
@@ -1053,9 +1105,9 @@ static void ibmveth_set_multicast_list(struct net_device *netdev)
 static int ibmveth_change_mtu(struct net_device *dev, int new_mtu)
 {
        struct ibmveth_adapter *adapter = dev->priv;
+       struct vio_dev *viodev = adapter->vdev;
        int new_mtu_oh = new_mtu + IBMVETH_BUFF_OH;
-       int reinit = 0;
-       int i, rc;
+       int i;
 
        if (new_mtu < IBMVETH_MAX_MTU)
                return -EINVAL;
@@ -1067,23 +1119,34 @@ static int ibmveth_change_mtu(struct net_device *dev, int new_mtu)
        if (i == IbmVethNumBufferPools)
                return -EINVAL;
 
+       /* Deactivate all the buffer pools so that the next loop can activate
+        * only the buffer pools necessary to hold the new MTU
+        */
+       for (i = 0; i < IbmVethNumBufferPools; i++)
+               if (adapter->rx_buff_pool[i].active) {
+                       ibmveth_free_buffer_pool(adapter,
+                                                &adapter->rx_buff_pool[i]);
+                       adapter->rx_buff_pool[i].active = 0;
+               }
+
        /* Look for an active buffer pool that can hold the new MTU */
        for(i = 0; i<IbmVethNumBufferPools; i++) {
-               if (!adapter->rx_buff_pool[i].active) {
-                       adapter->rx_buff_pool[i].active = 1;
-                       reinit = 1;
-               }
+               adapter->rx_buff_pool[i].active = 1;
 
                if (new_mtu_oh < adapter->rx_buff_pool[i].buff_size) {
-                       if (reinit && netif_running(adapter->netdev)) {
+                       if (netif_running(adapter->netdev)) {
                                adapter->pool_config = 1;
                                ibmveth_close(adapter->netdev);
                                adapter->pool_config = 0;
                                dev->mtu = new_mtu;
-                               if ((rc = ibmveth_open(adapter->netdev)))
-                                       return rc;
-                       } else
-                               dev->mtu = new_mtu;
+                               vio_cmo_set_dev_desired(viodev,
+                                       ibmveth_get_desired_dma(viodev));
+                               return ibmveth_open(adapter->netdev);
+                       }
+                       dev->mtu = new_mtu;
+                       vio_cmo_set_dev_desired(viodev,
+                                               ibmveth_get_desired_dma(viodev));
                        return 0;
                }
        }
@@ -1098,6 +1161,46 @@ static void ibmveth_poll_controller(struct net_device *dev)
 }
 #endif
 
+/**
+ * ibmveth_get_desired_dma - Calculate IO memory desired by the driver
+ *
+ * @vdev: struct vio_dev for the device whose desired IO mem is to be returned
+ *
+ * Return value:
+ *     Number of bytes of IO data the driver will need to perform well.
+ */
+static unsigned long ibmveth_get_desired_dma(struct vio_dev *vdev)
+{
+       struct net_device *netdev = dev_get_drvdata(&vdev->dev);
+       struct ibmveth_adapter *adapter;
+       unsigned long ret;
+       int i;
+       int rxqentries = 1;
+
+       /* netdev inits at probe time along with the structures we need below */
+       if (netdev == NULL)
+               return IOMMU_PAGE_ALIGN(IBMVETH_IO_ENTITLEMENT_DEFAULT);
+
+       adapter = netdev_priv(netdev);
+
+       ret = IBMVETH_BUFF_LIST_SIZE + IBMVETH_FILT_LIST_SIZE;
+       ret += IOMMU_PAGE_ALIGN(netdev->mtu);
+
+       for (i = 0; i < IbmVethNumBufferPools; i++) {
+               /* add the size of the active receive buffers */
+               if (adapter->rx_buff_pool[i].active)
+                       ret += adapter->rx_buff_pool[i].size *
+                               IOMMU_PAGE_ALIGN(adapter->rx_buff_pool[i].buff_size);
+               rxqentries += adapter->rx_buff_pool[i].size;
+       }
+       /* add the size of the receive queue entries */
+       ret += IOMMU_PAGE_ALIGN(rxqentries * sizeof(struct ibmveth_rx_q_entry));
+
+       return ret;
+}
+
 static int __devinit ibmveth_probe(struct vio_dev *dev, const struct vio_device_id *id)
 {
        int rc, i;
@@ -1242,6 +1345,8 @@ static int __devexit ibmveth_remove(struct vio_dev *dev)
        ibmveth_proc_unregister_adapter(adapter);
 
        free_netdev(netdev);
+       dev_set_drvdata(&dev->dev, NULL);
+
        return 0;
 }
 
@@ -1402,14 +1507,15 @@ const char * buf, size_t count)
                                return -EPERM;
                        }
 
-                       pool->active = 0;
                        if (netif_running(netdev)) {
                                adapter->pool_config = 1;
                                ibmveth_close(netdev);
+                               pool->active = 0;
                                adapter->pool_config = 0;
                                if ((rc = ibmveth_open(netdev)))
                                        return rc;
                        }
+                       pool->active = 0;
                }
        } else if (attr == &veth_num_attr) {
                if (value <= 0 || value > IBMVETH_MAX_POOL_COUNT)
@@ -1485,6 +1591,7 @@ static struct vio_driver ibmveth_driver = {
        .id_table       = ibmveth_device_table,
        .probe          = ibmveth_probe,
        .remove         = ibmveth_remove,
+       .get_desired_dma = ibmveth_get_desired_dma,
        .driver         = {
                .name   = ibmveth_driver_name,
                .owner  = THIS_MODULE,
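
The transmit path above shows the usual CMO coping pattern: when
dma_map_single() fails because the partition has run out of I/O
entitlement, the frame is copied into a bounce buffer that was mapped once
in ibmveth_open(). A distilled sketch of that fallback, using the 2008-era
one-argument dma_mapping_error(); ibmveth_map_for_tx() is a hypothetical
helper name, not a driver symbol:

static dma_addr_t ibmveth_map_for_tx(struct ibmveth_adapter *adapter,
				     struct sk_buff *skb, int *used_bounce)
{
	dma_addr_t addr = dma_map_single(&adapter->vdev->dev, skb->data,
					 skb->len, DMA_TO_DEVICE);

	if (dma_mapping_error(addr)) {
		/* Entitlement exhausted: reuse the buffer mapped at open() */
		skb_copy_from_linear_data(skb, adapter->bounce_buffer,
					  skb->len);
		*used_bounce = 1;
		return adapter->bounce_buffer_dma;
	}

	*used_bounce = 0;
	return addr;
}
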
index 41f61cd..d281869 100644 (file)
@@ -93,9 +93,12 @@ static inline long h_illan_attributes(unsigned long unit_address,
   plpar_hcall_norets(H_CHANGE_LOGICAL_LAN_MAC, ua, mac)
 
 #define IbmVethNumBufferPools 5
+#define IBMVETH_IO_ENTITLEMENT_DEFAULT 4243456 /* MTU of 1500 needs 4.2MB */
 #define IBMVETH_BUFF_OH 22 /* Overhead: 14 ethernet header + 8 opaque handle */
 #define IBMVETH_MAX_MTU 68
 #define IBMVETH_MAX_POOL_COUNT 4096
+#define IBMVETH_BUFF_LIST_SIZE 4096
+#define IBMVETH_FILT_LIST_SIZE 4096
 #define IBMVETH_MAX_BUF_SIZE (1024 * 128)
 
 static int pool_size[] = { 512, 1024 * 2, 1024 * 16, 1024 * 32, 1024 * 64 };
@@ -143,6 +146,8 @@ struct ibmveth_adapter {
     struct ibmveth_rx_q rx_queue;
     int pool_config;
     int rx_csum;
+    void *bounce_buffer;
+    dma_addr_t bounce_buffer_dma;
 
     /* adapter specific stats */
     u64 replenish_task_cycles;
index 5c015d3..344e1b0 100644 (file)
@@ -91,8 +91,6 @@ void of_register_i2c_devices(struct i2c_adapter *adap,
                }
 
                info.irq = irq_of_parse_and_map(node, 0);
-               if (info.irq == NO_IRQ)
-                       info.irq = -1;
 
                if (of_find_i2c_driver(node, &info) < 0) {
                        irq_dispose_mapping(info.irq);
index eb702b9..c4a7c06 100644 (file)
@@ -3819,6 +3819,20 @@ static int ibmvfc_remove(struct vio_dev *vdev)
        return 0;
 }
 
+/**
+ * ibmvfc_get_desired_dma - Calculate DMA resources needed by the driver
+ * @vdev:      vio device struct
+ *
+ * Return value:
+ *     Number of bytes the driver will need to DMA map at the same time in
+ *     order to perform well.
+ */
+static unsigned long ibmvfc_get_desired_dma(struct vio_dev *vdev)
+{
+       unsigned long pool_dma = max_requests * sizeof(union ibmvfc_iu);
+
+       return pool_dma + ((512 * 1024) * driver_template.cmd_per_lun);
+}
+
 static struct vio_device_id ibmvfc_device_table[] __devinitdata = {
        {"fcp", "IBM,vfc-client"},
        { "", "" }
@@ -3829,6 +3843,7 @@ static struct vio_driver ibmvfc_driver = {
        .id_table = ibmvfc_device_table,
        .probe = ibmvfc_probe,
        .remove = ibmvfc_remove,
+       .get_desired_dma = ibmvfc_get_desired_dma,
        .driver = {
                .name = IBMVFC_NAME,
                .owner = THIS_MODULE,
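
To put rough numbers on that entitlement request, here is the same formula
evaluated with illustrative values; all three inputs below are assumptions
for the example, since sizeof(union ibmvfc_iu) and the module defaults
vary by build:

/* Illustrative arithmetic only; the inputs are assumed values. */
static unsigned long example_ibmvfc_desired_dma(void)
{
	unsigned long max_requests = 100;  /* assumed module default */
	unsigned long iu_size = 1024;      /* assumed sizeof(union ibmvfc_iu) */
	unsigned long cmd_per_lun = 3;     /* assumed template default */
	unsigned long pool_dma = max_requests * iu_size;  /* 102400 bytes */

	return pool_dma + (512 * 1024) * cmd_per_lun;     /* 1675264 bytes */
}
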
index 5d23368..20000ec 100644 (file)
@@ -426,8 +427,10 @@ static int map_sg_data(struct scsi_cmnd *cmd,
                                           SG_ALL * sizeof(struct srp_direct_buf),
                                           &evt_struct->ext_list_token, 0);
                if (!evt_struct->ext_list) {
-                       sdev_printk(KERN_ERR, cmd->device,
-                                   "Can't allocate memory for indirect table\n");
+                       if (!firmware_has_feature(FW_FEATURE_CMO))
+                               sdev_printk(KERN_ERR, cmd->device,
+                                           "Can't allocate memory "
+                                           "for indirect table\n");
                        return 0;
                }
        }
@@ -743,7 +746,9 @@ static int ibmvscsi_queuecommand(struct scsi_cmnd *cmnd,
        srp_cmd->lun = ((u64) lun) << 48;
 
        if (!map_data_for_srp_cmd(cmnd, evt_struct, srp_cmd, hostdata->dev)) {
-               sdev_printk(KERN_ERR, cmnd->device, "couldn't convert cmd to srp_cmd\n");
+               if (!firmware_has_feature(FW_FEATURE_CMO))
+                       sdev_printk(KERN_ERR, cmnd->device,
+                                   "couldn't convert cmd to srp_cmd\n");
                free_event_struct(&hostdata->pool, evt_struct);
                return SCSI_MLQUEUE_HOST_BUSY;
        }
@@ -855,7 +860,10 @@ static void send_mad_adapter_info(struct ibmvscsi_host_data *hostdata)
                                            DMA_BIDIRECTIONAL);
 
        if (dma_mapping_error(req->buffer)) {
-               dev_err(hostdata->dev, "Unable to map request_buffer for adapter_info!\n");
+               if (!firmware_has_feature(FW_FEATURE_CMO))
+                       dev_err(hostdata->dev,
+                               "Unable to map request_buffer for "
+                               "adapter_info!\n");
                free_event_struct(&hostdata->pool, evt_struct);
                return;
        }
@@ -1400,7 +1408,9 @@ static int ibmvscsi_do_host_config(struct ibmvscsi_host_data *hostdata,
                                                    DMA_BIDIRECTIONAL);
 
        if (dma_mapping_error(host_config->buffer)) {
-               dev_err(hostdata->dev, "dma_mapping error getting host config\n");
+               if (!firmware_has_feature(FW_FEATURE_CMO))
+                       dev_err(hostdata->dev,
+                               "dma_mapping error getting host config\n");
                free_event_struct(&hostdata->pool, evt_struct);
                return -1;
        }
@@ -1604,7 +1614,7 @@ static struct scsi_host_template driver_template = {
        .eh_host_reset_handler = ibmvscsi_eh_host_reset_handler,
        .slave_configure = ibmvscsi_slave_configure,
        .change_queue_depth = ibmvscsi_change_queue_depth,
-       .cmd_per_lun = 16,
+       .cmd_per_lun = IBMVSCSI_CMDS_PER_LUN_DEFAULT,
        .can_queue = IBMVSCSI_MAX_REQUESTS_DEFAULT,
        .this_id = -1,
        .sg_tablesize = SG_ALL,
@@ -1613,6 +1623,26 @@ static struct scsi_host_template driver_template = {
 };
 
 /**
+ * ibmvscsi_get_desired_dma - Calculate IO memory desired by the driver
+ *
+ * @vdev: struct vio_dev for the device whose desired IO mem is to be returned
+ *
+ * Return value:
+ *     Number of bytes of IO data the driver will need to perform well.
+ */
+static unsigned long ibmvscsi_get_desired_dma(struct vio_dev *vdev)
+{
+       /* iu_storage data allocated in initialize_event_pool */
+       unsigned long desired_io = max_requests * sizeof(union viosrp_iu);
+
+       /* add io space for sg data */
+       desired_io += (IBMVSCSI_MAX_SECTORS_DEFAULT * 512 *
+                            IBMVSCSI_CMDS_PER_LUN_DEFAULT);
+
+       return desired_io;
+}
+
+/**
  * Called by bus code for each adapter
  */
 static int ibmvscsi_probe(struct vio_dev *vdev, const struct vio_device_id *id)
@@ -1641,7 +1671,7 @@ static int ibmvscsi_probe(struct vio_dev *vdev, const struct vio_device_id *id)
        hostdata->host = host;
        hostdata->dev = dev;
        atomic_set(&hostdata->request_limit, -1);
-       hostdata->host->max_sectors = 32 * 8; /* default max I/O 32 pages */
+       hostdata->host->max_sectors = IBMVSCSI_MAX_SECTORS_DEFAULT;
 
        rc = ibmvscsi_ops->init_crq_queue(&hostdata->queue, hostdata, max_requests);
        if (rc != 0 && rc != H_RESOURCE) {
@@ -1735,6 +1765,7 @@ static struct vio_driver ibmvscsi_driver = {
        .id_table = ibmvscsi_device_table,
        .probe = ibmvscsi_probe,
        .remove = ibmvscsi_remove,
+       .get_desired_dma = ibmvscsi_get_desired_dma,
        .driver = {
                .name = "ibmvscsi",
                .owner = THIS_MODULE,
index 46e850e..2d4339d 100644 (file)
@@ -45,6 +45,8 @@ struct Scsi_Host;
 #define MAX_INDIRECT_BUFS 10
 
 #define IBMVSCSI_MAX_REQUESTS_DEFAULT 100
+#define IBMVSCSI_CMDS_PER_LUN_DEFAULT 16
+#define IBMVSCSI_MAX_SECTORS_DEFAULT 256 /* 32 pages * 8 sectors/page = default max I/O */
 #define IBMVSCSI_MAX_CMDS_PER_LUN 64
 
 /* ------------------------------------------------------------
index 88d1803..3b6ff85 100644 (file)
@@ -131,6 +131,15 @@ static int padzero(unsigned long elf_bss)
 #define STACK_ALLOC(sp, len) ({ sp -= len ; sp; })
 #endif
 
+#ifndef ELF_BASE_PLATFORM
+/*
+ * AT_BASE_PLATFORM indicates the "real" hardware/microarchitecture.
+ * If the arch defines ELF_BASE_PLATFORM (in asm/elf.h), the value
+ * will be copied to the user stack in the same manner as AT_PLATFORM.
+ */
+#define ELF_BASE_PLATFORM NULL
+#endif
+
 static int
 create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec,
                unsigned long load_addr, unsigned long interp_load_addr)
@@ -142,7 +151,9 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec,
        elf_addr_t __user *envp;
        elf_addr_t __user *sp;
        elf_addr_t __user *u_platform;
+       elf_addr_t __user *u_base_platform;
        const char *k_platform = ELF_PLATFORM;
+       const char *k_base_platform = ELF_BASE_PLATFORM;
        int items;
        elf_addr_t *elf_info;
        int ei_index = 0;
@@ -172,6 +183,19 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec,
                        return -EFAULT;
        }
 
+       /*
+        * If this architecture has a "base" platform capability
+        * string, copy it to userspace.
+        */
+       u_base_platform = NULL;
+       if (k_base_platform) {
+               size_t len = strlen(k_base_platform) + 1;
+
+               u_base_platform = (elf_addr_t __user *)STACK_ALLOC(p, len);
+               if (__copy_to_user(u_base_platform, k_base_platform, len))
+                       return -EFAULT;
+       }
+
        /* Create the ELF interpreter info */
        elf_info = (elf_addr_t *)current->mm->saved_auxv;
        /* update AT_VECTOR_SIZE_BASE if the number of NEW_AUX_ENT() changes */
@@ -209,6 +233,10 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec,
                NEW_AUX_ENT(AT_PLATFORM,
                            (elf_addr_t)(unsigned long)u_platform);
        }
+       if (k_base_platform) {
+               NEW_AUX_ENT(AT_BASE_PLATFORM,
+                           (elf_addr_t)(unsigned long)u_base_platform);
+       }
        if (bprm->interp_flags & BINPRM_FLAGS_EXECFD) {
                NEW_AUX_ENT(AT_EXECFD, bprm->interp_data);
        }
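
Once an architecture defines ELF_BASE_PLATFORM, the extra auxiliary vector
entry is visible to every process. A small userspace check, assuming a
libc new enough to provide getauxval() (older systems can parse
/proc/self/auxv instead):

#include <stdio.h>
#include <sys/auxv.h>

#ifndef AT_BASE_PLATFORM
#define AT_BASE_PLATFORM 24	/* matches include/linux/auxvec.h */
#endif

int main(void)
{
	const char *plat = (const char *)getauxval(AT_PLATFORM);
	const char *base = (const char *)getauxval(AT_BASE_PLATFORM);

	/* e.g. "power5+" vs "power6" when running in compatibility mode */
	printf("AT_PLATFORM:      %s\n", plat ? plat : "(none)");
	printf("AT_BASE_PLATFORM: %s\n", base ? base : "(none)");
	return 0;
}
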
index 2a3e907..ef8a248 100644 (file)
@@ -127,6 +127,8 @@ extern struct cpu_spec *identify_cpu(unsigned long offset, unsigned int pvr);
 extern void do_feature_fixups(unsigned long value, void *fixup_start,
                              void *fixup_end);
 
+extern const char *powerpc_base_platform;
+
 #endif /* __ASSEMBLY__ */
 
 /* CPU kernel features */
index 8966467..80d1f39 100644 (file)
@@ -217,6 +217,14 @@ typedef elf_vrregset_t elf_fpxregset_t;
 
 #define ELF_PLATFORM   (cur_cpu_spec->platform)
 
+/* While ELF_PLATFORM indicates the ISA supported by the platform, it
+ * may not accurately reflect the underlying behavior of the hardware
+ * (as in the case of running in Power5+ compatibility mode on a
+ * Power6 machine).  ELF_BASE_PLATFORM allows ld.so to load libraries
+ * that are tuned for the real hardware.
+ */
+#define ELF_BASE_PLATFORM (powerpc_base_platform)
+
 #ifdef __powerpc64__
 # define ELF_PLAT_INIT(_r, load_addr)  do {    \
        _r->gpr[2] = load_addr;                 \
index ef32899..3a17982 100644 (file)
@@ -46,6 +46,7 @@
 #define FW_FEATURE_PS3_LV1     ASM_CONST(0x0000000000800000)
 #define FW_FEATURE_BEAT                ASM_CONST(0x0000000001000000)
 #define FW_FEATURE_BULK_REMOVE ASM_CONST(0x0000000002000000)
+#define FW_FEATURE_CMO         ASM_CONST(0x0000000004000000)
 
 #ifndef __ASSEMBLY__
 
@@ -58,7 +59,7 @@ enum {
                FW_FEATURE_MIGRATE | FW_FEATURE_PERFMON | FW_FEATURE_CRQ |
                FW_FEATURE_VIO | FW_FEATURE_RDMA | FW_FEATURE_LLAN |
                FW_FEATURE_BULK | FW_FEATURE_XDABR | FW_FEATURE_MULTITCE |
-               FW_FEATURE_SPLPAR | FW_FEATURE_LPAR,
+               FW_FEATURE_SPLPAR | FW_FEATURE_LPAR | FW_FEATURE_CMO,
        FW_FEATURE_PSERIES_ALWAYS = 0,
        FW_FEATURE_ISERIES_POSSIBLE = FW_FEATURE_ISERIES | FW_FEATURE_LPAR,
        FW_FEATURE_ISERIES_ALWAYS = FW_FEATURE_ISERIES | FW_FEATURE_LPAR,
index bf6cd7c..fbe2932 100644 (file)
 #define H_EXACT                        (1UL<<(63-24))  /* Use exact PTE or return H_PTEG_FULL */
 #define H_R_XLATE              (1UL<<(63-25))  /* include a valid logical page num in the pte if the valid bit is set */
 #define H_READ_4               (1UL<<(63-26))  /* Return 4 PTEs */
+#define H_PAGE_STATE_CHANGE    (1UL<<(63-28))
+#define H_PAGE_UNUSED          ((1UL<<(63-29)) | (1UL<<(63-30)))
+#define H_PAGE_SET_UNUSED      (H_PAGE_STATE_CHANGE | H_PAGE_UNUSED)
+#define H_PAGE_SET_LOANED      (H_PAGE_SET_UNUSED | (1UL<<(63-31)))
+#define H_PAGE_SET_ACTIVE      H_PAGE_STATE_CHANGE
 #define H_AVPN                 (1UL<<(63-32))  /* An avpn is provided as a sanity test */
 #define H_ANDCOND              (1UL<<(63-33))
 #define H_ICACHE_INVALIDATE    (1UL<<(63-40))  /* icbi, etc.  (ignored for IO pages) */
 #define H_JOIN                 0x298
 #define H_VASI_STATE            0x2A4
 #define H_ENABLE_CRQ           0x2B0
-#define MAX_HCALL_OPCODE       H_ENABLE_CRQ
+#define H_SET_MPP              0x2D0
+#define H_GET_MPP              0x2D4
+#define MAX_HCALL_OPCODE       H_GET_MPP
 
 #ifndef __ASSEMBLY__
 
@@ -270,6 +277,20 @@ struct hcall_stats {
 };
 #define HCALL_STAT_ARRAY_SIZE  ((MAX_HCALL_OPCODE >> 2) + 1)
 
+struct hvcall_mpp_data {
+       unsigned long entitled_mem;
+       unsigned long mapped_mem;
+       unsigned short group_num;
+       unsigned short pool_num;
+       unsigned char mem_weight;
+       unsigned char unallocated_mem_weight;
+       unsigned long unallocated_entitlement;  /* value in bytes */
+       unsigned long pool_size;
+       signed long loan_request;
+       unsigned long backing_mem;
+};
+
+int h_get_mpp(struct hvcall_mpp_data *);
 #endif /* __ASSEMBLY__ */
 #endif /* __KERNEL__ */
 #endif /* _ASM_POWERPC_HVCALL_H */
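
A caller-side sketch of the new interface: h_get_mpp() wraps H_GET_MPP and
fills in struct hvcall_mpp_data, which code such as lparcfg can then
report. This is a hedged sketch of a consumer, not the actual lparcfg
code:

/* Sketch: log the partition's memory entitlement figures. */
static void example_report_mpp(void)
{
	struct hvcall_mpp_data mpp;

	if (h_get_mpp(&mpp) != H_SUCCESS)
		return;

	pr_info("entitled=%lu backing=%lu loan_request=%ld\n",
		mpp.entitled_mem, mpp.backing_mem, mpp.loan_request);
}
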
index 567ed92..2fe268b 100644 (file)
@@ -125,7 +125,10 @@ struct lppaca {
        // NOTE: This value will ALWAYS be zero for dedicated processors and
        // will NEVER be zero for shared processors (ie, initialized to a 1).
        volatile u32 yield_count;       // PLIC increments each dispatchx00-x03
-       u8      reserved6[124];         // Reserved                     x04-x7F
+       u32 reserved6;                  // Reserved                     x04-x07
+       volatile u64 cmo_faults;        // CMO page fault count         x08-x0F
+       volatile u64 cmo_fault_time;    // CMO page fault time          x10-x17
+       u8      reserved7[104];         // Reserved                     x18-x7F
 
 //=============================================================================
 // CACHE_LINE_4-5 0x0180 - 0x027F Contains PMC interrupt data
index 1233d73..893aafd 100644 (file)
@@ -76,7 +76,7 @@ struct machdep_calls {
         * destroyed as well */
        void            (*hpte_clear_all)(void);
 
-       void            (*tce_build)(struct iommu_table * tbl,
+       int             (*tce_build)(struct iommu_table *tbl,
                                     long index,
                                     long npages,
                                     unsigned long uaddr,
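
The signature change matters because, under CMO, a hypervisor call to map
a TCE can now fail; returning int lets the generic IOMMU code unwind and
hand back DMA_ERROR_CODE instead of a broken mapping. A hedged sketch of a
backend under the new signature, where put_tce_entry() is a hypothetical
helper rather than a kernel symbol:

static int example_tce_build(struct iommu_table *tbl, long index,
			     long npages, unsigned long uaddr,
			     enum dma_data_direction direction,
			     struct dma_attrs *attrs)
{
	while (npages--) {
		/* A failed TCE insert now propagates to the caller,
		 * which frees the table slots it had reserved.
		 */
		if (put_tce_entry(tbl, index++, uaddr) != 0)
			return -ENOMEM;
		uaddr += IOMMU_PAGE_SIZE;
	}
	return 0;
}
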
index 710c5d3..8917ed6 100644 (file)
 #define MPC52xx_PSC_RXTX_FIFO_ALARM    0x0002
 #define MPC52xx_PSC_RXTX_FIFO_EMPTY    0x0001
 
-/* PSC interrupt mask bits */
+/* PSC interrupt status/mask bits */
 #define MPC52xx_PSC_IMR_TXRDY          0x0100
 #define MPC52xx_PSC_IMR_RXRDY          0x0200
 #define MPC52xx_PSC_IMR_DB             0x0400
+#define MPC52xx_PSC_IMR_TXEMP          0x0800
+#define MPC52xx_PSC_IMR_ORERR          0x1000
 #define MPC52xx_PSC_IMR_IPC            0x8000
 
 /* PSC input port change bit */
 
 #define MPC52xx_PSC_RFNUM_MASK 0x01ff
 
+#define MPC52xx_PSC_SICR_DTS1                  (1 << 29)
+#define MPC52xx_PSC_SICR_SHDR                  (1 << 28)
+#define MPC52xx_PSC_SICR_SIM_MASK              (0xf << 24)
+#define MPC52xx_PSC_SICR_SIM_UART              (0x0 << 24)
+#define MPC52xx_PSC_SICR_SIM_UART_DCD          (0x8 << 24)
+#define MPC52xx_PSC_SICR_SIM_CODEC_8           (0x1 << 24)
+#define MPC52xx_PSC_SICR_SIM_CODEC_16          (0x2 << 24)
+#define MPC52xx_PSC_SICR_SIM_AC97              (0x3 << 24)
+#define MPC52xx_PSC_SICR_SIM_SIR               (0x8 << 24)
+#define MPC52xx_PSC_SICR_SIM_SIR_DCD           (0xc << 24)
+#define MPC52xx_PSC_SICR_SIM_MIR               (0x5 << 24)
+#define MPC52xx_PSC_SICR_SIM_FIR               (0x6 << 24)
+#define MPC52xx_PSC_SICR_SIM_CODEC_24          (0x7 << 24)
+#define MPC52xx_PSC_SICR_SIM_CODEC_32          (0xf << 24)
+#define MPC52xx_PSC_SICR_GENCLK                        (1 << 23)
+#define MPC52xx_PSC_SICR_I2S                   (1 << 22)
+#define MPC52xx_PSC_SICR_CLKPOL                        (1 << 21)
+#define MPC52xx_PSC_SICR_SYNCPOL               (1 << 20)
+#define MPC52xx_PSC_SICR_CELLSLAVE             (1 << 19)
+#define MPC52xx_PSC_SICR_CELL2XCLK             (1 << 18)
+#define MPC52xx_PSC_SICR_ESAI                  (1 << 17)
+#define MPC52xx_PSC_SICR_ENAC97                        (1 << 16)
+#define MPC52xx_PSC_SICR_SPI                   (1 << 15)
+#define MPC52xx_PSC_SICR_MSTR                  (1 << 14)
+#define MPC52xx_PSC_SICR_CPOL                  (1 << 13)
+#define MPC52xx_PSC_SICR_CPHA                  (1 << 12)
+#define MPC52xx_PSC_SICR_USEEOF                        (1 << 11)
+#define MPC52xx_PSC_SICR_DISABLEEOF            (1 << 10)
 
 /* Structure of the hardware registers */
 struct mpc52xx_psc {
@@ -132,8 +162,12 @@ struct mpc52xx_psc {
        u8              reserved5[3];
        u8              ctlr;           /* PSC + 0x1c */
        u8              reserved6[3];
-       u16             ccr;            /* PSC + 0x20 */
-       u8              reserved7[14];
+       /* BitClkDiv field of CCR is byte swapped in
+        * the hardware for mpc5200/b compatibility */
+       u32             ccr;            /* PSC + 0x20 */
+       u32             ac97_slots;     /* PSC + 0x24 */
+       u32             ac97_cmd;       /* PSC + 0x28 */
+       u32             ac97_data;      /* PSC + 0x2c */
        u8              ivr;            /* PSC + 0x30 */
        u8              reserved8[3];
        u8              ip;             /* PSC + 0x34 */
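
With the SICR bits named, a driver can select the PSC's serial mode
without magic numbers. A hedged sketch of a SPI master setup; it assumes
'psc' points at an ioremap()ed struct mpc52xx_psc with a sicr register
field, and a real driver would also program clocks and FIFO levels:

/* Sketch: PSC in SPI master mode, clock polarity/phase mode 0. */
out_be32(&psc->sicr, MPC52xx_PSC_SICR_SPI |	/* SPI protocol */
		     MPC52xx_PSC_SICR_MSTR);	/* master mode  */
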
index d18ffe7..dbb8ca1 100644 (file)
@@ -38,6 +38,19 @@ extern void paging_init(void);
                remap_pfn_range(vma, vaddr, pfn, size, prot)
 
 #include <asm-generic/pgtable.h>
+
+/*
+ * This gets called at the end of handling a page fault, when
+ * the kernel has put a new PTE into the page table for the process.
+ * We use it to ensure coherency between the i-cache and d-cache
+ * for the page which has just been mapped in.
+ * On machines which use an MMU hash table, we use this to put a
+ * corresponding HPTE into the hash table ahead of time, instead of
+ * waiting for the inevitable extra hash-table miss exception.
+ */
+extern void update_mmu_cache(struct vm_area_struct *, unsigned long, pte_t);
+
 #endif /* __ASSEMBLY__ */
 
 #endif /* __KERNEL__ */
index 2b8a458..eb8eb40 100644 (file)
@@ -31,6 +31,7 @@ asmlinkage int sys_vfork(unsigned long p1, unsigned long p2,
                unsigned long p3, unsigned long p4, unsigned long p5,
                unsigned long p6, struct pt_regs *regs);
 asmlinkage long sys_pipe(int __user *fildes);
+asmlinkage long sys_pipe2(int __user *fildes, int flags);
 asmlinkage long sys_rt_sigaction(int sig,
                const struct sigaction __user *act,
                struct sigaction __user *oact, size_t sigsetsize);
index ae7085c..e084272 100644 (file)
@@ -316,3 +316,9 @@ COMPAT_SYS(fallocate)
 SYSCALL(subpage_prot)
 COMPAT_SYS_SPU(timerfd_settime)
 COMPAT_SYS_SPU(timerfd_gettime)
+COMPAT_SYS_SPU(signalfd4)
+SYSCALL_SPU(eventfd2)
+SYSCALL_SPU(epoll_create1)
+SYSCALL_SPU(dup3)
+SYSCALL_SPU(pipe2)
+SYSCALL(inotify_init1)
index e6e25e2..d6648c1 100644 (file)
@@ -110,6 +110,8 @@ static inline int debugger_fault_handler(struct pt_regs *regs) { return 0; }
 #endif
 
 extern int set_dabr(unsigned long dabr);
+extern void do_dabr(struct pt_regs *regs, unsigned long address,
+                   unsigned long error_code);
 extern void print_backtrace(unsigned long *);
 extern void show_regs(struct pt_regs * regs);
 extern void flush_instruction_cache(void);
index 5c91081..361cd5c 100644 (file)
@@ -162,16 +162,5 @@ extern void __flush_hash_table_range(struct mm_struct *mm, unsigned long start,
 
 #endif
 
-/*
- * This gets called at the end of handling a page fault, when
- * the kernel has put a new PTE into the page table for the process.
- * We use it to ensure coherency between the i-cache and d-cache
- * for the page which has just been mapped in.
- * On machines which use an MMU hash table, we use this to put a
- * corresponding HPTE into the hash table ahead of time, instead of
- * waiting for the inevitable extra hash-table miss exception.
- */
-extern void update_mmu_cache(struct vm_area_struct *, unsigned long, pte_t);
-
 #endif /*__KERNEL__ */
 #endif /* _ASM_POWERPC_TLBFLUSH_H */
index ce91bb6..e07d0c7 100644 (file)
 #define __NR_subpage_prot      310
 #define __NR_timerfd_settime   311
 #define __NR_timerfd_gettime   312
+#define __NR_signalfd4         313
+#define __NR_eventfd2          314
+#define __NR_epoll_create1     315
+#define __NR_dup3              316
+#define __NR_pipe2             317
+#define __NR_inotify_init1     318
 
 #ifdef __KERNEL__
 
-#define __NR_syscalls          313
+#define __NR_syscalls          319
 
 #define __NR__exit __NR_exit
 #define NR_syscalls    __NR_syscalls
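
With the numbers and table entries wired up, the new calls are reachable
from userspace even before libc grows wrappers. For example, pipe2()
through the raw syscall interface; this assumes headers that define
SYS_pipe2 (__NR_pipe2 is 317 on powerpc per the list above):

#define _GNU_SOURCE
#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/syscall.h>

int main(void)
{
	int fds[2];

	if (syscall(SYS_pipe2, fds, O_CLOEXEC) != 0) {
		perror("pipe2");
		return 1;
	}
	printf("pipe fds: %d, %d (close-on-exec)\n", fds[0], fds[1]);
	return 0;
}
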
index 56512a9..0a290a1 100644 (file)
 #define VIO_IRQ_DISABLE                0UL
 #define VIO_IRQ_ENABLE         1UL
 
+/*
+ * VIO CMO minimum entitlement for all devices and spare entitlement
+ */
+#define VIO_CMO_MIN_ENT 1562624
+
 struct iommu_table;
 
-/*
- * The vio_dev structure is used to describe virtual I/O devices.
+/**
+ * vio_dev - This structure is used to describe virtual I/O devices.
+ *
+ * @desired: set from return of driver's get_desired_dma() function
+ * @entitled: bytes of IO data that has been reserved for this device.
+ * @allocated: bytes of IO data currently in use by the device.
+ * @allocs_failed: number of DMA failures due to insufficient entitlement.
  */
 struct vio_dev {
        const char *name;
        const char *type;
        uint32_t unit_address;
        unsigned int irq;
+       struct {
+               size_t desired;
+               size_t entitled;
+               size_t allocated;
+               atomic_t allocs_failed;
+       } cmo;
        struct device dev;
 };
 
@@ -56,12 +72,19 @@ struct vio_driver {
        const struct vio_device_id *id_table;
        int (*probe)(struct vio_dev *dev, const struct vio_device_id *id);
        int (*remove)(struct vio_dev *dev);
+       /* A driver must have a get_desired_dma() function to
+        * be loaded in a CMO environment if it uses DMA.
+        */
+       unsigned long (*get_desired_dma)(struct vio_dev *dev);
        struct device_driver driver;
 };
 
 extern int vio_register_driver(struct vio_driver *drv);
 extern void vio_unregister_driver(struct vio_driver *drv);
 
+extern int vio_cmo_entitlement_update(size_t);
+extern void vio_cmo_set_dev_desired(struct vio_dev *viodev, size_t desired);
+
 extern void __devinit vio_unregister_device(struct vio_dev *dev);
 
 struct device_node;
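
Putting the pieces together, a CMO-aware VIO driver advertises its I/O
memory needs through the new hook. A minimal skeleton; the "example"
names, including the 64K figure, are hypothetical:

static unsigned long example_get_desired_dma(struct vio_dev *vdev)
{
	/* Worst-case bytes the driver wants DMA-mapped at once. */
	return IOMMU_PAGE_ALIGN(64 * 1024);
}

static struct vio_driver example_driver = {
	.id_table	 = example_device_table,
	.probe		 = example_probe,
	.remove		 = example_remove,
	.get_desired_dma = example_get_desired_dma,	/* required under CMO */
	.driver = {
		.name	= "example",
		.owner	= THIS_MODULE,
	},
};
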
index 0da17d1..d7afa9d 100644 (file)
 
 #define AT_SECURE 23   /* secure mode boolean */
 
+#define AT_BASE_PLATFORM 24    /* string identifying real platform, may
+                                * differ from AT_PLATFORM. */
+
 #define AT_EXECFN  31  /* filename of program */
+
 #ifdef __KERNEL__
-#define AT_VECTOR_SIZE_BASE 17 /* NEW_AUX_ENT entries in auxiliary table */
+#define AT_VECTOR_SIZE_BASE 18 /* NEW_AUX_ENT entries in auxiliary table */
   /* number of "#define AT_.*" above, minus {AT_NULL, AT_IGNORE, AT_NOTELF} */
 #endif