diff --git a/kernel/kexec.c b/kernel/kexec.c
index bfbbd120623c63090d82798140c1a43b4ccc8ed7..296fbc84d659d7d5749353e06d814b579ff50989 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -12,7 +12,7 @@
 #include <linux/slab.h>
 #include <linux/fs.h>
 #include <linux/kexec.h>
-#include <linux/spinlock.h>
+#include <linux/mutex.h>
 #include <linux/list.h>
 #include <linux/highmem.h>
 #include <linux/syscalls.h>
@@ -21,7 +21,7 @@
 #include <linux/hardirq.h>
 #include <linux/elf.h>
 #include <linux/elfcore.h>
-#include <linux/utsrelease.h>
+#include <generated/utsrelease.h>
 #include <linux/utsname.h>
 #include <linux/numa.h>
 #include <linux/suspend.h>
 #include <linux/pm.h>
 #include <linux/cpu.h>
 #include <linux/console.h>
+#include <linux/vmalloc.h>
+#include <linux/swap.h>
+#include <linux/kmsg_dump.h>
+#include <linux/syscore_ops.h>
 
 #include <asm/page.h>
 #include <asm/uaccess.h>
 #include <asm/sections.h>
 
 /* Per cpu memory for storing cpu states in case of system crash. */
-note_buf_t* crash_notes;
+note_buf_t __percpu *crash_notes;
 
 /* vmcoreinfo stuff */
-unsigned char vmcoreinfo_data[VMCOREINFO_BYTES];
+static unsigned char vmcoreinfo_data[VMCOREINFO_BYTES];
 u32 vmcoreinfo_note[VMCOREINFO_NOTE_SIZE/4];
 size_t vmcoreinfo_size;
 size_t vmcoreinfo_max_size = sizeof(vmcoreinfo_data);
@@ -77,7 +81,7 @@ int kexec_should_crash(struct task_struct *p)
  *
  * The code for the transition from the current kernel to the
 * new kernel is placed in the control_code_buffer, whose size
- * is given by KEXEC_CONTROL_CODE_SIZE.  In the best case only a single
+ * is given by KEXEC_CONTROL_PAGE_SIZE.  In the best case only a single
  * page of memory is necessary, but some architectures require more.
  * Because this memory must be identity mapped in the transition from
  * virtual to physical addresses it must live in the range
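
Context for the rename above: KEXEC_CONTROL_PAGE_SIZE is a per-architecture constant, and the allocation sites below feed it through get_order() to size the control-page block. A minimal user-space sketch of get_order()'s rounding behavior, assuming 4 KiB pages (PAGE_SHIFT of 12 is an assumption for illustration):

#include <stdio.h>

#define PAGE_SHIFT 12			/* assumed: 4 KiB pages */
#define PAGE_SIZE  (1UL << PAGE_SHIFT)

/* Mirrors the kernel's get_order(): smallest n such that
 * (PAGE_SIZE << n) >= size.
 */
static int get_order(unsigned long size)
{
        int order = 0;

        size = (size - 1) >> PAGE_SHIFT;
        while (size) {
                order++;
                size >>= 1;
        }
        return order;
}

int main(void)
{
        printf("%d\n", get_order(PAGE_SIZE));		/* 0: one page */
        printf("%d\n", get_order(PAGE_SIZE + 1));	/* 1: rounds up */
        printf("%d\n", get_order(4 * PAGE_SIZE));	/* 2 */
        return 0;
}
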
@@ -141,15 +145,17 @@ static int do_kimage_alloc(struct kimage **rimage, unsigned long entry,
        /* Initialize the list of destination pages */
        INIT_LIST_HEAD(&image->dest_pages);
 
-       /* Initialize the list of unuseable pages */
+       /* Initialize the list of unusable pages */
        INIT_LIST_HEAD(&image->unuseable_pages);
 
        /* Read in the segments */
        image->nr_segments = nr_segments;
        segment_bytes = nr_segments * sizeof(*segments);
        result = copy_from_user(image->segment, segments, segment_bytes);
-       if (result)
+       if (result) {
+               result = -EFAULT;
                goto out;
+       }
 
        /*
         * Verify we have good destination addresses.  The caller is
@@ -158,7 +164,7 @@ static int do_kimage_alloc(struct kimage **rimage, unsigned long entry,
         * just verifies it is an address we can use.
         *
         * Since the kernel does everything in page size chunks ensure
-        * the destination addreses are page aligned.  Too many
+        * the destination addresses are page aligned.  Too many
        * special cases crop up when we don't do this.  The most
         * insidious is getting overlapping destination addresses
         * simply because addresses are changed to page size
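
The result = -EFAULT change above is not cosmetic: copy_from_user() returns the number of bytes it failed to copy (zero on success), never a negative errno, so passing its raw return value up would hand user space a positive byte count as a "result". The two shapes, side by side (dst/src/len are placeholders, not names from this file):

        /* Buggy: on a partial copy, 'result' holds a positive count
         * of uncopied bytes, not an errno.
         */
        result = copy_from_user(dst, src, len);
        if (result)
                goto out;

        /* Correct idiom, as adopted above: collapse any nonzero
         * return to -EFAULT.
         */
        if (copy_from_user(dst, src, len)) {
                result = -EFAULT;
                goto out;
        }
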
@@ -242,7 +248,7 @@ static int kimage_normal_alloc(struct kimage **rimage, unsigned long entry,
         */
        result = -ENOMEM;
        image->control_code_page = kimage_alloc_control_pages(image,
-                                          get_order(KEXEC_CONTROL_CODE_SIZE));
+                                          get_order(KEXEC_CONTROL_PAGE_SIZE));
        if (!image->control_code_page) {
                printk(KERN_ERR "Could not allocate control_code_buffer\n");
                goto out;
@@ -317,7 +323,7 @@ static int kimage_crash_alloc(struct kimage **rimage, unsigned long entry,
         */
        result = -ENOMEM;
        image->control_code_page = kimage_alloc_control_pages(image,
-                                          get_order(KEXEC_CONTROL_CODE_SIZE));
+                                          get_order(KEXEC_CONTROL_PAGE_SIZE));
        if (!image->control_code_page) {
                printk(KERN_ERR "Could not allocate control_code_buffer\n");
                goto out;
@@ -449,7 +455,7 @@ static struct page *kimage_alloc_normal_control_pages(struct kimage *image,
        /* Deal with the destination pages I have inadvertently allocated.
         *
         * Ideally I would convert multi-page allocations into single
-        * page allocations, and add everyting to image->dest_pages.
+        * page allocations, and add everything to image->dest_pages.
         *
         * For now it is simpler to just free the pages.
         */
@@ -597,7 +603,7 @@ static void kimage_free_extra_pages(struct kimage *image)
        /* Walk through and free any extra destination pages I may have */
        kimage_free_page_list(&image->dest_pages);
 
-       /* Walk through and free any unuseable pages I have cached */
+       /* Walk through and free any unusable pages I have cached */
        kimage_free_page_list(&image->unuseable_pages);
 
 }
@@ -753,8 +759,14 @@ static struct page *kimage_alloc_page(struct kimage *image,
                        *old = addr | (*old & ~PAGE_MASK);
 
                        /* The old page I have found cannot be a
-                        * destination page, so return it.
+                        * destination page, so return it if its
+                        * gfp_flags honor the ones passed in.
                         */
+                       if (!(gfp_mask & __GFP_HIGHMEM) &&
+                           PageHighMem(old_page)) {
+                               kimage_free_pages(old_page);
+                               continue;
+                       }
                        addr = old_addr;
                        page = old_page;
                        break;
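
The added guard closes a subtle hole in the page-recycling path: a page pulled back off the destination list may live in highmem, and a caller that did not pass __GFP_HIGHMEM expects a page it can address through page_address() without kmap(). Rather than return such a page, the code frees it and keeps searching; annotated:

        /* gfp_mask lacks __GFP_HIGHMEM => the caller needs a lowmem
         * page; a recycled highmem page would violate that, so free
         * it and continue the search.
         */
        if (!(gfp_mask & __GFP_HIGHMEM) && PageHighMem(old_page)) {
                kimage_free_pages(old_page);
                continue;
        }
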
@@ -805,7 +817,7 @@ static int kimage_load_normal_segment(struct kimage *image,
 
                ptr = kmap(page);
                /* Start with a clear page */
-               memset(ptr, 0, PAGE_SIZE);
+               clear_page(ptr);
                ptr += maddr & ~PAGE_MASK;
                mchunk = PAGE_SIZE - (maddr & ~PAGE_MASK);
                if (mchunk > mbytes)
@@ -818,7 +830,7 @@ static int kimage_load_normal_segment(struct kimage *image,
                result = copy_from_user(ptr, buf, uchunk);
                kunmap(page);
                if (result) {
-                       result = (result < 0) ? result : -EIO;
+                       result = -EFAULT;
                        goto out;
                }
                ubytes -= uchunk;
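
Two details of this copy loop are worth calling out: clear_page() is the arch-optimized zeroing path but is only valid for one whole, page-aligned page (memset() remains the tool for arbitrary lengths, which is why only the full-page clear was converted), and the kmap()/kunmap() bracket keeps the loop safe for highmem pages. The resulting shape, sketched with the function's own locals:

        ptr = kmap(page);			/* highmem-safe mapping */
        clear_page(ptr);			/* whole page only */
        ptr += maddr & ~PAGE_MASK;		/* offset into the page */
        if (copy_from_user(ptr, buf, uchunk))
                result = -EFAULT;		/* see note above */
        kunmap(page);
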
@@ -873,7 +885,7 @@ static int kimage_load_crash_segment(struct kimage *image,
                kexec_flush_icache_page(page);
                kunmap(page);
                if (result) {
-                       result = (result < 0) ? result : -EIO;
+                       result = -EFAULT;
                        goto out;
                }
                ubytes -= uchunk;
@@ -924,19 +936,13 @@ static int kimage_load_segment(struct kimage *image,
  */
 struct kimage *kexec_image;
 struct kimage *kexec_crash_image;
-/*
- * A home grown binary mutex.
- * Nothing can wait so this mutex is safe to use
- * in interrupt context :)
- */
-static int kexec_lock;
 
-asmlinkage long sys_kexec_load(unsigned long entry, unsigned long nr_segments,
-                               struct kexec_segment __user *segments,
-                               unsigned long flags)
+static DEFINE_MUTEX(kexec_mutex);
+
+SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments,
+               struct kexec_segment __user *, segments, unsigned long, flags)
 {
        struct kimage **dest_image, *image;
-       int locked;
        int result;
 
        /* We only trust the superuser with rebooting the system. */
@@ -972,8 +978,7 @@ asmlinkage long sys_kexec_load(unsigned long entry, unsigned long nr_segments,
         *
         * KISS: always take the mutex.
         */
-       locked = xchg(&kexec_lock, 1);
-       if (locked)
+       if (!mutex_trylock(&kexec_mutex))
                return -EBUSY;
 
        dest_image = &kexec_image;
@@ -1015,8 +1020,7 @@ asmlinkage long sys_kexec_load(unsigned long entry, unsigned long nr_segments,
        image = xchg(dest_image, image);
 
 out:
-       locked = xchg(&kexec_lock, 0); /* Release the mutex */
-       BUG_ON(!locked);
+       mutex_unlock(&kexec_mutex);
        kimage_free(image);
 
        return result;
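
Two changes land together in this hunk. The hand-rolled xchg() flag becomes a real mutex, but taken with mutex_trylock() so the call still fails fast with -EBUSY instead of sleeping; that matters because crash_kexec() below attempts the same lock from panic context, where blocking is not an option. And the open-coded asmlinkage prototype becomes SYSCALL_DEFINE4(), which expands to roughly the old signature (the real macro also emits trace metadata and per-architecture wrappers):

/* Approximate expansion of SYSCALL_DEFINE4(kexec_load, ...): */
asmlinkage long sys_kexec_load(unsigned long entry,
                               unsigned long nr_segments,
                               struct kexec_segment __user *segments,
                               unsigned long flags);
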
@@ -1063,10 +1067,7 @@ asmlinkage long compat_sys_kexec_load(unsigned long entry,
 
 void crash_kexec(struct pt_regs *regs)
 {
-       int locked;
-
-
-       /* Take the kexec_lock here to prevent sys_kexec_load
+       /* Take the kexec_mutex here to prevent sys_kexec_load
         * running on one cpu from replacing the crash kernel
         * we are using after a panic on a different cpu.
         *
@@ -1074,20 +1075,79 @@ void crash_kexec(struct pt_regs *regs)
         * of memory the xchg(&kexec_crash_image) would be
         * sufficient.  But since I reuse the memory...
         */
-       locked = xchg(&kexec_lock, 1);
-       if (!locked) {
+       if (mutex_trylock(&kexec_mutex)) {
                if (kexec_crash_image) {
                        struct pt_regs fixed_regs;
+
+                       kmsg_dump(KMSG_DUMP_KEXEC);
+
                        crash_setup_regs(&fixed_regs, regs);
                        crash_save_vmcoreinfo();
                        machine_crash_shutdown(&fixed_regs);
                        machine_kexec(kexec_crash_image);
                }
-               locked = xchg(&kexec_lock, 0);
-               BUG_ON(!locked);
+               mutex_unlock(&kexec_mutex);
        }
 }
 
+size_t crash_get_memory_size(void)
+{
+       size_t size = 0;
+       mutex_lock(&kexec_mutex);
+       if (crashk_res.end != crashk_res.start)
+               size = resource_size(&crashk_res);
+       mutex_unlock(&kexec_mutex);
+       return size;
+}
+
+void __weak crash_free_reserved_phys_range(unsigned long begin,
+                                          unsigned long end)
+{
+       unsigned long addr;
+
+       for (addr = begin; addr < end; addr += PAGE_SIZE) {
+               ClearPageReserved(pfn_to_page(addr >> PAGE_SHIFT));
+               init_page_count(pfn_to_page(addr >> PAGE_SHIFT));
+               free_page((unsigned long)__va(addr));
+               totalram_pages++;
+       }
+}
+
+int crash_shrink_memory(unsigned long new_size)
+{
+       int ret = 0;
+       unsigned long start, end;
+
+       mutex_lock(&kexec_mutex);
+
+       if (kexec_crash_image) {
+               ret = -ENOENT;
+               goto unlock;
+       }
+       start = crashk_res.start;
+       end = crashk_res.end;
+
+       if (new_size >= end - start + 1) {
+               ret = -EINVAL;
+               if (new_size == end - start + 1)
+                       ret = 0;
+               goto unlock;
+       }
+
+       start = roundup(start, PAGE_SIZE);
+       end = roundup(start + new_size, PAGE_SIZE);
+
+       crash_free_reserved_phys_range(end, crashk_res.end);
+
+       if ((start == end) && (crashk_res.parent != NULL))
+               release_resource(&crashk_res);
+       crashk_res.end = end - 1;
+
+unlock:
+       mutex_unlock(&kexec_mutex);
+       return ret;
+}
+
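
crash_get_memory_size() and crash_shrink_memory() are new here; they back the /sys/kernel/kexec_crash_size interface, letting an administrator return unused crash-reservation memory to the system at runtime. Note the __weak default for crash_free_reserved_phys_range(), which an architecture can override with a strong definition, and the inclusive-end convention of struct resource that drives the +1/-1 arithmetic. A user-space sketch of that arithmetic, with an assumed 4 KiB page and made-up addresses:

#include <stdio.h>

#define PAGE_SIZE 4096UL		/* assumed for illustration */
#define roundup(x, y) ((((x) + (y) - 1) / (y)) * (y))

int main(void)
{
        /* Hypothetical 32 MiB reservation, shrunk to 16 MiB. */
        unsigned long start = 0x1000000, end = 0x2ffffff;
        unsigned long new_size = 16UL << 20;

        printf("old size: %lu\n", end - start + 1);	/* inclusive end */
        start = roundup(start, PAGE_SIZE);
        end = roundup(start + new_size, PAGE_SIZE);	/* exclusive end */
        printf("new crashk_res.end: %#lx\n", end - 1);	/* inclusive again */
        return 0;
}
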
 static u32 *append_elf_note(u32 *buf, char *name, unsigned type, void *data,
                            size_t data_len)
 {
@@ -1121,7 +1181,7 @@ void crash_save_cpu(struct pt_regs *regs, int cpu)
        struct elf_prstatus prstatus;
        u32 *buf;
 
-       if ((cpu < 0) || (cpu >= NR_CPUS))
+       if ((cpu < 0) || (cpu >= nr_cpu_ids))
                return;
 
        /* Using ELF notes here is opportunistic.
@@ -1136,7 +1196,7 @@ void crash_save_cpu(struct pt_regs *regs, int cpu)
                return;
        memset(&prstatus, 0, sizeof(prstatus));
        prstatus.pr_pid = current->pid;
-       elf_core_copy_regs(&prstatus.pr_reg, regs);
+       elf_core_copy_kernel_regs(&prstatus.pr_reg, regs);
        buf = append_elf_note(buf, KEXEC_CORE_NOTE_NAME, NT_PRSTATUS,
                              &prstatus, sizeof(prstatus));
        final_note(buf);
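
Two notes on this hunk: the bound check moves from the compile-time NR_CPUS to nr_cpu_ids, the number of CPU ids actually possible on the running system, and the register snapshot goes through the kernel-regs copy helper. The per-cpu note buffer itself is laid out by append_elf_note() as a standard ELF note: a namesz/descsz/type header followed by the name and payload, each padded to a 4-byte word, with final_note() terminating the list with an all-zero header. A user-space sketch of that layout (hypothetical code, mirroring the kernel helper):

#include <stdint.h>
#include <string.h>

static uint32_t *append_elf_note(uint32_t *buf, const char *name,
                                 uint32_t type, const void *data,
                                 size_t data_len)
{
        size_t name_len = strlen(name) + 1;

        *buf++ = (uint32_t)name_len;	/* n_namesz */
        *buf++ = (uint32_t)data_len;	/* n_descsz */
        *buf++ = type;			/* n_type, e.g. NT_PRSTATUS */
        memcpy(buf, name, name_len);
        buf += (name_len + 3) / 4;	/* pad name to a 4-byte word */
        memcpy(buf, data, data_len);
        buf += (data_len + 3) / 4;	/* pad payload likewise */
        return buf;
}
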
@@ -1234,7 +1294,7 @@ static int __init parse_crashkernel_mem(char                      *cmdline,
        } while (*cur++ == ',');
 
        if (*crash_size > 0) {
-               while (*cur != ' ' && *cur != '@')
+               while (*cur && *cur != ' ' && *cur != '@')
                        cur++;
                if (*cur == '@') {
                        cur++;
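
The added *cur test is a bounds fix: for a command line that ends right after the size, e.g. crashkernel=512M-2G:64M with no @offset and no trailing space, the old loop scanned past the terminating NUL. The guarded form stops at end-of-string as well as at the delimiters:

        /* stop at NUL, space, or '@', never off the end */
        while (*cur && *cur != ' ' && *cur != '@')
                cur++;
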
@@ -1377,6 +1437,7 @@ static int __init crash_save_vmcoreinfo_init(void)
        VMCOREINFO_SYMBOL(node_online_map);
        VMCOREINFO_SYMBOL(swapper_pg_dir);
        VMCOREINFO_SYMBOL(_stext);
+       VMCOREINFO_SYMBOL(vmlist);
 
 #ifndef CONFIG_NEED_MULTIPLE_NODES
        VMCOREINFO_SYMBOL(mem_map);
@@ -1412,7 +1473,9 @@ static int __init crash_save_vmcoreinfo_init(void)
        VMCOREINFO_OFFSET(free_area, free_list);
        VMCOREINFO_OFFSET(list_head, next);
        VMCOREINFO_OFFSET(list_head, prev);
+       VMCOREINFO_OFFSET(vm_struct, addr);
        VMCOREINFO_LENGTH(zone.free_area, MAX_ORDER);
+       log_buf_kexec_setup();
        VMCOREINFO_LENGTH(free_area.free_list, MIGRATE_TYPES);
        VMCOREINFO_NUMBER(NR_FREE_PAGES);
        VMCOREINFO_NUMBER(PG_lru);
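
Each VMCOREINFO_* invocation above appends a KEY=value line to the vmcoreinfo ELF note, which dump tools such as makedumpfile read at crash time instead of hard-coding kernel layout; exporting vmlist and vm_struct.addr here is what lets them translate vmalloc addresses. The macros boil down to vmcoreinfo_append_str() calls, roughly (see include/linux/kexec.h):

#define VMCOREINFO_SYMBOL(name) \
        vmcoreinfo_append_str("SYMBOL(%s)=%lx\n", #name, \
                              (unsigned long)&name)
#define VMCOREINFO_OFFSET(name, field) \
        vmcoreinfo_append_str("OFFSET(%s.%s)=%lu\n", #name, #field, \
                              (unsigned long)offsetof(struct name, field))
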
@@ -1434,7 +1497,7 @@ int kernel_kexec(void)
 {
        int error = 0;
 
-       if (xchg(&kexec_lock, 1))
+       if (!mutex_trylock(&kexec_mutex))
                return -EBUSY;
        if (!kexec_image) {
                error = -EINVAL;
@@ -1451,32 +1514,30 @@ int kernel_kexec(void)
                        goto Restore_console;
                }
                suspend_console();
-               error = device_suspend(PMSG_FREEZE);
+               error = dpm_suspend_start(PMSG_FREEZE);
                if (error)
                        goto Resume_console;
-               error = disable_nonboot_cpus();
-               if (error)
-                       goto Resume_devices;
-               local_irq_disable();
-               /* At this point, device_suspend() has been called,
-                * but *not* device_power_down(). We *must*
-                * device_power_down() now.  Otherwise, drivers for
+               /* At this point, dpm_suspend_start() has been called,
+                * but *not* dpm_suspend_noirq(). We *must* call
+                * dpm_suspend_noirq() now.  Otherwise, drivers for
                 * some devices (e.g. interrupt controllers) become
                 * desynchronized with the actual state of the
                 * hardware at resume time, and evil weirdness ensues.
                 */
-               error = device_power_down(PMSG_FREEZE);
+               error = dpm_suspend_noirq(PMSG_FREEZE);
+               if (error)
+                       goto Resume_devices;
+               error = disable_nonboot_cpus();
+               if (error)
+                       goto Enable_cpus;
+               local_irq_disable();
+               error = syscore_suspend();
                if (error)
                        goto Enable_irqs;
-               save_processor_state();
        } else
 #endif
        {
-               blocking_notifier_call_chain(&reboot_notifier_list,
-                                            SYS_RESTART, NULL);
-               system_state = SYSTEM_RESTART;
-               device_shutdown();
-               sysdev_shutdown();
+               kernel_restart_prepare(NULL);
                printk(KERN_EMERG "Starting new kernel\n");
                machine_shutdown();
        }
@@ -1485,13 +1546,14 @@ int kernel_kexec(void)
 
 #ifdef CONFIG_KEXEC_JUMP
        if (kexec_image->preserve_context) {
-               restore_processor_state();
-               device_power_up(PMSG_RESTORE);
+               syscore_resume();
  Enable_irqs:
                local_irq_enable();
+ Enable_cpus:
                enable_nonboot_cpus();
+               dpm_resume_noirq(PMSG_RESTORE);
  Resume_devices:
-               device_resume(PMSG_RESTORE);
+               dpm_resume_end(PMSG_RESTORE);
  Resume_console:
                resume_console();
                thaw_processes();
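
The reordering in the KEXEC_JUMP path is easiest to audit as matched pairs: each suspend-side step now has its resume-side inverse, reached through the error labels and unwinding in exact reverse order. Summarized:

/* suspend side                      resume side / error unwind
 * ---------------------------      ------------------------------
 * dpm_suspend_start(PMSG_FREEZE)   dpm_resume_end(PMSG_RESTORE)
 * dpm_suspend_noirq(PMSG_FREEZE)   dpm_resume_noirq(PMSG_RESTORE)
 * disable_nonboot_cpus()           enable_nonboot_cpus()
 * local_irq_disable()              local_irq_enable()
 * syscore_suspend()                syscore_resume()
 */
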
@@ -1502,8 +1564,6 @@ int kernel_kexec(void)
 #endif
 
  Unlock:
-       if (!xchg(&kexec_lock, 0))
-               BUG();
-
+       mutex_unlock(&kexec_mutex);
        return error;
 }