]> nv-tegra.nvidia Code Review - linux-2.6.git/blobdiff - kernel/sysctl.c
mm: add /proc controls for pdflush threads
[linux-2.6.git] / kernel / sysctl.c
index fd3364827ccf0a838c79fe240016c93cedd508e4..72eb1a41dcabef2a7d46ab7199cae10edcc3aa5c 100644 (file)
 #include <linux/writeback.h>
 #include <linux/hugetlb.h>
 #include <linux/initrd.h>
+#include <linux/key.h>
 #include <linux/times.h>
 #include <linux/limits.h>
 #include <linux/dcache.h>
 #include <linux/syscalls.h>
+#include <linux/vmstat.h>
 #include <linux/nfs_fs.h>
 #include <linux/acpi.h>
 #include <linux/reboot.h>
+#include <linux/ftrace.h>
+#include <linux/slow-work.h>
 
 #include <asm/uaccess.h>
 #include <asm/processor.h>
@@ -77,25 +81,27 @@ extern int pid_max_min, pid_max_max;
 extern int sysctl_drop_caches;
 extern int percpu_pagelist_fraction;
 extern int compat_log;
-extern int maps_protect;
-extern int sysctl_stat_interval;
 extern int latencytop_enabled;
-
-/* Constants used for minimum and  maximum */
-#if defined(CONFIG_DETECT_SOFTLOCKUP) || defined(CONFIG_HIGHMEM)
-static int one = 1;
+extern int sysctl_nr_open_min, sysctl_nr_open_max;
+#ifndef CONFIG_MMU
+extern int sysctl_nr_trim_pages;
 #endif
+#ifdef CONFIG_RCU_TORTURE_TEST
+extern int rcutorture_runnable;
+#endif /* #ifdef CONFIG_RCU_TORTURE_TEST */
 
+/* Constants used for minimum and  maximum */
 #ifdef CONFIG_DETECT_SOFTLOCKUP
 static int sixty = 60;
-#endif
-
-#ifdef CONFIG_MMU
-static int two = 2;
+static int neg_one = -1;
 #endif
 
 static int zero;
+static int __maybe_unused one = 1;
+static int __maybe_unused two = 2;
+static unsigned long one_ul = 1;
 static int one_hundred = 100;
+static int one_thousand = 1000;
 
 /* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */
 static int maxolduid = 65535;
@@ -104,17 +110,19 @@ static int min_percpu_pagelist_fract = 8;
 
 static int ngroups_max = NGROUPS_MAX;
 
-#ifdef CONFIG_KMOD
+#ifdef CONFIG_MODULES
 extern char modprobe_path[];
 #endif
 #ifdef CONFIG_CHR_DEV_SG
 extern int sg_big_buff;
 #endif
 
-#ifdef __sparc__
-extern char reboot_command [];
-extern int stop_a_enabled;
-extern int scons_pwroff;
+#ifdef CONFIG_SPARC
+#include <asm/system.h>
+#endif
+
+#ifdef CONFIG_SPARC64
+extern int sysctl_tsb_ratio;
 #endif
 
 #ifdef __hppa__
@@ -130,43 +138,38 @@ extern int sysctl_userprocess_debug;
 extern int spin_retry;
 #endif
 
-extern int sysctl_hz_timer;
-
 #ifdef CONFIG_BSD_PROCESS_ACCT
 extern int acct_parm[];
 #endif
 
 #ifdef CONFIG_IA64
 extern int no_unaligned_warning;
+extern int unaligned_dump_stack;
 #endif
 
 #ifdef CONFIG_RT_MUTEXES
 extern int max_lock_depth;
 #endif
 
-#ifdef CONFIG_SYSCTL_SYSCALL
-static int parse_table(int __user *, int, void __user *, size_t __user *,
-               void __user *, size_t, struct ctl_table *);
-#endif
-
-
 #ifdef CONFIG_PROC_SYSCTL
 static int proc_do_cad_pid(struct ctl_table *table, int write, struct file *filp,
                  void __user *buffer, size_t *lenp, loff_t *ppos);
-static int proc_dointvec_taint(struct ctl_table *table, int write, struct file *filp,
+static int proc_taint(struct ctl_table *table, int write, struct file *filp,
                               void __user *buffer, size_t *lenp, loff_t *ppos);
 #endif
 
 static struct ctl_table root_table[];
 static struct ctl_table_root sysctl_table_root;
 static struct ctl_table_header root_table_header = {
+       .count = 1,
        .ctl_table = root_table,
-       .ctl_entry = LIST_HEAD_INIT(sysctl_table_root.header_list),
+       .ctl_entry = LIST_HEAD_INIT(sysctl_table_root.default_set.list),
        .root = &sysctl_table_root,
+       .set = &sysctl_table_root.default_set,
 };
 static struct ctl_table_root sysctl_table_root = {
        .root_list = LIST_HEAD_INIT(sysctl_table_root.root_list),
-       .header_list = LIST_HEAD_INIT(root_table_header.ctl_entry),
+       .default_set.list = LIST_HEAD_INIT(root_table_header.ctl_entry),
 };
 
 static struct ctl_table kern_table[];
@@ -178,6 +181,9 @@ extern struct ctl_table random_table[];
 #ifdef CONFIG_INOTIFY_USER
 extern struct ctl_table inotify_table[];
 #endif
+#ifdef CONFIG_EPOLL
+extern struct ctl_table epoll_table[];
+#endif
 
 #ifdef HAVE_ARCH_PICK_MMAP_LAYOUT
 int sysctl_legacy_va_layout;
@@ -268,6 +274,24 @@ static struct ctl_table kern_table[] = {
                .extra1         = &min_wakeup_granularity_ns,
                .extra2         = &max_wakeup_granularity_ns,
        },
+       {
+               .ctl_name       = CTL_UNNUMBERED,
+               .procname       = "sched_shares_ratelimit",
+               .data           = &sysctl_sched_shares_ratelimit,
+               .maxlen         = sizeof(unsigned int),
+               .mode           = 0644,
+               .proc_handler   = &proc_dointvec,
+       },
+       {
+               .ctl_name       = CTL_UNNUMBERED,
+               .procname       = "sched_shares_thresh",
+               .data           = &sysctl_sched_shares_thresh,
+               .maxlen         = sizeof(unsigned int),
+               .mode           = 0644,
+               .proc_handler   = &proc_dointvec_minmax,
+               .strategy       = &sysctl_intvec,
+               .extra1         = &zero,
+       },
        {
                .ctl_name       = CTL_UNNUMBERED,
                .procname       = "sched_child_runs_first",
@@ -373,10 +397,9 @@ static struct ctl_table kern_table[] = {
 #ifdef CONFIG_PROC_SYSCTL
        {
                .procname       = "tainted",
-               .data           = &tainted,
-               .maxlen         = sizeof(int),
+               .maxlen         = sizeof(long),
                .mode           = 0644,
-               .proc_handler   = &proc_dointvec_taint,
+               .proc_handler   = &proc_taint,
        },
 #endif
 #ifdef CONFIG_LATENCYTOP
@@ -406,7 +429,7 @@ static struct ctl_table kern_table[] = {
                .mode           = 0644,
                .proc_handler   = &proc_dointvec,
        },
-#ifdef __sparc__
+#ifdef CONFIG_SPARC
        {
                .ctl_name       = KERN_SPARC_REBOOT,
                .procname       = "reboot-cmd",
@@ -433,6 +456,16 @@ static struct ctl_table kern_table[] = {
                .proc_handler   = &proc_dointvec,
        },
 #endif
+#ifdef CONFIG_SPARC64
+       {
+               .ctl_name       = CTL_UNNUMBERED,
+               .procname       = "tsb-ratio",
+               .data           = &sysctl_tsb_ratio,
+               .maxlen         = sizeof (int),
+               .mode           = 0644,
+               .proc_handler   = &proc_dointvec,
+       },
+#endif
 #ifdef __hppa__
        {
                .ctl_name       = KERN_HPPA_PWRSW,
@@ -459,7 +492,37 @@ static struct ctl_table kern_table[] = {
                .mode           = 0644,
                .proc_handler   = &proc_dointvec,
        },
-#ifdef CONFIG_KMOD
+#ifdef CONFIG_FUNCTION_TRACER
+       {
+               .ctl_name       = CTL_UNNUMBERED,
+               .procname       = "ftrace_enabled",
+               .data           = &ftrace_enabled,
+               .maxlen         = sizeof(int),
+               .mode           = 0644,
+               .proc_handler   = &ftrace_enable_sysctl,
+       },
+#endif
+#ifdef CONFIG_STACK_TRACER
+       {
+               .ctl_name       = CTL_UNNUMBERED,
+               .procname       = "stack_tracer_enabled",
+               .data           = &stack_tracer_enabled,
+               .maxlen         = sizeof(int),
+               .mode           = 0644,
+               .proc_handler   = &stack_trace_sysctl,
+       },
+#endif
+#ifdef CONFIG_TRACING
+       {
+               .ctl_name       = CTL_UNNUMBERED,
+               .procname       = "ftrace_dump_on_oops",
+               .data           = &ftrace_dump_on_oops,
+               .maxlen         = sizeof(int),
+               .mode           = 0644,
+               .proc_handler   = &proc_dointvec,
+       },
+#endif
+#ifdef CONFIG_MODULES
        {
                .ctl_name       = KERN_MODPROBE,
                .procname       = "modprobe",
@@ -566,16 +629,6 @@ static struct ctl_table kern_table[] = {
                .mode           = 0644,
                .proc_handler   = &proc_dointvec,
        },
-#endif
-#ifdef CONFIG_NO_IDLE_HZ
-       {
-               .ctl_name       = KERN_HZ_TIMER,
-               .procname       = "hz_timer",
-               .data           = &sysctl_hz_timer,
-               .maxlen         = sizeof(int),
-               .mode           = 0644,
-               .proc_handler   = &proc_dointvec,
-       },
 #endif
        {
                .ctl_name       = KERN_S390_USER_DEBUG_LOGGING,
@@ -617,7 +670,7 @@ static struct ctl_table kern_table[] = {
        {
                .ctl_name       = KERN_PRINTK_RATELIMIT,
                .procname       = "printk_ratelimit",
-               .data           = &printk_ratelimit_jiffies,
+               .data           = &printk_ratelimit_state.interval,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = &proc_dointvec_jiffies,
@@ -626,7 +679,7 @@ static struct ctl_table kern_table[] = {
        {
                .ctl_name       = KERN_PRINTK_RATELIMIT_BURST,
                .procname       = "printk_ratelimit_burst",
-               .data           = &printk_ratelimit_burst,
+               .data           = &printk_ratelimit_state.burst,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = &proc_dointvec,
@@ -729,17 +782,36 @@ static struct ctl_table kern_table[] = {
                .mode           = 0644,
                .proc_handler   = &proc_dointvec,
        },
+       {
+               .ctl_name       = CTL_UNNUMBERED,
+               .procname       = "unaligned-dump-stack",
+               .data           = &unaligned_dump_stack,
+               .maxlen         = sizeof (int),
+               .mode           = 0644,
+               .proc_handler   = &proc_dointvec,
+       },
 #endif
 #ifdef CONFIG_DETECT_SOFTLOCKUP
+       {
+               .ctl_name       = CTL_UNNUMBERED,
+               .procname       = "softlockup_panic",
+               .data           = &softlockup_panic,
+               .maxlen         = sizeof(int),
+               .mode           = 0644,
+               .proc_handler   = &proc_dointvec_minmax,
+               .strategy       = &sysctl_intvec,
+               .extra1         = &zero,
+               .extra2         = &one,
+       },
        {
                .ctl_name       = CTL_UNNUMBERED,
                .procname       = "softlockup_thresh",
                .data           = &softlockup_thresh,
-               .maxlen         = sizeof(unsigned long),
+               .maxlen         = sizeof(int),
                .mode           = 0644,
-               .proc_handler   = &proc_doulongvec_minmax,
+               .proc_handler   = &proc_dosoftlockup_thresh,
                .strategy       = &sysctl_intvec,
-               .extra1         = &one,
+               .extra1         = &neg_one,
                .extra2         = &sixty,
        },
        {
@@ -790,25 +862,51 @@ static struct ctl_table kern_table[] = {
                .proc_handler   = &proc_dointvec,
        },
 #endif
-#ifdef CONFIG_PROC_FS
+       {
+               .ctl_name       = CTL_UNNUMBERED,
+               .procname       = "poweroff_cmd",
+               .data           = &poweroff_cmd,
+               .maxlen         = POWEROFF_CMD_PATH_LEN,
+               .mode           = 0644,
+               .proc_handler   = &proc_dostring,
+               .strategy       = &sysctl_string,
+       },
+#ifdef CONFIG_KEYS
+       {
+               .ctl_name       = CTL_UNNUMBERED,
+               .procname       = "keys",
+               .mode           = 0555,
+               .child          = key_sysctls,
+       },
+#endif
+#ifdef CONFIG_RCU_TORTURE_TEST
        {
                .ctl_name       = CTL_UNNUMBERED,
-               .procname       = "maps_protect",
-               .data           = &maps_protect,
+               .procname       = "rcutorture_runnable",
+               .data           = &rcutorture_runnable,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = &proc_dointvec,
        },
 #endif
+#ifdef CONFIG_UNEVICTABLE_LRU
        {
                .ctl_name       = CTL_UNNUMBERED,
-               .procname       = "poweroff_cmd",
-               .data           = &poweroff_cmd,
-               .maxlen         = POWEROFF_CMD_PATH_LEN,
+               .procname       = "scan_unevictable_pages",
+               .data           = &scan_unevictable_pages,
+               .maxlen         = sizeof(scan_unevictable_pages),
                .mode           = 0644,
-               .proc_handler   = &proc_dostring,
-               .strategy       = &sysctl_string,
+               .proc_handler   = &scan_unevictable_handler,
        },
+#endif
+#ifdef CONFIG_SLOW_WORK
+       {
+               .ctl_name       = CTL_UNNUMBERED,
+               .procname       = "slow-work",
+               .mode           = 0555,
+               .child          = slow_work_sysctls,
+       },
+#endif
 /*
  * NOTE: do not add new entries to this table unless you have read
  * Documentation/sysctl/ctl_unnumbered.txt
@@ -871,11 +969,21 @@ static struct ctl_table vm_table[] = {
                .data           = &dirty_background_ratio,
                .maxlen         = sizeof(dirty_background_ratio),
                .mode           = 0644,
-               .proc_handler   = &proc_dointvec_minmax,
+               .proc_handler   = &dirty_background_ratio_handler,
                .strategy       = &sysctl_intvec,
                .extra1         = &zero,
                .extra2         = &one_hundred,
        },
+       {
+               .ctl_name       = CTL_UNNUMBERED,
+               .procname       = "dirty_background_bytes",
+               .data           = &dirty_background_bytes,
+               .maxlen         = sizeof(dirty_background_bytes),
+               .mode           = 0644,
+               .proc_handler   = &dirty_background_bytes_handler,
+               .strategy       = &sysctl_intvec,
+               .extra1         = &one_ul,
+       },
        {
                .ctl_name       = VM_DIRTY_RATIO,
                .procname       = "dirty_ratio",
@@ -887,6 +995,16 @@ static struct ctl_table vm_table[] = {
                .extra1         = &zero,
                .extra2         = &one_hundred,
        },
+       {
+               .ctl_name       = CTL_UNNUMBERED,
+               .procname       = "dirty_bytes",
+               .data           = &vm_dirty_bytes,
+               .maxlen         = sizeof(vm_dirty_bytes),
+               .mode           = 0644,
+               .proc_handler   = &dirty_bytes_handler,
+               .strategy       = &sysctl_intvec,
+               .extra1         = &one_ul,
+       },
        {
                .procname       = "dirty_writeback_centisecs",
                .data           = &dirty_writeback_interval,
@@ -899,7 +1017,7 @@ static struct ctl_table vm_table[] = {
                .data           = &dirty_expire_interval,
                .maxlen         = sizeof(dirty_expire_interval),
                .mode           = 0644,
-               .proc_handler   = &proc_dointvec_userhz_jiffies,
+               .proc_handler   = &proc_dointvec,
        },
        {
                .ctl_name       = VM_NR_PDFLUSH_THREADS,
@@ -909,6 +1027,28 @@ static struct ctl_table vm_table[] = {
                .mode           = 0444 /* read-only*/,
                .proc_handler   = &proc_dointvec,
        },
+       {
+               .ctl_name       = CTL_UNNUMBERED,
+               .procname       = "nr_pdflush_threads_min",
+               .data           = &nr_pdflush_threads_min,
+               .maxlen         = sizeof nr_pdflush_threads_min,
+               .mode           = 0644 /* read-write */,
+               .proc_handler   = &proc_dointvec_minmax,
+               .strategy       = &sysctl_intvec,
+               .extra1         = &one,
+               .extra2         = &nr_pdflush_threads_max,
+       },
+       {
+               .ctl_name       = CTL_UNNUMBERED,
+               .procname       = "nr_pdflush_threads_max",
+               .data           = &nr_pdflush_threads_max,
+               .maxlen         = sizeof nr_pdflush_threads_max,
+               .mode           = 0644 /* read-write */,
+               .proc_handler   = &proc_dointvec_minmax,
+               .strategy       = &sysctl_intvec,
+               .extra1         = &nr_pdflush_threads_min,
+               .extra2         = &one_thousand,
+       },
        {
                .ctl_name       = VM_SWAPPINESS,
                .procname       = "swappiness",
@@ -923,7 +1063,7 @@ static struct ctl_table vm_table[] = {
 #ifdef CONFIG_HUGETLB_PAGE
         {
                .procname       = "nr_hugepages",
-               .data           = &max_huge_pages,
+               .data           = NULL,
                .maxlen         = sizeof(unsigned long),
                .mode           = 0644,
                .proc_handler   = &hugetlb_sysctl_handler,
@@ -949,10 +1089,12 @@ static struct ctl_table vm_table[] = {
        {
                .ctl_name       = CTL_UNNUMBERED,
                .procname       = "nr_overcommit_hugepages",
-               .data           = &sysctl_overcommit_huge_pages,
-               .maxlen         = sizeof(sysctl_overcommit_huge_pages),
+               .data           = NULL,
+               .maxlen         = sizeof(unsigned long),
                .mode           = 0644,
                .proc_handler   = &hugetlb_overcommit_handler,
+               .extra1         = (void *)&hugetlb_zero,
+               .extra2         = (void *)&hugetlb_infinity,
        },
 #endif
        {
@@ -1002,6 +1144,17 @@ static struct ctl_table vm_table[] = {
                .mode           = 0644,
                .proc_handler   = &proc_dointvec
        },
+#else
+       {
+               .ctl_name       = CTL_UNNUMBERED,
+               .procname       = "nr_trim_pages",
+               .data           = &sysctl_nr_trim_pages,
+               .maxlen         = sizeof(sysctl_nr_trim_pages),
+               .mode           = 0644,
+               .proc_handler   = &proc_dointvec_minmax,
+               .strategy       = &sysctl_intvec,
+               .extra1         = &zero,
+       },
 #endif
        {
                .ctl_name       = VM_LAPTOP_MODE,
@@ -1187,7 +1340,9 @@ static struct ctl_table fs_table[] = {
                .data           = &sysctl_nr_open,
                .maxlen         = sizeof(int),
                .mode           = 0644,
-               .proc_handler   = &proc_dointvec,
+               .proc_handler   = &proc_dointvec_minmax,
+               .extra1         = &sysctl_nr_open_min,
+               .extra2         = &sysctl_nr_open_max,
        },
        {
                .ctl_name       = FS_DENTRY,
@@ -1219,6 +1374,7 @@ static struct ctl_table fs_table[] = {
                .extra1         = &minolduid,
                .extra2         = &maxolduid,
        },
+#ifdef CONFIG_FILE_LOCKING
        {
                .ctl_name       = FS_LEASES,
                .procname       = "leases-enable",
@@ -1227,6 +1383,7 @@ static struct ctl_table fs_table[] = {
                .mode           = 0644,
                .proc_handler   = &proc_dointvec,
        },
+#endif
 #ifdef CONFIG_DNOTIFY
        {
                .ctl_name       = FS_DIR_NOTIFY,
@@ -1238,17 +1395,17 @@ static struct ctl_table fs_table[] = {
        },
 #endif
 #ifdef CONFIG_MMU
+#ifdef CONFIG_FILE_LOCKING
        {
                .ctl_name       = FS_LEASE_TIME,
                .procname       = "lease-break-time",
                .data           = &lease_break_time,
                .maxlen         = sizeof(int),
                .mode           = 0644,
-               .proc_handler   = &proc_dointvec_minmax,
-               .strategy       = &sysctl_intvec,
-               .extra1         = &zero,
-               .extra2         = &two,
+               .proc_handler   = &proc_dointvec,
        },
+#endif
+#ifdef CONFIG_AIO
        {
                .procname       = "aio-nr",
                .data           = &aio_nr,
@@ -1263,6 +1420,7 @@ static struct ctl_table fs_table[] = {
                .mode           = 0644,
                .proc_handler   = &proc_doulongvec_minmax,
        },
+#endif /* CONFIG_AIO */
 #ifdef CONFIG_INOTIFY_USER
        {
                .ctl_name       = FS_INOTIFY,
@@ -1271,6 +1429,13 @@ static struct ctl_table fs_table[] = {
                .child          = inotify_table,
        },
 #endif 
+#ifdef CONFIG_EPOLL
+       {
+               .procname       = "epoll",
+               .mode           = 0555,
+               .child          = epoll_table,
+       },
+#endif
 #endif
        {
                .ctl_name       = KERN_SETUID_DUMPABLE,
@@ -1278,7 +1443,10 @@ static struct ctl_table fs_table[] = {
                .data           = &suid_dumpable,
                .maxlen         = sizeof(int),
                .mode           = 0644,
-               .proc_handler   = &proc_dointvec,
+               .proc_handler   = &proc_dointvec_minmax,
+               .strategy       = &sysctl_intvec,
+               .extra1         = &zero,
+               .extra2         = &two,
        },
 #if defined(CONFIG_BINFMT_MISC) || defined(CONFIG_BINFMT_MISC_MODULE)
        {
@@ -1346,6 +1514,9 @@ static void start_unregistering(struct ctl_table_header *p)
                spin_unlock(&sysctl_lock);
                wait_for_completion(&wait);
                spin_lock(&sysctl_lock);
+       } else {
+               /* anything non-NULL; we'll never dereference it */
+               p->unregistering = ERR_PTR(-EINVAL);
        }
        /*
         * do not remove from the list until nobody holds it; walking the
@@ -1354,6 +1525,32 @@ static void start_unregistering(struct ctl_table_header *p)
        list_del_init(&p->ctl_entry);
 }
 
+void sysctl_head_get(struct ctl_table_header *head)
+{
+       spin_lock(&sysctl_lock);
+       head->count++;
+       spin_unlock(&sysctl_lock);
+}
+
+void sysctl_head_put(struct ctl_table_header *head)
+{
+       spin_lock(&sysctl_lock);
+       if (!--head->count)
+               kfree(head);
+       spin_unlock(&sysctl_lock);
+}
+
+struct ctl_table_header *sysctl_head_grab(struct ctl_table_header *head)
+{
+       if (!head)
+               BUG();
+       spin_lock(&sysctl_lock);
+       if (!use_table(head))
+               head = ERR_PTR(-ENOENT);
+       spin_unlock(&sysctl_lock);
+       return head;
+}
+
 void sysctl_head_finish(struct ctl_table_header *head)
 {
        if (!head)
@@ -1363,14 +1560,20 @@ void sysctl_head_finish(struct ctl_table_header *head)
        spin_unlock(&sysctl_lock);
 }
 
+static struct ctl_table_set *
+lookup_header_set(struct ctl_table_root *root, struct nsproxy *namespaces)
+{
+       struct ctl_table_set *set = &root->default_set;
+       if (root->lookup)
+               set = root->lookup(root, namespaces);
+       return set;
+}
+
 static struct list_head *
 lookup_header_list(struct ctl_table_root *root, struct nsproxy *namespaces)
 {
-       struct list_head *header_list;
-       header_list = &root->header_list;
-       if (root->lookup)
-               header_list = root->lookup(root, namespaces);
-       return header_list;
+       struct ctl_table_set *set = lookup_header_set(root, namespaces);
+       return &set->list;
 }
 
 struct ctl_table_header *__sysctl_head_next(struct nsproxy *namespaces,
@@ -1430,6 +1633,73 @@ void register_sysctl_root(struct ctl_table_root *root)
 }
 
 #ifdef CONFIG_SYSCTL_SYSCALL
+/* Perform the actual read/write of a sysctl table entry. */
+static int do_sysctl_strategy(struct ctl_table_root *root,
+                       struct ctl_table *table,
+                       void __user *oldval, size_t __user *oldlenp,
+                       void __user *newval, size_t newlen)
+{
+       int op = 0, rc;
+
+       if (oldval)
+               op |= MAY_READ;
+       if (newval)
+               op |= MAY_WRITE;
+       if (sysctl_perm(root, table, op))
+               return -EPERM;
+
+       if (table->strategy) {
+               rc = table->strategy(table, oldval, oldlenp, newval, newlen);
+               if (rc < 0)
+                       return rc;
+               if (rc > 0)
+                       return 0;
+       }
+
+       /* If there is no strategy routine, or if the strategy returns
+        * zero, proceed with automatic r/w */
+       if (table->data && table->maxlen) {
+               rc = sysctl_data(table, oldval, oldlenp, newval, newlen);
+               if (rc < 0)
+                       return rc;
+       }
+       return 0;
+}
+
+static int parse_table(int __user *name, int nlen,
+                      void __user *oldval, size_t __user *oldlenp,
+                      void __user *newval, size_t newlen,
+                      struct ctl_table_root *root,
+                      struct ctl_table *table)
+{
+       int n;
+repeat:
+       if (!nlen)
+               return -ENOTDIR;
+       if (get_user(n, name))
+               return -EFAULT;
+       for ( ; table->ctl_name || table->procname; table++) {
+               if (!table->ctl_name)
+                       continue;
+               if (n == table->ctl_name) {
+                       int error;
+                       if (table->child) {
+                               if (sysctl_perm(root, table, MAY_EXEC))
+                                       return -EPERM;
+                               name++;
+                               nlen--;
+                               table = table->child;
+                               goto repeat;
+                       }
+                       error = do_sysctl_strategy(root, table,
+                                                  oldval, oldlenp,
+                                                  newval, newlen);
+                       return error;
+               }
+       }
+       return -ENOTDIR;
+}
+
 int do_sysctl(int __user *name, int nlen, void __user *oldval, size_t __user *oldlenp,
               void __user *newval, size_t newlen)
 {
@@ -1447,7 +1717,8 @@ int do_sysctl(int __user *name, int nlen, void __user *oldval, size_t __user *ol
        for (head = sysctl_head_next(NULL); head;
                        head = sysctl_head_next(head)) {
                error = parse_table(name, nlen, oldval, oldlenp, 
-                                       newval, newlen, head->ctl_table);
+                                       newval, newlen,
+                                       head->root, head->ctl_table);
                if (error != -ENOTDIR) {
                        sysctl_head_finish(head);
                        break;
@@ -1456,7 +1727,7 @@ int do_sysctl(int __user *name, int nlen, void __user *oldval, size_t __user *ol
        return error;
 }
 
-asmlinkage long sys_sysctl(struct __sysctl_args __user *args)
+SYSCALL_DEFINE1(sysctl, struct __sysctl_args __user *, args)
 {
        struct __sysctl_args tmp;
        int error;
@@ -1484,93 +1755,31 @@ out:
 
 static int test_perm(int mode, int op)
 {
-       if (!current->euid)
+       if (!current_euid())
                mode >>= 6;
        else if (in_egroup_p(0))
                mode >>= 3;
-       if ((mode & op & 0007) == op)
+       if ((op & ~mode & (MAY_READ|MAY_WRITE|MAY_EXEC)) == 0)
                return 0;
        return -EACCES;
 }
 
-int sysctl_perm(struct ctl_table *table, int op)
+int sysctl_perm(struct ctl_table_root *root, struct ctl_table *table, int op)
 {
        int error;
-       error = security_sysctl(table, op);
+       int mode;
+
+       error = security_sysctl(table, op & (MAY_READ | MAY_WRITE | MAY_EXEC));
        if (error)
                return error;
-       return test_perm(table->mode, op);
-}
 
-#ifdef CONFIG_SYSCTL_SYSCALL
-static int parse_table(int __user *name, int nlen,
-                      void __user *oldval, size_t __user *oldlenp,
-                      void __user *newval, size_t newlen,
-                      struct ctl_table *table)
-{
-       int n;
-repeat:
-       if (!nlen)
-               return -ENOTDIR;
-       if (get_user(n, name))
-               return -EFAULT;
-       for ( ; table->ctl_name || table->procname; table++) {
-               if (!table->ctl_name)
-                       continue;
-               if (n == table->ctl_name) {
-                       int error;
-                       if (table->child) {
-                               if (sysctl_perm(table, 001))
-                                       return -EPERM;
-                               name++;
-                               nlen--;
-                               table = table->child;
-                               goto repeat;
-                       }
-                       error = do_sysctl_strategy(table, name, nlen,
-                                                  oldval, oldlenp,
-                                                  newval, newlen);
-                       return error;
-               }
-       }
-       return -ENOTDIR;
-}
+       if (root->permissions)
+               mode = root->permissions(root, current->nsproxy, table);
+       else
+               mode = table->mode;
 
-/* Perform the actual read/write of a sysctl table entry. */
-int do_sysctl_strategy (struct ctl_table *table,
-                       int __user *name, int nlen,
-                       void __user *oldval, size_t __user *oldlenp,
-                       void __user *newval, size_t newlen)
-{
-       int op = 0, rc;
-
-       if (oldval)
-               op |= 004;
-       if (newval) 
-               op |= 002;
-       if (sysctl_perm(table, op))
-               return -EPERM;
-
-       if (table->strategy) {
-               rc = table->strategy(table, name, nlen, oldval, oldlenp,
-                                    newval, newlen);
-               if (rc < 0)
-                       return rc;
-               if (rc > 0)
-                       return 0;
-       }
-
-       /* If there is no strategy routine, or if the strategy returns
-        * zero, proceed with automatic r/w */
-       if (table->data && table->maxlen) {
-               rc = sysctl_data(table, name, nlen, oldval, oldlenp,
-                                newval, newlen);
-               if (rc < 0)
-                       return rc;
-       }
-       return 0;
+       return test_perm(mode, op);
 }
-#endif /* CONFIG_SYSCTL_SYSCALL */
 
 static void sysctl_set_parent(struct ctl_table *parent, struct ctl_table *table)
 {
@@ -1583,14 +1792,66 @@ static void sysctl_set_parent(struct ctl_table *parent, struct ctl_table *table)
 
 static __init int sysctl_init(void)
 {
-       int err;
        sysctl_set_parent(NULL, root_table);
-       err = sysctl_check_table(current->nsproxy, root_table);
+#ifdef CONFIG_SYSCTL_SYSCALL_CHECK
+       {
+               int err;
+               err = sysctl_check_table(current->nsproxy, root_table);
+       }
+#endif
        return 0;
 }
 
 core_initcall(sysctl_init);
 
+static struct ctl_table *is_branch_in(struct ctl_table *branch,
+                                     struct ctl_table *table)
+{
+       struct ctl_table *p;
+       const char *s = branch->procname;
+
+       /* branch should have named subdirectory as its first element */
+       if (!s || !branch->child)
+               return NULL;
+
+       /* ... and nothing else */
+       if (branch[1].procname || branch[1].ctl_name)
+               return NULL;
+
+       /* table should contain subdirectory with the same name */
+       for (p = table; p->procname || p->ctl_name; p++) {
+               if (!p->child)
+                       continue;
+               if (p->procname && strcmp(p->procname, s) == 0)
+                       return p;
+       }
+       return NULL;
+}
+
+/* see if attaching q to p would be an improvement */
+static void try_attach(struct ctl_table_header *p, struct ctl_table_header *q)
+{
+       struct ctl_table *to = p->ctl_table, *by = q->ctl_table;
+       struct ctl_table *next;
+       int is_better = 0;
+       int not_in_parent = !p->attached_by;
+
+       while ((next = is_branch_in(by, to)) != NULL) {
+               if (by == q->attached_by)
+                       is_better = 1;
+               if (to == p->attached_by)
+                       not_in_parent = 1;
+               by = by->child;
+               to = next->child;
+       }
+
+       if (is_better && not_in_parent) {
+               q->attached_by = by;
+               q->attached_to = to;
+               q->parent = p;
+       }
+}
+
 /**
  * __register_sysctl_paths - register a sysctl hierarchy
  * @root: List of sysctl headers to register on
@@ -1667,10 +1928,10 @@ struct ctl_table_header *__register_sysctl_paths(
        struct nsproxy *namespaces,
        const struct ctl_path *path, struct ctl_table *table)
 {
-       struct list_head *header_list;
        struct ctl_table_header *header;
        struct ctl_table *new, **prevp;
        unsigned int n, npath;
+       struct ctl_table_set *set;
 
        /* Count the path components */
        for (npath = 0; path[npath].ctl_name || path[npath].procname; ++npath)
@@ -1712,13 +1973,28 @@ struct ctl_table_header *__register_sysctl_paths(
        header->unregistering = NULL;
        header->root = root;
        sysctl_set_parent(NULL, header->ctl_table);
+       header->count = 1;
+#ifdef CONFIG_SYSCTL_SYSCALL_CHECK
        if (sysctl_check_table(namespaces, header->ctl_table)) {
                kfree(header);
                return NULL;
        }
+#endif
        spin_lock(&sysctl_lock);
-       header_list = lookup_header_list(root, namespaces);
-       list_add_tail(&header->ctl_entry, header_list);
+       header->set = lookup_header_set(root, namespaces);
+       header->attached_by = header->ctl_table;
+       header->attached_to = root_table;
+       header->parent = &root_table_header;
+       for (set = header->set; set; set = set->parent) {
+               struct ctl_table_header *p;
+               list_for_each_entry(p, &set->list, ctl_entry) {
+                       if (p->unregistering)
+                               continue;
+                       try_attach(p, header);
+               }
+       }
+       header->parent->count++;
+       list_add_tail(&header->ctl_entry, &header->set->list);
        spin_unlock(&sysctl_lock);
 
        return header;
@@ -1773,8 +2049,37 @@ void unregister_sysctl_table(struct ctl_table_header * header)
 
        spin_lock(&sysctl_lock);
        start_unregistering(header);
+       if (!--header->parent->count) {
+               WARN_ON(1);
+               kfree(header->parent);
+       }
+       if (!--header->count)
+               kfree(header);
        spin_unlock(&sysctl_lock);
-       kfree(header);
+}
+
+int sysctl_is_seen(struct ctl_table_header *p)
+{
+       struct ctl_table_set *set = p->set;
+       int res;
+       spin_lock(&sysctl_lock);
+       if (p->unregistering)
+               res = 0;
+       else if (!set->is_seen)
+               res = 1;
+       else
+               res = set->is_seen(set);
+       spin_unlock(&sysctl_lock);
+       return res;
+}
+
+void setup_sysctl_set(struct ctl_table_set *p,
+       struct ctl_table_set *parent,
+       int (*is_seen)(struct ctl_table_set *))
+{
+       INIT_LIST_HEAD(&p->list);
+       p->parent = parent ? parent : &sysctl_table_root.default_set;
+       p->is_seen = is_seen;
 }
 
 #else /* !CONFIG_SYSCTL */
@@ -1793,6 +2098,16 @@ void unregister_sysctl_table(struct ctl_table_header * table)
 {
 }
 
+void setup_sysctl_set(struct ctl_table_set *p,
+       struct ctl_table_set *parent,
+       int (*is_seen)(struct ctl_table_set *))
+{
+}
+
+void sysctl_head_put(struct ctl_table_header *head)
+{
+}
+
 #endif /* CONFIG_SYSCTL */
 
 /*
@@ -2045,49 +2360,39 @@ int proc_dointvec(struct ctl_table *table, int write, struct file *filp,
                            NULL,NULL);
 }
 
-#define OP_SET 0
-#define OP_AND 1
-#define OP_OR  2
-
-static int do_proc_dointvec_bset_conv(int *negp, unsigned long *lvalp,
-                                     int *valp,
-                                     int write, void *data)
-{
-       int op = *(int *)data;
-       if (write) {
-               int val = *negp ? -*lvalp : *lvalp;
-               switch(op) {
-               case OP_SET:    *valp = val; break;
-               case OP_AND:    *valp &= val; break;
-               case OP_OR:     *valp |= val; break;
-               }
-       } else {
-               int val = *valp;
-               if (val < 0) {
-                       *negp = -1;
-                       *lvalp = (unsigned long)-val;
-               } else {
-                       *negp = 0;
-                       *lvalp = (unsigned long)val;
-               }
-       }
-       return 0;
-}
-
 /*
- *     Taint values can only be increased
+ * Taint values can only be increased
+ * This means we can safely use a temporary.
  */
-static int proc_dointvec_taint(struct ctl_table *table, int write, struct file *filp,
+static int proc_taint(struct ctl_table *table, int write, struct file *filp,
                               void __user *buffer, size_t *lenp, loff_t *ppos)
 {
-       int op;
+       struct ctl_table t;
+       unsigned long tmptaint = get_taint();
+       int err;
 
        if (write && !capable(CAP_SYS_ADMIN))
                return -EPERM;
 
-       op = OP_OR;
-       return do_proc_dointvec(table,write,filp,buffer,lenp,ppos,
-                               do_proc_dointvec_bset_conv,&op);
+       t = *table;
+       t.data = &tmptaint;
+       err = proc_doulongvec_minmax(&t, write, filp, buffer, lenp, ppos);
+       if (err < 0)
+               return err;
+
+       if (write) {
+               /*
+                * Poor man's atomic or. Not worth adding a primitive
+                * to everyone's atomic.h for this
+                */
+               int i;
+               for (i = 0; i < BITS_PER_LONG && tmptaint >> i; i++) {
+                       if ((tmptaint >> i) & 1)
+                               add_taint(i);
+               }
+       }
+
+       return err;
 }
 
 struct do_proc_dointvec_minmax_conv_param {
@@ -2535,7 +2840,7 @@ int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write,
  */
 
 /* The generic sysctl data routine (used if no strategy routine supplied) */
-int sysctl_data(struct ctl_table *table, int __user *name, int nlen,
+int sysctl_data(struct ctl_table *table,
                void __user *oldval, size_t __user *oldlenp,
                void __user *newval, size_t newlen)
 {
@@ -2569,7 +2874,7 @@ int sysctl_data(struct ctl_table *table, int __user *name, int nlen,
 }
 
 /* The generic string strategy routine: */
-int sysctl_string(struct ctl_table *table, int __user *name, int nlen,
+int sysctl_string(struct ctl_table *table,
                  void __user *oldval, size_t __user *oldlenp,
                  void __user *newval, size_t newlen)
 {
@@ -2615,7 +2920,7 @@ int sysctl_string(struct ctl_table *table, int __user *name, int nlen,
  * are between the minimum and maximum values given in the arrays
  * table->extra1 and table->extra2, respectively.
  */
-int sysctl_intvec(struct ctl_table *table, int __user *name, int nlen,
+int sysctl_intvec(struct ctl_table *table,
                void __user *oldval, size_t __user *oldlenp,
                void __user *newval, size_t newlen)
 {
@@ -2651,7 +2956,7 @@ int sysctl_intvec(struct ctl_table *table, int __user *name, int nlen,
 }
 
 /* Strategy function to convert jiffies to seconds */ 
-int sysctl_jiffies(struct ctl_table *table, int __user *name, int nlen,
+int sysctl_jiffies(struct ctl_table *table,
                void __user *oldval, size_t __user *oldlenp,
                void __user *newval, size_t newlen)
 {
@@ -2685,7 +2990,7 @@ int sysctl_jiffies(struct ctl_table *table, int __user *name, int nlen,
 }
 
 /* Strategy function to convert jiffies to seconds */ 
-int sysctl_ms_jiffies(struct ctl_table *table, int __user *name, int nlen,
+int sysctl_ms_jiffies(struct ctl_table *table,
                void __user *oldval, size_t __user *oldlenp,
                void __user *newval, size_t newlen)
 {
@@ -2723,7 +3028,7 @@ int sysctl_ms_jiffies(struct ctl_table *table, int __user *name, int nlen,
 #else /* CONFIG_SYSCTL_SYSCALL */
 
 
-asmlinkage long sys_sysctl(struct __sysctl_args __user *args)
+SYSCALL_DEFINE1(sysctl, struct __sysctl_args __user *, args)
 {
        struct __sysctl_args tmp;
        int error;
@@ -2740,35 +3045,35 @@ asmlinkage long sys_sysctl(struct __sysctl_args __user *args)
        return error;
 }
 
-int sysctl_data(struct ctl_table *table, int __user *name, int nlen,
+int sysctl_data(struct ctl_table *table,
                  void __user *oldval, size_t __user *oldlenp,
                  void __user *newval, size_t newlen)
 {
        return -ENOSYS;
 }
 
-int sysctl_string(struct ctl_table *table, int __user *name, int nlen,
+int sysctl_string(struct ctl_table *table,
                  void __user *oldval, size_t __user *oldlenp,
                  void __user *newval, size_t newlen)
 {
        return -ENOSYS;
 }
 
-int sysctl_intvec(struct ctl_table *table, int __user *name, int nlen,
+int sysctl_intvec(struct ctl_table *table,
                void __user *oldval, size_t __user *oldlenp,
                void __user *newval, size_t newlen)
 {
        return -ENOSYS;
 }
 
-int sysctl_jiffies(struct ctl_table *table, int __user *name, int nlen,
+int sysctl_jiffies(struct ctl_table *table,
                void __user *oldval, size_t __user *oldlenp,
                void __user *newval, size_t newlen)
 {
        return -ENOSYS;
 }
 
-int sysctl_ms_jiffies(struct ctl_table *table, int __user *name, int nlen,
+int sysctl_ms_jiffies(struct ctl_table *table,
                void __user *oldval, size_t __user *oldlenp,
                void __user *newval, size_t newlen)
 {