#include <linux/cpu.h>
#include <linux/sysctl.h>
#include <linux/module.h>
-#include <linux/kmemtrace.h>
#include <linux/rcupdate.h>
#include <linux/string.h>
#include <linux/uaccess.h>
#include <linux/rtmutex.h>
#include <linux/reciprocal_div.h>
#include <linux/debugobjects.h>
+#include <linux/kmemcheck.h>
+#include <linux/memory.h>
+#include <linux/prefetch.h>
#include <asm/cacheflush.h>
#include <asm/tlbflush.h>
#define BYTES_PER_WORD sizeof(void *)
#define REDZONE_ALIGN max(BYTES_PER_WORD, __alignof__(unsigned long long))
-#ifndef ARCH_KMALLOC_MINALIGN
-/*
- * Enforce a minimum alignment for the kmalloc caches.
- * Usually, the kmalloc caches are cache_line_size() aligned, except when
- * DEBUG and FORCED_DEBUG are enabled, then they are BYTES_PER_WORD aligned.
- * Some archs want to perform DMA into kmalloc caches and need a guaranteed
- * alignment larger than the alignment of a 64-bit integer.
- * ARCH_KMALLOC_MINALIGN allows that.
- * Note that increasing this value may disable some debug features.
- */
-#define ARCH_KMALLOC_MINALIGN __alignof__(unsigned long long)
-#endif
-
-#ifndef ARCH_SLAB_MINALIGN
-/*
- * Enforce a minimum alignment for all caches.
- * Intended for archs that get misalignment faults even for BYTES_PER_WORD
- * aligned buffers. Includes ARCH_KMALLOC_MINALIGN.
- * If possible: Do not enable this flag for CONFIG_DEBUG_SLAB, it disables
- * some debug features.
- */
-#define ARCH_SLAB_MINALIGN 0
-#endif
-
#ifndef ARCH_KMALLOC_FLAGS
#define ARCH_KMALLOC_FLAGS SLAB_HWCACHE_ALIGN
#endif
SLAB_STORE_USER | \
SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \
SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD | \
- SLAB_DEBUG_OBJECTS | SLAB_NOLEAKTRACE)
+ SLAB_DEBUG_OBJECTS | SLAB_NOLEAKTRACE | SLAB_NOTRACK)
#else
# define CREATE_MASK (SLAB_HWCACHE_ALIGN | \
SLAB_CACHE_DMA | \
SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \
SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD | \
- SLAB_DEBUG_OBJECTS | SLAB_NOLEAKTRACE)
+ SLAB_DEBUG_OBJECTS | SLAB_NOLEAKTRACE | SLAB_NOTRACK)
#endif
/*
#define BUFCTL_ACTIVE (((kmem_bufctl_t)(~0U))-2)
#define SLAB_LIMIT (((kmem_bufctl_t)(~0U))-3)
-/*
- * struct slab
- *
- * Manages the objs in a slab. Placed either at the beginning of mem allocated
- * for a slab, or allocated from an general cache.
- * Slabs are chained into three list: fully used, partial, fully free slabs.
- */
-struct slab {
- struct list_head list;
- unsigned long colouroff;
- void *s_mem; /* including colour offset */
- unsigned int inuse; /* num of objs active in slab */
- kmem_bufctl_t free;
- unsigned short nodeid;
-};
-
/*
* struct slab_rcu
*
*
* rcu_read_lock before reading the address, then rcu_read_unlock after
* taking the spinlock within the structure expected at that address.
- *
- * We assume struct slab_rcu can overlay struct slab when destroying.
*/
struct slab_rcu {
struct rcu_head head;
void *addr;
};
+/*
+ * struct slab
+ *
+ * Manages the objs in a slab. Placed either at the beginning of mem allocated
+ * for a slab, or allocated from an general cache.
+ * Slabs are chained into three list: fully used, partial, fully free slabs.
+ */
+struct slab {
+ union {
+ struct {
+ struct list_head list;
+ unsigned long colouroff;
+ void *s_mem; /* including colour offset */
+ unsigned int inuse; /* num of objs active in slab */
+ kmem_bufctl_t free;
+ unsigned short nodeid;
+ };
+ struct slab_rcu __slab_cover_slab_rcu;
+ };
+};
+
/*
* struct array_cache
*
* Need this for bootstrapping a per node allocator.
*/
#define NUM_INIT_LISTS (3 * MAX_NUMNODES)
-struct kmem_list3 __initdata initkmem_list3[NUM_INIT_LISTS];
+static struct kmem_list3 __initdata initkmem_list3[NUM_INIT_LISTS];
#define CACHE_CACHE 0
#define SIZE_AC MAX_NUMNODES
#define SIZE_L3 (2 * MAX_NUMNODES)
MAKE_LIST((cachep), (&(ptr)->slabs_free), slabs_free, nodeid); \
} while (0)
-/*
- * struct kmem_cache
- *
- * manages a cache.
- */
-
-struct kmem_cache {
-/* 1) per-cpu data, touched during every alloc/free */
- struct array_cache *array[NR_CPUS];
-/* 2) Cache tunables. Protected by cache_chain_mutex */
- unsigned int batchcount;
- unsigned int limit;
- unsigned int shared;
-
- unsigned int buffer_size;
- u32 reciprocal_buffer_size;
-/* 3) touched by every alloc & free from the backend */
-
- unsigned int flags; /* constant flags */
- unsigned int num; /* # of objs per slab */
-
-/* 4) cache_grow/shrink */
- /* order of pgs per slab (2^n) */
- unsigned int gfporder;
-
- /* force GFP flags, e.g. GFP_DMA */
- gfp_t gfpflags;
-
- size_t colour; /* cache colouring range */
- unsigned int colour_off; /* colour offset */
- struct kmem_cache *slabp_cache;
- unsigned int slab_size;
- unsigned int dflags; /* dynamic flags */
-
- /* constructor func */
- void (*ctor)(void *obj);
-
-/* 5) cache creation/removal */
- const char *name;
- struct list_head next;
-
-/* 6) statistics */
-#if STATS
- unsigned long num_active;
- unsigned long num_allocations;
- unsigned long high_mark;
- unsigned long grown;
- unsigned long reaped;
- unsigned long errors;
- unsigned long max_freeable;
- unsigned long node_allocs;
- unsigned long node_frees;
- unsigned long node_overflow;
- atomic_t allochit;
- atomic_t allocmiss;
- atomic_t freehit;
- atomic_t freemiss;
-#endif
-#if DEBUG
- /*
- * If debugging is enabled, then the allocator can add additional
- * fields and/or padding to every object. buffer_size contains the total
- * object size including these internal fields, the following two
- * variables contain the offset to the user object and its size.
- */
- int obj_offset;
- int obj_size;
-#endif
- /*
- * We put nodelists[] at the end of kmem_cache, because we want to size
- * this array to nr_node_ids slots instead of MAX_NUMNODES
- * (see kmem_cache_init())
- * We still use [MAX_NUMNODES] and not [1] or [0] because cache_cache
- * is statically defined, so we reserve the max number of nodes.
- */
- struct kmem_list3 *nodelists[MAX_NUMNODES];
- /*
- * Do not add fields after nodelists[]
- */
-};
-
#define CFLGS_OFF_SLAB (0x80000000UL)
#define OFF_SLAB(x) ((x)->flags & CFLGS_OFF_SLAB)
#define STATS_DEC_ACTIVE(x) do { } while (0)
#define STATS_INC_ALLOCED(x) do { } while (0)
#define STATS_INC_GROWN(x) do { } while (0)
-#define STATS_ADD_REAPED(x,y) do { } while (0)
+#define STATS_ADD_REAPED(x,y) do { (void)(y); } while (0)
#define STATS_SET_HIGH(x) do { } while (0)
#define STATS_INC_ERR(x) do { } while (0)
#define STATS_INC_NODEALLOCS(x) do { } while (0)
#endif
-#ifdef CONFIG_KMEMTRACE
+#ifdef CONFIG_TRACING
size_t slab_buffer_size(struct kmem_cache *cachep)
{
return cachep->buffer_size;
{ {0, BOOT_CPUCACHE_ENTRIES, 1, 0} };
/* internal cache of cache description objs */
+static struct kmem_list3 *cache_cache_nodelists[MAX_NUMNODES];
static struct kmem_cache cache_cache = {
+ .nodelists = cache_cache_nodelists,
.batchcount = 1,
.limit = BOOT_CPUCACHE_ENTRIES,
.shared = 1,
#define BAD_ALIEN_MAGIC 0x01020304ul
+/*
+ * chicken and egg problem: delay the per-cpu array allocation
+ * until the general caches are up.
+ */
+static enum {
+ NONE,
+ PARTIAL_AC,
+ PARTIAL_L3,
+ EARLY,
+ FULL
+} g_cpucache_up;
+
+/*
+ * used by boot code to determine if it can use slab based allocator
+ */
+int slab_is_available(void)
+{
+ return g_cpucache_up >= EARLY;
+}
+
#ifdef CONFIG_LOCKDEP
/*
static struct lock_class_key on_slab_l3_key;
static struct lock_class_key on_slab_alc_key;
-static inline void init_lock_keys(void)
-
+static void init_node_lock_keys(int q)
{
- int q;
struct cache_sizes *s = malloc_sizes;
- while (s->cs_size != ULONG_MAX) {
- for_each_node(q) {
- struct array_cache **alc;
- int r;
- struct kmem_list3 *l3 = s->cs_cachep->nodelists[q];
- if (!l3 || OFF_SLAB(s->cs_cachep))
- continue;
- lockdep_set_class(&l3->list_lock, &on_slab_l3_key);
- alc = l3->alien;
- /*
- * FIXME: This check for BAD_ALIEN_MAGIC
- * should go away when common slab code is taught to
- * work even without alien caches.
- * Currently, non NUMA code returns BAD_ALIEN_MAGIC
- * for alloc_alien_cache,
- */
- if (!alc || (unsigned long)alc == BAD_ALIEN_MAGIC)
- continue;
- for_each_node(r) {
- if (alc[r])
- lockdep_set_class(&alc[r]->lock,
- &on_slab_alc_key);
- }
+ if (g_cpucache_up != FULL)
+ return;
+
+ for (s = malloc_sizes; s->cs_size != ULONG_MAX; s++) {
+ struct array_cache **alc;
+ struct kmem_list3 *l3;
+ int r;
+
+ l3 = s->cs_cachep->nodelists[q];
+ if (!l3 || OFF_SLAB(s->cs_cachep))
+ continue;
+ lockdep_set_class(&l3->list_lock, &on_slab_l3_key);
+ alc = l3->alien;
+ /*
+ * FIXME: This check for BAD_ALIEN_MAGIC
+ * should go away when common slab code is taught to
+ * work even without alien caches.
+ * Currently, non NUMA code returns BAD_ALIEN_MAGIC
+ * for alloc_alien_cache,
+ */
+ if (!alc || (unsigned long)alc == BAD_ALIEN_MAGIC)
+ continue;
+ for_each_node(r) {
+ if (alc[r])
+ lockdep_set_class(&alc[r]->lock,
+ &on_slab_alc_key);
}
- s++;
}
}
+
+static inline void init_lock_keys(void)
+{
+ int node;
+
+ for_each_node(node)
+ init_node_lock_keys(node);
+}
#else
+static void init_node_lock_keys(int q)
+{
+}
+
static inline void init_lock_keys(void)
{
}
static DEFINE_MUTEX(cache_chain_mutex);
static struct list_head cache_chain;
-/*
- * chicken and egg problem: delay the per-cpu array allocation
- * until the general caches are up.
- */
-static enum {
- NONE,
- PARTIAL_AC,
- PARTIAL_L3,
- FULL
-} g_cpucache_up;
-
-/*
- * used by boot code to determine if it can use slab based allocator
- */
-int slab_is_available(void)
-{
- return g_cpucache_up == FULL;
-}
-
-static DEFINE_PER_CPU(struct delayed_work, reap_work);
+static DEFINE_PER_CPU(struct delayed_work, slab_reap_work);
static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep)
{
*/
static int use_alien_caches __read_mostly = 1;
-static int numa_platform __read_mostly = 1;
static int __init noaliencache_setup(char *s)
{
use_alien_caches = 0;
* objects freed on different nodes from which they were allocated) and the
* flushing of remote pcps by calling drain_node_pages.
*/
-static DEFINE_PER_CPU(unsigned long, reap_node);
+static DEFINE_PER_CPU(unsigned long, slab_reap_node);
static void init_reap_node(int cpu)
{
int node;
- node = next_node(cpu_to_node(cpu), node_online_map);
+ node = next_node(cpu_to_mem(cpu), node_online_map);
if (node == MAX_NUMNODES)
node = first_node(node_online_map);
- per_cpu(reap_node, cpu) = node;
+ per_cpu(slab_reap_node, cpu) = node;
}
static void next_reap_node(void)
{
- int node = __get_cpu_var(reap_node);
+ int node = __this_cpu_read(slab_reap_node);
node = next_node(node, node_online_map);
if (unlikely(node >= MAX_NUMNODES))
node = first_node(node_online_map);
- __get_cpu_var(reap_node) = node;
+ __this_cpu_write(slab_reap_node, node);
}
#else
*/
static void __cpuinit start_cpu_timer(int cpu)
{
- struct delayed_work *reap_work = &per_cpu(reap_work, cpu);
+ struct delayed_work *reap_work = &per_cpu(slab_reap_work, cpu);
/*
* When this gets called from do_initcalls via cpucache_init(),
*/
if (keventd_up() && reap_work->work.func == NULL) {
init_reap_node(cpu);
- INIT_DELAYED_WORK(reap_work, cache_reap);
+ INIT_DELAYED_WORK_DEFERRABLE(reap_work, cache_reap);
schedule_delayed_work_on(cpu, reap_work,
__round_jiffies_relative(HZ, cpu));
}
nc = kmalloc_node(memsize, gfp, node);
/*
* The array_cache structures contain pointers to free object.
- * However, when such objects are allocated or transfered to another
+ * However, when such objects are allocated or transferred to another
* cache the pointers are not cleared and they could be counted as
* valid references during a kmemleak scan. Therefore, kmemleak must
* not scan such objects.
struct array_cache *from, unsigned int max)
{
/* Figure out how many entries to transfer */
- int nr = min(min(from->avail, max), to->limit - to->avail);
+ int nr = min3(from->avail, max, to->limit - to->avail);
if (!nr)
return 0;
from->avail -= nr;
to->avail += nr;
- to->touched = 1;
return nr;
}
if (limit > 1)
limit = 12;
- ac_ptr = kmalloc_node(memsize, gfp, node);
+ ac_ptr = kzalloc_node(memsize, gfp, node);
if (ac_ptr) {
for_each_node(i) {
- if (i == node || !node_online(i)) {
- ac_ptr[i] = NULL;
+ if (i == node || !node_online(i))
continue;
- }
ac_ptr[i] = alloc_arraycache(node, limit, 0xbaadf00d, gfp);
if (!ac_ptr[i]) {
for (i--; i >= 0; i--)
*/
static void reap_alien(struct kmem_cache *cachep, struct kmem_list3 *l3)
{
- int node = __get_cpu_var(reap_node);
+ int node = __this_cpu_read(slab_reap_node);
if (l3->alien) {
struct array_cache *ac = l3->alien[node];
struct array_cache *alien = NULL;
int node;
- node = numa_node_id();
+ node = numa_mem_id();
/*
* Make sure we are not freeing a object from another node to the array
}
#endif
+/*
+ * Allocates and initializes nodelists for a node on each slab cache, used for
+ * either memory or cpu hotplug. If memory is being hot-added, the kmem_list3
+ * will be allocated off-node since memory is not yet online for the new node.
+ * When hotplugging memory or a cpu, existing nodelists are not replaced if
+ * already in use.
+ *
+ * Must hold cache_chain_mutex.
+ */
+static int init_cache_nodelists_node(int node)
+{
+ struct kmem_cache *cachep;
+ struct kmem_list3 *l3;
+ const int memsize = sizeof(struct kmem_list3);
+
+ list_for_each_entry(cachep, &cache_chain, next) {
+ /*
+ * Set up the size64 kmemlist for cpu before we can
+ * begin anything. Make sure some other cpu on this
+ * node has not already allocated this
+ */
+ if (!cachep->nodelists[node]) {
+ l3 = kmalloc_node(memsize, GFP_KERNEL, node);
+ if (!l3)
+ return -ENOMEM;
+ kmem_list3_init(l3);
+ l3->next_reap = jiffies + REAPTIMEOUT_LIST3 +
+ ((unsigned long)cachep) % REAPTIMEOUT_LIST3;
+
+ /*
+ * The l3s don't come and go as CPUs come and
+ * go. cache_chain_mutex is sufficient
+ * protection here.
+ */
+ cachep->nodelists[node] = l3;
+ }
+
+ spin_lock_irq(&cachep->nodelists[node]->list_lock);
+ cachep->nodelists[node]->free_limit =
+ (1 + nr_cpus_node(node)) *
+ cachep->batchcount + cachep->num;
+ spin_unlock_irq(&cachep->nodelists[node]->list_lock);
+ }
+ return 0;
+}
+
static void __cpuinit cpuup_canceled(long cpu)
{
struct kmem_cache *cachep;
struct kmem_list3 *l3 = NULL;
- int node = cpu_to_node(cpu);
+ int node = cpu_to_mem(cpu);
const struct cpumask *mask = cpumask_of_node(node);
list_for_each_entry(cachep, &cache_chain, next) {
if (nc)
free_block(cachep, nc->entry, nc->avail, node);
- if (!cpus_empty(*mask)) {
+ if (!cpumask_empty(mask)) {
spin_unlock_irq(&l3->list_lock);
goto free_array_cache;
}
{
struct kmem_cache *cachep;
struct kmem_list3 *l3 = NULL;
- int node = cpu_to_node(cpu);
- const int memsize = sizeof(struct kmem_list3);
+ int node = cpu_to_mem(cpu);
+ int err;
/*
* We need to do this right in the beginning since
* kmalloc_node allows us to add the slab to the right
* kmem_list3 and not this cpu's kmem_list3
*/
-
- list_for_each_entry(cachep, &cache_chain, next) {
- /*
- * Set up the size64 kmemlist for cpu before we can
- * begin anything. Make sure some other cpu on this
- * node has not already allocated this
- */
- if (!cachep->nodelists[node]) {
- l3 = kmalloc_node(memsize, GFP_KERNEL, node);
- if (!l3)
- goto bad;
- kmem_list3_init(l3);
- l3->next_reap = jiffies + REAPTIMEOUT_LIST3 +
- ((unsigned long)cachep) % REAPTIMEOUT_LIST3;
-
- /*
- * The l3s don't come and go as CPUs come and
- * go. cache_chain_mutex is sufficient
- * protection here.
- */
- cachep->nodelists[node] = l3;
- }
-
- spin_lock_irq(&cachep->nodelists[node]->list_lock);
- cachep->nodelists[node]->free_limit =
- (1 + nr_cpus_node(node)) *
- cachep->batchcount + cachep->num;
- spin_unlock_irq(&cachep->nodelists[node]->list_lock);
- }
+ err = init_cache_nodelists_node(node);
+ if (err < 0)
+ goto bad;
/*
* Now we can go ahead with allocating the shared arrays and
kfree(shared);
free_alien_cache(alien);
}
+ init_node_lock_keys(node);
+
return 0;
bad:
cpuup_canceled(cpu);
* anything expensive but will only modify reap_work
* and reschedule the timer.
*/
- cancel_rearming_delayed_work(&per_cpu(reap_work, cpu));
+ cancel_delayed_work_sync(&per_cpu(slab_reap_work, cpu));
/* Now the cache_reaper is guaranteed to be not running. */
- per_cpu(reap_work, cpu).work.func = NULL;
+ per_cpu(slab_reap_work, cpu).work.func = NULL;
break;
case CPU_DOWN_FAILED:
case CPU_DOWN_FAILED_FROZEN:
mutex_unlock(&cache_chain_mutex);
break;
}
- return err ? NOTIFY_BAD : NOTIFY_OK;
+ return notifier_from_errno(err);
}
static struct notifier_block __cpuinitdata cpucache_notifier = {
&cpuup_callback, NULL, 0
};
+#if defined(CONFIG_NUMA) && defined(CONFIG_MEMORY_HOTPLUG)
+/*
+ * Drains freelist for a node on each slab cache, used for memory hot-remove.
+ * Returns -EBUSY if all objects cannot be drained so that the node is not
+ * removed.
+ *
+ * Must hold cache_chain_mutex.
+ */
+static int __meminit drain_cache_nodelists_node(int node)
+{
+ struct kmem_cache *cachep;
+ int ret = 0;
+
+ list_for_each_entry(cachep, &cache_chain, next) {
+ struct kmem_list3 *l3;
+
+ l3 = cachep->nodelists[node];
+ if (!l3)
+ continue;
+
+ drain_freelist(cachep, l3, l3->free_objects);
+
+ if (!list_empty(&l3->slabs_full) ||
+ !list_empty(&l3->slabs_partial)) {
+ ret = -EBUSY;
+ break;
+ }
+ }
+ return ret;
+}
+
+static int __meminit slab_memory_callback(struct notifier_block *self,
+ unsigned long action, void *arg)
+{
+ struct memory_notify *mnb = arg;
+ int ret = 0;
+ int nid;
+
+ nid = mnb->status_change_nid;
+ if (nid < 0)
+ goto out;
+
+ switch (action) {
+ case MEM_GOING_ONLINE:
+ mutex_lock(&cache_chain_mutex);
+ ret = init_cache_nodelists_node(nid);
+ mutex_unlock(&cache_chain_mutex);
+ break;
+ case MEM_GOING_OFFLINE:
+ mutex_lock(&cache_chain_mutex);
+ ret = drain_cache_nodelists_node(nid);
+ mutex_unlock(&cache_chain_mutex);
+ break;
+ case MEM_ONLINE:
+ case MEM_OFFLINE:
+ case MEM_CANCEL_ONLINE:
+ case MEM_CANCEL_OFFLINE:
+ break;
+ }
+out:
+ return notifier_from_errno(ret);
+}
+#endif /* CONFIG_NUMA && CONFIG_MEMORY_HOTPLUG */
+
/*
* swap the static kmem_list3 with kmalloced memory
*/
-static void init_list(struct kmem_cache *cachep, struct kmem_list3 *list,
- int nodeid)
+static void __init init_list(struct kmem_cache *cachep, struct kmem_list3 *list,
+ int nodeid)
{
struct kmem_list3 *ptr;
int order;
int node;
- if (num_possible_nodes() == 1) {
+ if (num_possible_nodes() == 1)
use_alien_caches = 0;
- numa_platform = 0;
- }
for (i = 0; i < NUM_INIT_LISTS; i++) {
kmem_list3_init(&initkmem_list3[i]);
* Fragmentation resistance on low memory - only use bigger
* page orders on machines with more than 32MB of memory.
*/
- if (num_physpages > (32 << 20) >> PAGE_SHIFT)
+ if (totalram_pages > (32 << 20) >> PAGE_SHIFT)
slab_break_gfp_order = BREAK_GFP_ORDER_HI;
/* Bootstrap is tricky, because several objects are allocated
* 6) Resize the head arrays of the kmalloc caches to their final sizes.
*/
- node = numa_node_id();
+ node = numa_mem_id();
/* 1) create the cache_cache */
INIT_LIST_HEAD(&cache_chain);
cache_cache.nodelists[node] = &initkmem_list3[CACHE_CACHE + node];
/*
- * struct kmem_cache size depends on nr_node_ids, which
- * can be less than MAX_NUMNODES.
+ * struct kmem_cache size depends on nr_node_ids & nr_cpu_ids
*/
- cache_cache.buffer_size = offsetof(struct kmem_cache, nodelists) +
- nr_node_ids * sizeof(struct kmem_list3 *);
+ cache_cache.buffer_size = offsetof(struct kmem_cache, array[nr_cpu_ids]) +
+ nr_node_ids * sizeof(struct kmem_list3 *);
#if DEBUG
cache_cache.obj_size = cache_cache.buffer_size;
#endif
}
}
- /* 6) resize the head arrays to their final sizes */
- {
- struct kmem_cache *cachep;
- mutex_lock(&cache_chain_mutex);
- list_for_each_entry(cachep, &cache_chain, next)
- if (enable_cpucache(cachep, GFP_NOWAIT))
- BUG();
- mutex_unlock(&cache_chain_mutex);
- }
+ g_cpucache_up = EARLY;
+}
- /* Annotate slab for lockdep -- annotate the malloc caches */
- init_lock_keys();
+void __init kmem_cache_init_late(void)
+{
+ struct kmem_cache *cachep;
+ /* 6) resize the head arrays to their final sizes */
+ mutex_lock(&cache_chain_mutex);
+ list_for_each_entry(cachep, &cache_chain, next)
+ if (enable_cpucache(cachep, GFP_NOWAIT))
+ BUG();
+ mutex_unlock(&cache_chain_mutex);
/* Done! */
g_cpucache_up = FULL;
+ /* Annotate slab for lockdep -- annotate the malloc caches */
+ init_lock_keys();
+
/*
* Register a cpu startup notifier callback that initializes
* cpu_cache_get for all new cpus
*/
register_cpu_notifier(&cpucache_notifier);
+#ifdef CONFIG_NUMA
+ /*
+ * Register a memory hotplug callback that initializes and frees
+ * nodelists.
+ */
+ hotplug_memory_notifier(slab_memory_callback, SLAB_CALLBACK_PRI);
+#endif
+
/*
* The reap timers are started later, with a module init call: That part
* of the kernel is not yet operational.
if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
flags |= __GFP_RECLAIMABLE;
- page = alloc_pages_node(nodeid, flags, cachep->gfporder);
+ page = alloc_pages_exact_node(nodeid, flags | __GFP_NOTRACK, cachep->gfporder);
if (!page)
return NULL;
NR_SLAB_UNRECLAIMABLE, nr_pages);
for (i = 0; i < nr_pages; i++)
__SetPageSlab(page + i);
+
+ if (kmemcheck_enabled && !(cachep->flags & SLAB_NOTRACK)) {
+ kmemcheck_alloc_shadow(page, cachep->gfporder, flags, nodeid);
+
+ if (cachep->ctor)
+ kmemcheck_mark_uninitialized_pages(page, nr_pages);
+ else
+ kmemcheck_mark_unallocated_pages(page, nr_pages);
+ }
+
return page_address(page);
}
struct page *page = virt_to_page(addr);
const unsigned long nr_freed = i;
+ kmemcheck_free_shadow(page, cachep->gfporder);
+
if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
sub_zone_page_state(page_zone(page),
NR_SLAB_RECLAIMABLE, nr_freed);
for_each_online_node(node) {
cachep->nodelists[node] =
kmalloc_node(sizeof(struct kmem_list3),
- GFP_KERNEL, node);
+ gfp, node);
BUG_ON(!cachep->nodelists[node]);
kmem_list3_init(cachep->nodelists[node]);
}
}
}
- cachep->nodelists[numa_node_id()]->next_reap =
+ cachep->nodelists[numa_mem_id()]->next_reap =
jiffies + REAPTIMEOUT_LIST3 +
((unsigned long)cachep) % REAPTIMEOUT_LIST3;
*
* @name must be valid until the cache is destroyed. This implies that
* the module calling this has to destroy the cache before getting unloaded.
- * Note that kmem_cache_name() is not guaranteed to return the same pointer,
- * therefore applications must manage it themselves.
*
* The flags are
*
if (!cachep)
goto oops;
+ cachep->nodelists = (struct kmem_list3 **)&cachep->array[nr_cpu_ids];
#if DEBUG
cachep->obj_size = size;
}
#if FORCED_DEBUG && defined(CONFIG_DEBUG_PAGEALLOC)
if (size >= malloc_sizes[INDEX_L3 + 1].cs_size
- && cachep->obj_size > cache_line_size() && size < PAGE_SIZE) {
- cachep->obj_offset += PAGE_SIZE - size;
+ && cachep->obj_size > cache_line_size() && ALIGN(size, align) < PAGE_SIZE) {
+ cachep->obj_offset += PAGE_SIZE - ALIGN(size, align);
size = PAGE_SIZE;
}
#endif
/*
* Determine if the slab management is 'on' or 'off' slab.
* (bootstrapping cannot cope with offslab caches so don't do
- * it too early on.)
+ * it too early on. Always use on-slab management when
+ * SLAB_NOLEAKTRACE to avoid recursive calls into kmemleak)
*/
- if ((size >= (PAGE_SIZE >> 3)) && !slab_early_init)
+ if ((size >= (PAGE_SIZE >> 3)) && !slab_early_init &&
+ !(flags & SLAB_NOLEAKTRACE))
/*
* Size is large, assume best to place the slab management obj
* off-slab (should allow better packing of objs).
/* really off slab. No need for manual alignment */
slab_size =
cachep->num * sizeof(kmem_bufctl_t) + sizeof(struct slab);
+
+#ifdef CONFIG_PAGE_POISONING
+ /* If we're going to use the generic kernel_map_pages()
+ * poisoning, then it's going to smash the contents of
+ * the redzone and userword anyhow, so switch them off.
+ */
+ if (size % PAGE_SIZE == 0 && flags & SLAB_POISON)
+ flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER);
+#endif
}
cachep->colour_off = cache_line_size();
{
#ifdef CONFIG_SMP
check_irq_off();
- assert_spin_locked(&cachep->nodelists[numa_node_id()]->list_lock);
+ assert_spin_locked(&cachep->nodelists[numa_mem_id()]->list_lock);
#endif
}
{
struct kmem_cache *cachep = arg;
struct array_cache *ac;
- int node = numa_node_id();
+ int node = numa_mem_id();
check_irq_off();
ac = cpu_cache_get(cachep);
*
* The cache must be empty before calling this function.
*
- * The caller must guarantee that noone will allocate memory from the cache
+ * The caller must guarantee that no one will allocate memory from the cache
* during the kmem_cache_destroy().
*/
void kmem_cache_destroy(struct kmem_cache *cachep)
}
if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU))
- synchronize_rcu();
+ rcu_barrier();
__kmem_cache_destroy(cachep);
mutex_unlock(&cache_chain_mutex);
* kmemleak does not treat the ->s_mem pointer as a reference
* to the object. Otherwise we will not report the leak.
*/
- kmemleak_scan_area(slabp, offsetof(struct slab, list),
- sizeof(struct list_head), local_flags);
+ kmemleak_scan_area(&slabp->list, sizeof(struct list_head),
+ local_flags);
if (!slabp)
return NULL;
} else {
/*
* Map pages beginning at addr to the given cache and slab. This is required
* for the slab allocator to be able to lookup the cache and slab of a
- * virtual address for kfree, ksize, kmem_ptr_validate, and slab debugging.
+ * virtual address for kfree, ksize, and slab debugging.
*/
static void slab_map_pages(struct kmem_cache *cache, struct slab *slab,
void *addr)
retry:
check_irq_off();
- node = numa_node_id();
+ node = numa_mem_id();
ac = cpu_cache_get(cachep);
batchcount = ac->batchcount;
if (!ac->touched && batchcount > BATCHREFILL_LIMIT) {
spin_lock(&l3->list_lock);
/* See if we can refill from the shared array */
- if (l3->shared && transfer_objects(ac, l3->shared, batchcount))
+ if (l3->shared && transfer_objects(ac, l3->shared, batchcount)) {
+ l3->shared->touched = 1;
goto alloc_done;
+ }
while (batchcount > 0) {
struct list_head *entry;
objp += obj_offset(cachep);
if (cachep->ctor && cachep->flags & SLAB_POISON)
cachep->ctor(objp);
-#if ARCH_SLAB_MINALIGN
- if ((u32)objp & (ARCH_SLAB_MINALIGN-1)) {
+ if (ARCH_SLAB_MINALIGN &&
+ ((unsigned long)objp & (ARCH_SLAB_MINALIGN-1))) {
printk(KERN_ERR "0x%p: not aligned to ARCH_SLAB_MINALIGN=%d\n",
- objp, ARCH_SLAB_MINALIGN);
+ objp, (int)ARCH_SLAB_MINALIGN);
}
-#endif
return objp;
}
#else
if (cachep == &cache_cache)
return false;
- return should_failslab(obj_size(cachep), flags);
+ return should_failslab(obj_size(cachep), flags, cachep->flags);
}
static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags)
} else {
STATS_INC_ALLOCMISS(cachep);
objp = cache_alloc_refill(cachep, flags);
+ /*
+ * the 'ac' may be updated by cache_alloc_refill(),
+ * and kmemleak_erase() requires its correct value.
+ */
+ ac = cpu_cache_get(cachep);
}
/*
* To avoid a false negative, if an object that is in one of the
* per-CPU caches is leaked, we need to make sure kmemleak doesn't
* treat the array pointers as a reference to the object.
*/
- kmemleak_erase(&ac->entry[ac->avail]);
+ if (objp)
+ kmemleak_erase(&ac->entry[ac->avail]);
return objp;
}
if (in_interrupt() || (flags & __GFP_THISNODE))
return NULL;
- nid_alloc = nid_here = numa_node_id();
+ nid_alloc = nid_here = numa_mem_id();
+ get_mems_allowed();
if (cpuset_do_slab_mem_spread() && (cachep->flags & SLAB_MEM_SPREAD))
- nid_alloc = cpuset_mem_spread_node();
+ nid_alloc = cpuset_slab_spread_node();
else if (current->mempolicy)
nid_alloc = slab_node(current->mempolicy);
+ put_mems_allowed();
if (nid_alloc != nid_here)
return ____cache_alloc_node(cachep, flags, nid_alloc);
return NULL;
if (flags & __GFP_THISNODE)
return NULL;
+ get_mems_allowed();
zonelist = node_zonelist(slab_node(current->mempolicy), flags);
local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK);
if (local_flags & __GFP_WAIT)
local_irq_enable();
kmem_flagcheck(cache, flags);
- obj = kmem_getpages(cache, local_flags, -1);
+ obj = kmem_getpages(cache, local_flags, numa_mem_id());
if (local_flags & __GFP_WAIT)
local_irq_disable();
if (obj) {
}
}
}
+ put_mems_allowed();
return obj;
}
{
unsigned long save_flags;
void *ptr;
+ int slab_node = numa_mem_id();
+
+ flags &= gfp_allowed_mask;
lockdep_trace_alloc(flags);
cache_alloc_debugcheck_before(cachep, flags);
local_irq_save(save_flags);
- if (unlikely(nodeid == -1))
- nodeid = numa_node_id();
+ if (nodeid == NUMA_NO_NODE)
+ nodeid = slab_node;
if (unlikely(!cachep->nodelists[nodeid])) {
/* Node not bootstrapped yet */
goto out;
}
- if (nodeid == numa_node_id()) {
+ if (nodeid == slab_node) {
/*
* Use the locally cached objects if possible.
* However ____cache_alloc does not allow fallback
kmemleak_alloc_recursive(ptr, obj_size(cachep), 1, cachep->flags,
flags);
+ if (likely(ptr))
+ kmemcheck_slab_alloc(cachep, flags, ptr, obj_size(cachep));
+
if (unlikely((flags & __GFP_ZERO) && ptr))
memset(ptr, 0, obj_size(cachep));
* We may just have run out of memory on the local node.
* ____cache_alloc_node() knows how to locate memory on other nodes
*/
- if (!objp)
- objp = ____cache_alloc_node(cache, flags, numa_node_id());
+ if (!objp)
+ objp = ____cache_alloc_node(cache, flags, numa_mem_id());
out:
return objp;
unsigned long save_flags;
void *objp;
+ flags &= gfp_allowed_mask;
+
lockdep_trace_alloc(flags);
if (slab_should_failslab(cachep, flags))
flags);
prefetchw(objp);
+ if (likely(objp))
+ kmemcheck_slab_alloc(cachep, flags, objp, obj_size(cachep));
+
if (unlikely((flags & __GFP_ZERO) && objp))
memset(objp, 0, obj_size(cachep));
{
int batchcount;
struct kmem_list3 *l3;
- int node = numa_node_id();
+ int node = numa_mem_id();
batchcount = ac->batchcount;
#if DEBUG
* Release an obj back to its cache. If the obj has a constructed state, it must
* be in this state _before_ it is released. Called with disabled ints.
*/
-static inline void __cache_free(struct kmem_cache *cachep, void *objp)
+static inline void __cache_free(struct kmem_cache *cachep, void *objp,
+ void *caller)
{
struct array_cache *ac = cpu_cache_get(cachep);
check_irq_off();
kmemleak_free_recursive(objp, cachep->flags);
- objp = cache_free_debugcheck(cachep, objp, __builtin_return_address(0));
+ objp = cache_free_debugcheck(cachep, objp, caller);
+
+ kmemcheck_slab_free(cachep, objp, obj_size(cachep));
/*
* Skip calling cache_free_alien() when the platform is not numa.
* variable to skip the call, which is mostly likely to be present in
* the cache.
*/
- if (numa_platform && cache_free_alien(cachep, objp))
+ if (nr_online_nodes > 1 && cache_free_alien(cachep, objp))
return;
if (likely(ac->avail < ac->limit)) {
}
EXPORT_SYMBOL(kmem_cache_alloc);
-#ifdef CONFIG_KMEMTRACE
-void *kmem_cache_alloc_notrace(struct kmem_cache *cachep, gfp_t flags)
+#ifdef CONFIG_TRACING
+void *
+kmem_cache_alloc_trace(size_t size, struct kmem_cache *cachep, gfp_t flags)
{
- return __cache_alloc(cachep, flags, __builtin_return_address(0));
-}
-EXPORT_SYMBOL(kmem_cache_alloc_notrace);
-#endif
+ void *ret;
-/**
- * kmem_ptr_validate - check if an untrusted pointer might be a slab entry.
- * @cachep: the cache we're checking against
- * @ptr: pointer to validate
- *
- * This verifies that the untrusted pointer looks sane;
- * it is _not_ a guarantee that the pointer is actually
- * part of the slab cache in question, but it at least
- * validates that the pointer can be dereferenced and
- * looks half-way sane.
- *
- * Currently only used for dentry validation.
- */
-int kmem_ptr_validate(struct kmem_cache *cachep, const void *ptr)
-{
- unsigned long addr = (unsigned long)ptr;
- unsigned long min_addr = PAGE_OFFSET;
- unsigned long align_mask = BYTES_PER_WORD - 1;
- unsigned long size = cachep->buffer_size;
- struct page *page;
+ ret = __cache_alloc(cachep, flags, __builtin_return_address(0));
- if (unlikely(addr < min_addr))
- goto out;
- if (unlikely(addr > (unsigned long)high_memory - size))
- goto out;
- if (unlikely(addr & align_mask))
- goto out;
- if (unlikely(!kern_addr_valid(addr)))
- goto out;
- if (unlikely(!kern_addr_valid(addr + size - 1)))
- goto out;
- page = virt_to_page(ptr);
- if (unlikely(!PageSlab(page)))
- goto out;
- if (unlikely(page_get_cache(page) != cachep))
- goto out;
- return 1;
-out:
- return 0;
+ trace_kmalloc(_RET_IP_, ret,
+ size, slab_buffer_size(cachep), flags);
+ return ret;
}
+EXPORT_SYMBOL(kmem_cache_alloc_trace);
+#endif
#ifdef CONFIG_NUMA
void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid)
}
EXPORT_SYMBOL(kmem_cache_alloc_node);
-#ifdef CONFIG_KMEMTRACE
-void *kmem_cache_alloc_node_notrace(struct kmem_cache *cachep,
- gfp_t flags,
- int nodeid)
+#ifdef CONFIG_TRACING
+void *kmem_cache_alloc_node_trace(size_t size,
+ struct kmem_cache *cachep,
+ gfp_t flags,
+ int nodeid)
{
- return __cache_alloc_node(cachep, flags, nodeid,
+ void *ret;
+
+ ret = __cache_alloc_node(cachep, flags, nodeid,
__builtin_return_address(0));
+ trace_kmalloc_node(_RET_IP_, ret,
+ size, slab_buffer_size(cachep),
+ flags, nodeid);
+ return ret;
}
-EXPORT_SYMBOL(kmem_cache_alloc_node_notrace);
+EXPORT_SYMBOL(kmem_cache_alloc_node_trace);
#endif
static __always_inline void *
__do_kmalloc_node(size_t size, gfp_t flags, int node, void *caller)
{
struct kmem_cache *cachep;
- void *ret;
cachep = kmem_find_general_cachep(size, flags);
if (unlikely(ZERO_OR_NULL_PTR(cachep)))
return cachep;
- ret = kmem_cache_alloc_node_notrace(cachep, flags, node);
-
- trace_kmalloc_node((unsigned long) caller, ret,
- size, cachep->buffer_size, flags, node);
-
- return ret;
+ return kmem_cache_alloc_node_trace(size, cachep, flags, node);
}
-#if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_KMEMTRACE)
+#if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_TRACING)
void *__kmalloc_node(size_t size, gfp_t flags, int node)
{
return __do_kmalloc_node(size, flags, node,
return __do_kmalloc_node(size, flags, node, NULL);
}
EXPORT_SYMBOL(__kmalloc_node);
-#endif /* CONFIG_DEBUG_SLAB */
+#endif /* CONFIG_DEBUG_SLAB || CONFIG_TRACING */
#endif /* CONFIG_NUMA */
/**
}
-#if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_KMEMTRACE)
+#if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_TRACING)
void *__kmalloc(size_t size, gfp_t flags)
{
return __do_kmalloc(size, flags, __builtin_return_address(0));
debug_check_no_locks_freed(objp, obj_size(cachep));
if (!(cachep->flags & SLAB_DEBUG_OBJECTS))
debug_check_no_obj_freed(objp, obj_size(cachep));
- __cache_free(cachep, objp);
+ __cache_free(cachep, objp, __builtin_return_address(0));
local_irq_restore(flags);
trace_kmem_cache_free(_RET_IP_, objp);
c = virt_to_cache(objp);
debug_check_no_locks_freed(objp, obj_size(c));
debug_check_no_obj_freed(objp, obj_size(c));
- __cache_free(c, (void *)objp);
+ __cache_free(c, (void *)objp, __builtin_return_address(0));
local_irq_restore(flags);
}
EXPORT_SYMBOL(kfree);
}
EXPORT_SYMBOL(kmem_cache_size);
-const char *kmem_cache_name(struct kmem_cache *cachep)
-{
- return cachep->name;
-}
-EXPORT_SYMBOL_GPL(kmem_cache_name);
-
/*
* This initializes kmem_list3 or resizes various caches for all nodes.
*/
struct ccupdate_struct {
struct kmem_cache *cachep;
- struct array_cache *new[NR_CPUS];
+ struct array_cache *new[0];
};
static void do_ccupdate_local(void *info)
struct ccupdate_struct *new;
int i;
- new = kzalloc(sizeof(*new), gfp);
+ new = kzalloc(sizeof(*new) + nr_cpu_ids * sizeof(struct array_cache *),
+ gfp);
if (!new)
return -ENOMEM;
for_each_online_cpu(i) {
- new->new[i] = alloc_arraycache(cpu_to_node(i), limit,
+ new->new[i] = alloc_arraycache(cpu_to_mem(i), limit,
batchcount, gfp);
if (!new->new[i]) {
for (i--; i >= 0; i--)
struct array_cache *ccold = new->new[i];
if (!ccold)
continue;
- spin_lock_irq(&cachep->nodelists[cpu_to_node(i)]->list_lock);
- free_block(cachep, ccold->entry, ccold->avail, cpu_to_node(i));
- spin_unlock_irq(&cachep->nodelists[cpu_to_node(i)]->list_lock);
+ spin_lock_irq(&cachep->nodelists[cpu_to_mem(i)]->list_lock);
+ free_block(cachep, ccold->entry, ccold->avail, cpu_to_mem(i));
+ spin_unlock_irq(&cachep->nodelists[cpu_to_mem(i)]->list_lock);
kfree(ccold);
}
kfree(new);
* necessary. Note that the l3 listlock also protects the array_cache
* if drain_array() is used on the shared array.
*/
-void drain_array(struct kmem_cache *cachep, struct kmem_list3 *l3,
+static void drain_array(struct kmem_cache *cachep, struct kmem_list3 *l3,
struct array_cache *ac, int force, int node)
{
int tofree;
{
struct kmem_cache *searchp;
struct kmem_list3 *l3;
- int node = numa_node_id();
+ int node = numa_mem_id();
struct delayed_work *work = to_delayed_work(w);
if (!mutex_trylock(&cache_chain_mutex))
unsigned long node_frees = cachep->node_frees;
unsigned long overflows = cachep->node_overflow;
- seq_printf(m, " : globalstat %7lu %6lu %5lu %4lu \
- %4lu %4lu %4lu %4lu %4lu", allocs, high, grown,
- reaped, errors, max_freeable, node_allocs,
- node_frees, overflows);
+ seq_printf(m, " : globalstat %7lu %6lu %5lu %4lu "
+ "%4lu %4lu %4lu %4lu %4lu",
+ allocs, high, grown,
+ reaped, errors, max_freeable, node_allocs,
+ node_frees, overflows);
}
/* cpu stats */
{
* @count: data length
* @ppos: unused
*/
-ssize_t slabinfo_write(struct file *file, const char __user * buffer,
+static ssize_t slabinfo_write(struct file *file, const char __user *buffer,
size_t count, loff_t *ppos)
{
char kbuf[MAX_SLABINFO_WRITE + 1], *tmp;