/* -*- linux-c -*-
 * Memory allocation functions
 * Copyright (C) 2005-2008 Red Hat Inc.
 *
 * This file is part of systemtap, and is free software.  You can
 * redistribute it and/or modify it under the terms of the GNU General
 * Public License (GPL); either version 2, or (at your option) any
 * later version.
 */

#ifndef _STAPLINUX_ALLOC_C_
#define _STAPLINUX_ALLOC_C_

#include <linux/percpu.h>

static int _stp_allocated_net_memory = 0;

/* Default, and should be "safe" from anywhere. */
#define STP_ALLOC_FLAGS ((GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN) \
			 & ~__GFP_WAIT)

/* May only be used in a context that can sleep.  __GFP_NORETRY is used
   to keep the oom-killer from kicking in. */
#define STP_ALLOC_SLEEP_FLAGS (GFP_KERNEL | __GFP_NORETRY)

/* #define DEBUG_MEM */
/*
 * If DEBUG_MEM is defined (stap -DDEBUG_MEM ...) then full memory
 * tracking is used.  Each allocation is recorded and matched with
 * a free.  Also a fence is set around the allocated memory so overflows
 * and underflows can be detected.  Errors are written to the system log
 * with printk.
 *
 * NOTE: if your system is slow or your script makes a very large number
 * of allocations, you may get a warning in the system log:
 * BUG: soft lockup - CPU#1 stuck for 11s! [staprun:28269]
 * This is an expected side-effect of the overhead of tracking,
 * especially with a simple linked list of allocations.  Optimization
 * would be nice, but DEBUG_MEM is only for testing.
 */

static int _stp_allocated_memory = 0;

#ifdef DEBUG_MEM
static DEFINE_SPINLOCK(_stp_mem_lock);

#define MEM_MAGIC 0xc11cf77f
#define MEM_FENCE_SIZE 32

enum _stp_memtype { MEM_KMALLOC, MEM_PERCPU };

typedef struct {
	char *alloc;
	char *free;
} _stp_malloc_type;

static const _stp_malloc_type _stp_malloc_types[] = {
	{"kmalloc", "kfree"},
	{"alloc_percpu", "free_percpu"}
};

struct _stp_mem_entry {
	struct list_head list;
	int32_t magic;
	enum _stp_memtype type;
	size_t len;
	void *addr;
};

#define MEM_DEBUG_SIZE (2*MEM_FENCE_SIZE+sizeof(struct _stp_mem_entry))

static LIST_HEAD(_stp_mem_list);

static void _stp_check_mem_fence(char *addr, int size)
{
	char *ptr;

	ptr = addr - MEM_FENCE_SIZE;
	while (ptr < addr) {
		if (*ptr != 0x55) {
			printk("SYSTEMTAP ERROR: Memory fence corrupted before allocated memory\n");
			printk("at addr %p. (Allocation starts at %p)", ptr, addr);
			return;
		}
		ptr++;
	}
	ptr = addr + size;
	while (ptr < addr + size + MEM_FENCE_SIZE) {
		if (*ptr != 0x55) {
			printk("SYSTEMTAP ERROR: Memory fence corrupted after allocated memory\n");
			printk("at addr %p. (Allocation ends at %p)", ptr, addr + size - 1);
			return;
		}
		ptr++;
	}
}

static void *_stp_mem_debug_setup(void *addr, size_t size, enum _stp_memtype type)
{
	struct list_head *p;
	struct _stp_mem_entry *m;

	memset(addr, 0x55, MEM_FENCE_SIZE);
	addr += MEM_FENCE_SIZE;
	memset(addr + size, 0x55, MEM_FENCE_SIZE);
	p = (struct list_head *)(addr + size + MEM_FENCE_SIZE);
	m = (struct _stp_mem_entry *)p;
	m->magic = MEM_MAGIC;
	m->type = type;
	m->len = size;
	m->addr = addr;
	spin_lock(&_stp_mem_lock);
	list_add(p, &_stp_mem_list);
	spin_unlock(&_stp_mem_lock);
	return addr;
}
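/*
 * For reference, the block layout _stp_mem_debug_setup() produces for a
 * kmalloc-style allocation (a sketch derived from the code above;
 * offsets are symbolic):
 *
 *   base (from kmalloc, size + MEM_DEBUG_SIZE bytes total)
 *   +0                        : MEM_FENCE_SIZE bytes of 0x55
 *   +MEM_FENCE_SIZE           : size bytes returned to the caller
 *   +MEM_FENCE_SIZE + size    : MEM_FENCE_SIZE bytes of 0x55
 *   +2*MEM_FENCE_SIZE + size  : struct _stp_mem_entry, linked into
 *                               _stp_mem_list
 *
 * _stp_check_mem_fence() verifies the two 0x55 regions when the block
 * is freed or validated.
 */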
/* Percpu allocations don't have the fence.  Implementing it is
   problematic. */
static void _stp_mem_debug_percpu(struct _stp_mem_entry *m, void *addr, size_t size)
{
	struct list_head *p = (struct list_head *)m;
	m->magic = MEM_MAGIC;
	m->type = MEM_PERCPU;
	m->len = size;
	m->addr = addr;
	spin_lock(&_stp_mem_lock);
	list_add(p, &_stp_mem_list);
	spin_unlock(&_stp_mem_lock);
}

static void _stp_mem_debug_free(void *addr, enum _stp_memtype type)
{
	int found = 0;
	struct list_head *p, *tmp;
	struct _stp_mem_entry *m = NULL;

	spin_lock(&_stp_mem_lock);
	list_for_each_safe(p, tmp, &_stp_mem_list) {
		m = list_entry(p, struct _stp_mem_entry, list);
		if (m->addr == addr) {
			list_del(p);
			found = 1;
			break;
		}
	}
	spin_unlock(&_stp_mem_lock);
	if (!found) {
		printk("SYSTEMTAP ERROR: Free of unallocated memory %p type=%s\n",
		       addr, _stp_malloc_types[type].free);
		return;
	}
	if (m->magic != MEM_MAGIC) {
		printk("SYSTEMTAP ERROR: Memory at %p corrupted!!\n", addr);
		return;
	}
	if (m->type != type) {
		printk("SYSTEMTAP ERROR: Memory allocated with %s and freed with %s\n",
		       _stp_malloc_types[m->type].alloc,
		       _stp_malloc_types[type].free);
	}

	switch (m->type) {
	case MEM_KMALLOC:
		_stp_check_mem_fence(addr, m->len);
		kfree(addr - MEM_FENCE_SIZE);
		break;
	case MEM_PERCPU:
		free_percpu(addr);
		kfree(p);
		break;
	default:
		printk("SYSTEMTAP ERROR: Attempted to free memory at addr %p len=%d with unknown allocation type.\n",
		       addr, (int)m->len);
	}
	return;
}

static void _stp_mem_debug_validate(void *addr)
{
	int found = 0;
	struct list_head *p, *tmp;
	struct _stp_mem_entry *m = NULL;

	spin_lock(&_stp_mem_lock);
	list_for_each_safe(p, tmp, &_stp_mem_list) {
		m = list_entry(p, struct _stp_mem_entry, list);
		if (m->addr == addr) {
			found = 1;
			break;
		}
	}
	spin_unlock(&_stp_mem_lock);
	if (!found) {
		printk("SYSTEMTAP ERROR: Couldn't validate memory %p\n", addr);
		return;
	}
	if (m->magic != MEM_MAGIC) {
		printk("SYSTEMTAP ERROR: Memory at %p corrupted!!\n", addr);
		return;
	}

	switch (m->type) {
	case MEM_KMALLOC:
		_stp_check_mem_fence(addr, m->len);
		break;
	case MEM_PERCPU:
		/* do nothing */
		break;
	default:
		printk("SYSTEMTAP ERROR: Attempted to validate memory at addr %p len=%d with unknown allocation type.\n",
		       addr, (int)m->len);
	}
	return;
}
#endif

/* #define STP_MAXMEMORY 8192 */
/*
 * If STP_MAXMEMORY is defined to a value (stap -DSTP_MAXMEMORY=8192
 * ...) then every memory allocation is checked to make sure the
 * systemtap module doesn't use more than STP_MAXMEMORY of memory.
 * STP_MAXMEMORY is specified in kilobytes, so, for example, '8192'
 * means that the systemtap module won't use more than 8 megabytes of
 * memory.
 *
 * Note 1: This size includes the size of the module itself, plus any
 * additional allocations.
 *
 * Note 2: Since we can't be sure that the module transport is set up
 * when a memory allocation problem happens, this code can't directly
 * report an error back to the user (so instead it uses 'printk').  If
 * the module's transport has been set up, the code that calls the
 * memory allocation functions
 * (_stp_kmalloc/_stp_kzalloc/etc.) should report an error directly to
 * the user.
 *
 * Note 3: This only tracks direct allocations by the systemtap
 * runtime.  This does not track indirect allocations (such as those
 * done by kprobes/uprobes/etc. internals).
 */
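/*
 * Worked example of the limit check used below (illustrative numbers,
 * not taken from any real run): with stap -DSTP_MAXMEMORY=8192 the
 * budget is 8192 * 1024 = 8388608 bytes.  If the module core is 4 MB
 * and 3.5 MB has already been counted in _stp_allocated_memory, a
 * further 1 MB request makes the sum 8.5 MB, which exceeds the budget,
 * so the wrapper returns NULL without calling kmalloc().
 */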
#ifdef STP_MAXMEMORY
#ifndef STAPCONF_GRSECURITY
#define _STP_MODULE_CORE_SIZE (THIS_MODULE->core_size)
#else
#define _STP_MODULE_CORE_SIZE (THIS_MODULE->core_size_rw)
#endif
#endif

static void *_stp_kmalloc_gfp(size_t size, gfp_t gfp_mask)
{
	void *ret;
#ifdef STP_MAXMEMORY
	if ((_STP_MODULE_CORE_SIZE + _stp_allocated_memory + size)
	    > (STP_MAXMEMORY * 1024)) {
		return NULL;
	}
#endif
#ifdef DEBUG_MEM
	ret = kmalloc(size + MEM_DEBUG_SIZE, gfp_mask);
	if (likely(ret)) {
		_stp_allocated_memory += size;
		ret = _stp_mem_debug_setup(ret, size, MEM_KMALLOC);
	}
#else
	ret = kmalloc(size, gfp_mask);
	if (likely(ret)) {
		_stp_allocated_memory += size;
	}
#endif
	return ret;
}

static void *_stp_kmalloc(size_t size)
{
	return _stp_kmalloc_gfp(size, STP_ALLOC_FLAGS);
}

static void *_stp_kzalloc_gfp(size_t size, gfp_t gfp_mask)
#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,15)
{
	void *ret;
#ifdef STP_MAXMEMORY
	if ((_STP_MODULE_CORE_SIZE + _stp_allocated_memory + size)
	    > (STP_MAXMEMORY * 1024)) {
		return NULL;
	}
#endif
#ifdef DEBUG_MEM
	ret = kmalloc(size + MEM_DEBUG_SIZE, gfp_mask);
	if (likely(ret)) {
		_stp_allocated_memory += size;
		ret = _stp_mem_debug_setup(ret, size, MEM_KMALLOC);
		memset (ret, 0, size);
	}
#else
	ret = kmalloc(size, gfp_mask);
	if (likely(ret)) {
		_stp_allocated_memory += size;
		memset (ret, 0, size);
	}
#endif /* DEBUG_MEM */
	return ret;
}
#else /* LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,15) */
{
	void *ret;
#ifdef STP_MAXMEMORY
	if ((_STP_MODULE_CORE_SIZE + _stp_allocated_memory + size)
	    > (STP_MAXMEMORY * 1024)) {
		return NULL;
	}
#endif
#ifdef DEBUG_MEM
	ret = kzalloc(size + MEM_DEBUG_SIZE, gfp_mask);
	if (likely(ret)) {
		_stp_allocated_memory += size;
		ret = _stp_mem_debug_setup(ret, size, MEM_KMALLOC);
	}
#else
	ret = kzalloc(size, gfp_mask);
	if (likely(ret)) {
		_stp_allocated_memory += size;
	}
#endif
	return ret;
}
#endif /* LINUX_VERSION_CODE < KERNEL_VERSION(2,6,15) */

static void *_stp_kzalloc(size_t size)
{
	return _stp_kzalloc_gfp(size, STP_ALLOC_FLAGS);
}
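/*
 * Usage sketch (a hypothetical caller, not part of this file; the
 * struct name is made up): the default STP_ALLOC_FLAGS clears
 * __GFP_WAIT and sets __GFP_NORETRY, so allocations will not sleep and
 * can fail easily.  Callers must always check for NULL.
 *
 *	struct my_buf *buf = _stp_kzalloc(sizeof(*buf));
 *	if (buf == NULL)
 *		return -ENOMEM;
 *	...
 *	_stp_kfree(buf);
 */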
#ifdef PCPU_MIN_UNIT_SIZE
#define _STP_MAX_PERCPU_SIZE PCPU_MIN_UNIT_SIZE
#else
#define _STP_MAX_PERCPU_SIZE 131072
#endif

/* Note, calls __alloc_percpu which may sleep and always uses
   GFP_KERNEL. */
static void *_stp_alloc_percpu(size_t size)
{
	void *ret;

	if (size > _STP_MAX_PERCPU_SIZE)
		return NULL;

#ifdef STP_MAXMEMORY
	if ((_STP_MODULE_CORE_SIZE + _stp_allocated_memory
	     + (size * num_online_cpus()))
	    > (STP_MAXMEMORY * 1024)) {
		return NULL;
	}
#endif

#ifdef STAPCONF_ALLOC_PERCPU_ALIGN
	ret = __alloc_percpu(size, 8);
#else
	ret = __alloc_percpu(size);
#endif
#ifdef DEBUG_MEM
	if (likely(ret)) {
		struct _stp_mem_entry *m = kmalloc(sizeof(struct _stp_mem_entry), GFP_KERNEL);
		if (unlikely(m == NULL)) {
			free_percpu(ret);
			return NULL;
		}
		_stp_allocated_memory += size * num_online_cpus();
		_stp_mem_debug_percpu(m, ret, size);
	}
#else
	if (likely(ret)) {
		_stp_allocated_memory += size * num_online_cpus();
	}
#endif
	return ret;
}

#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,12)
#define _stp_kmalloc_node(size,node) _stp_kmalloc(size)
#define _stp_kmalloc_node_gfp(size,node,gfp) _stp_kmalloc_gfp(size,gfp)
#define _stp_kzalloc_node(size,node) _stp_kzalloc(size)
#define _stp_kzalloc_node_gfp(size,node,gfp) _stp_kzalloc_gfp(size,gfp)
#else
static void *_stp_kmalloc_node_gfp(size_t size, int node, gfp_t gfp_mask)
{
	void *ret;
#ifdef STP_MAXMEMORY
	if ((_STP_MODULE_CORE_SIZE + _stp_allocated_memory + size)
	    > (STP_MAXMEMORY * 1024)) {
		return NULL;
	}
#endif
#ifdef DEBUG_MEM
	ret = kmalloc_node(size + MEM_DEBUG_SIZE, gfp_mask, node);
	if (likely(ret)) {
		_stp_allocated_memory += size;
		ret = _stp_mem_debug_setup(ret, size, MEM_KMALLOC);
	}
#else
	ret = kmalloc_node(size, gfp_mask, node);
	if (likely(ret)) {
		_stp_allocated_memory += size;
	}
#endif
	return ret;
}

static void *_stp_kzalloc_node_gfp(size_t size, int node, gfp_t gfp_mask)
{
	/* This used to be simply:
	 *   return _stp_kmalloc_node_gfp(size, node, gfp_mask | __GFP_ZERO);
	 * but rhel5-era kernels BUG out on that flag.  (PR14957)
	 *
	 * We could make a big #if-alternation for kernels which have
	 * support for kzalloc_node (kernel commit 979b0fea), as in
	 * _stp_kzalloc_gfp, but IMO that's needlessly complex.
	 *
	 * So for now, just malloc and zero it manually. */
	void *ret = _stp_kmalloc_node_gfp(size, node, gfp_mask);
	if (likely(ret)) {
		memset (ret, 0, size);
	}
	return ret;
}

static void *_stp_kmalloc_node(size_t size, int node)
{
	return _stp_kmalloc_node_gfp(size, node, STP_ALLOC_FLAGS);
}

static void *_stp_kzalloc_node(size_t size, int node)
{
	return _stp_kzalloc_node_gfp(size, node, STP_ALLOC_FLAGS);
}
#endif /* LINUX_VERSION_CODE */

static void _stp_kfree(void *addr)
{
#ifdef DEBUG_MEM
	_stp_mem_debug_free(addr, MEM_KMALLOC);
#else
	kfree(addr);
#endif
}

static void _stp_free_percpu(void *addr)
{
#ifdef DEBUG_MEM
	_stp_mem_debug_free(addr, MEM_PERCPU);
#else
	free_percpu(addr);
#endif
}
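/*
 * Usage sketch (a hypothetical caller, not part of this file; the
 * variable name is made up): per-cpu memory must be released with
 * _stp_free_percpu() and kmalloc-style memory with _stp_kfree().
 * Mixing the two is reported as an error when DEBUG_MEM is enabled.
 *
 *	long *counters = _stp_alloc_percpu(sizeof(*counters));
 *	if (counters == NULL)
 *		return -ENOMEM;
 *	...
 *	_stp_free_percpu(counters);
 */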
static void _stp_mem_debug_done(void)
{
#ifdef DEBUG_MEM
	struct list_head *p, *tmp;
	struct _stp_mem_entry *m;

	spin_lock(&_stp_mem_lock);
	list_for_each_safe(p, tmp, &_stp_mem_list) {
		m = list_entry(p, struct _stp_mem_entry, list);
		list_del(p);
		printk("SYSTEMTAP ERROR: Memory %p len=%d allocation type: %s. Not freed.\n",
		       m->addr, (int)m->len, _stp_malloc_types[m->type].alloc);
		if (m->magic != MEM_MAGIC) {
			printk("SYSTEMTAP ERROR: Memory at %p len=%d corrupted!!\n",
			       m->addr, (int)m->len);
			/* Don't free.  Too dangerous. */
			goto done;
		}
		switch (m->type) {
		case MEM_KMALLOC:
			_stp_check_mem_fence(m->addr, m->len);
			kfree(m->addr - MEM_FENCE_SIZE);
			break;
		case MEM_PERCPU:
			free_percpu(m->addr);
			kfree(p);
			break;
		default:
			printk("SYSTEMTAP ERROR: Attempted to free memory at addr %p len=%d with unknown allocation type.\n",
			       m->addr, (int)m->len);
		}
	}
done:
	spin_unlock(&_stp_mem_lock);
	return;
#endif
}

#endif /* _STAPLINUX_ALLOC_C_ */