Index: vm/gc_gen/src/common/gc_common.cpp =================================================================== --- vm/gc_gen/src/common/gc_common.cpp (revision 583342) +++ vm/gc_gen/src/common/gc_common.cpp (working copy) @@ -62,6 +62,13 @@ extern Boolean IS_MOVE_COMPACT; extern Boolean USE_CONCURRENT_GC; +#if defined(ALLOC_ZEROING) && defined(ALLOC_PREFETCH) +POINTER_SIZE_INT PREFETCH_DISTANCE = 1024; +POINTER_SIZE_INT ZEROING_SIZE = 256; +POINTER_SIZE_INT PREFETCH_STRIDE = 64; +Boolean PREFETCH_ENABLED = FALSE; +#endif + static int get_int_property(const char *property_name) { assert(property_name); @@ -280,6 +287,33 @@ USE_CONCURRENT_GC= get_boolean_property("gc.concurrent_gc"); } +#if defined(ALLOC_ZEROING) && defined(ALLOC_PREFETCH) + if(is_property_set("gc.prefetch",VM_PROPERTIES) ==1) { + PREFETCH_ENABLED=get_boolean_property("gc.prefetch"); + } + + if(is_property_set("gc.prefetch_distance",VM_PROPERTIES)==1) { + PREFETCH_DISTANCE = get_size_property("gc.prefetch_distance"); + if(!PREFETCH_ENABLED) { + WARN2("gc.prefetch_distance","Warning: Prefetch distance set with Prefetch disabled!"); + } + } + + if(is_property_set("gc.prefetch_stride",VM_PROPERTIES)==1) { + PREFETCH_STRIDE = get_size_property("gc.prefetch_stride"); + if(!PREFETCH_ENABLED) { + WARN2("gc.prefetch_stride","Warning: Prefetch stride set with Prefetch disabled!"); + } + } + + if(is_property_set("gc.zeroing_size",VM_PROPERTIES)==1) { + ZEROING_SIZE = get_size_property("gc.zeroing_size"); + if(!PREFETCH_ENABLED) { + WARN2("gc.zeroing_size","Warning: Zeroing size set with Prefetch disabled!"); + } + } +#endif + return; } Index: vm/gc_gen/src/thread/gc_thread.h =================================================================== --- vm/gc_gen/src/thread/gc_thread.h (revision 583351) +++ vm/gc_gen/src/thread/gc_thread.h (working copy) @@ -26,18 +26,37 @@ #include "../common/gc_metadata.h" #define ALLOC_ZEROING -#define ZEROING_SIZE 2*KB +#define ALLOC_PREFETCH +#ifdef ALLOC_ZEROING +#ifdef 
ALLOC_PREFETCH + +#ifdef _WINDOWS_ +#include <xmmintrin.h> +#define prefetchnta(pref_addr) _mm_prefetch((char*)(pref_addr), _MM_HINT_NTA ) +#else +#define prefetchnta(pref_addr) __asm__ ("prefetchnta (%0)"::"r"(pref_addr)) +#endif + +extern POINTER_SIZE_INT PREFETCH_DISTANCE; +extern POINTER_SIZE_INT ZEROING_SIZE; +extern POINTER_SIZE_INT PREFETCH_STRIDE; +extern Boolean PREFETCH_ENABLED; +#else +#define ZEROING_SIZE 256 +#endif +#endif + extern POINTER_SIZE_INT tls_gc_offset; inline void* gc_get_tls() -{ +{ void* tls_base = vm_thread_local(); return (void*)*(POINTER_SIZE_INT*)((char*)tls_base + tls_gc_offset); } inline void gc_set_tls(void* gc_tls_info) -{ +{ void* tls_base = vm_thread_local(); *(POINTER_SIZE_INT*)((char*)tls_base + tls_gc_offset) = (POINTER_SIZE_INT)gc_tls_info; } @@ -56,26 +75,37 @@ inline void thread_local_unalloc(unsigned int size, Allocator* allocator) { - void* free = allocator->free; + void* free = allocator->free; allocator->free = (void*)((POINTER_SIZE_INT)free - size); return; } #ifdef ALLOC_ZEROING -inline Partial_Reveal_Object* thread_local_alloc_zeroing(unsigned int size, Allocator* allocator) +FORCE_INLINE Partial_Reveal_Object* thread_local_alloc_zeroing(unsigned int size, Allocator* allocator) { POINTER_SIZE_INT free = (POINTER_SIZE_INT)allocator->free; POINTER_SIZE_INT ceiling = (POINTER_SIZE_INT)allocator->ceiling; - + POINTER_SIZE_INT new_free = free + size; - + POINTER_SIZE_INT block_ceiling = (POINTER_SIZE_INT)allocator->end; - if( new_free > block_ceiling) + if( new_free > block_ceiling) return NULL; POINTER_SIZE_INT new_ceiling; new_ceiling = new_free + ZEROING_SIZE; + +#ifdef ALLOC_PREFETCH + if(PREFETCH_ENABLED) { + POINTER_SIZE_INT pre_addr = new_free, pref_stride= PREFETCH_STRIDE, pref_dist= new_ceiling + PREFETCH_DISTANCE; + do{ + prefetchnta(pre_addr); + pre_addr += pref_stride; + }while(pre_addr< pref_dist); + } +#endif + if( new_ceiling > block_ceiling ) new_ceiling = block_ceiling; @@ -88,20 +118,20 @@ #endif /* ALLOC_ZEROING */ 
-inline Partial_Reveal_Object* thread_local_alloc(unsigned int size, Allocator* allocator) +FORCE_INLINE Partial_Reveal_Object* thread_local_alloc(unsigned int size, Allocator* allocator) { POINTER_SIZE_INT free = (POINTER_SIZE_INT)allocator->free; POINTER_SIZE_INT ceiling = (POINTER_SIZE_INT)allocator->ceiling; - + POINTER_SIZE_INT new_free = free + size; - + if (new_free <= ceiling){ - allocator->free= (void*)new_free; + allocator->free= (void*)new_free; return (Partial_Reveal_Object*)free; } #ifndef ALLOC_ZEROING - + return NULL; #else @@ -112,11 +142,11 @@ } -inline void allocator_init_free_block(Allocator* allocator, Block_Header* alloc_block) +FORCE_INLINE void allocator_init_free_block(Allocator* allocator, Block_Header* alloc_block) { assert(alloc_block->status == BLOCK_FREE); alloc_block->status = BLOCK_IN_USE; - + /* set allocation context */ void* new_free = alloc_block->free; allocator->free = new_free; @@ -127,16 +157,23 @@ memset(new_free, 0, GC_BLOCK_BODY_SIZE_BYTES); #else - /* the first-time zeroing area includes block header, to make subsequent allocs page aligned */ - unsigned int zeroing_size = ZEROING_SIZE - GC_BLOCK_HEADER_SIZE_BYTES; - allocator->ceiling = (void*)((POINTER_SIZE_INT)new_free + zeroing_size); - memset(new_free, 0, zeroing_size); +#ifdef ALLOC_PREFETCH + if(PREFETCH_ENABLED) { + POINTER_SIZE_INT pre_addr = (POINTER_SIZE_INT) new_free, pref_stride= PREFETCH_STRIDE, pref_dist= pre_addr + PREFETCH_DISTANCE; + do{ + prefetchnta(pre_addr); + pre_addr += pref_stride; + }while(pre_addr< pref_dist); + } +#endif + allocator->ceiling = (void*)((POINTER_SIZE_INT)new_free + ZEROING_SIZE); + memset(new_free, 0, ZEROING_SIZE); #endif /* #ifndef ALLOC_ZEROING */ allocator->end = alloc_block->ceiling; - allocator->alloc_block = (Block*)alloc_block; - + allocator->alloc_block = (Block*)alloc_block; + return; } @@ -144,17 +181,17 @@ { Block_Header* block = (Block_Header*)allocator->alloc_block; /* it can be NULL if GC happens before the mutator 
resumes, or called by collector */ - if( block != NULL ){ + if( block != NULL ){ assert(block->status == BLOCK_IN_USE); block->free = allocator->free; block->status = BLOCK_USED; allocator->alloc_block = NULL; } - + allocator->free = NULL; allocator->ceiling = NULL; allocator->end = NULL; - + return; }