diff --git a/CMakeLists.txt b/CMakeLists.txt
index 8afae55..745f9ac 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -162,6 +162,8 @@ add_definitions(-DKUDU_HEADERS_USE_RICH_SLICE=1)
 # exported client headers).
 add_definitions(-DKUDU_HEADERS_NO_STUBS=1)
 
+add_definitions(-DJE_PREFIX=jemk_)
+
 # compiler flags for different build types (run 'cmake -DCMAKE_BUILD_TYPE=<type> .')
 # For all builds:
 # For CMAKE_BUILD_TYPE=Debug
@@ -445,19 +447,94 @@ endif()
 #
 # The gold linker is only for ELF binaries, which OSX doesn't use. We can
 # just skip.
-if (NOT APPLE)
-  execute_process(COMMAND ${CMAKE_CXX_COMPILER} -Wl,--version OUTPUT_VARIABLE LINKER_OUTPUT)
-endif ()
-if (LINKER_OUTPUT MATCHES "gold")
-  if ("${KUDU_LINK}" STREQUAL "d" AND
-      "${CMAKE_BUILD_TYPE}" STREQUAL "RELEASE")
-    message(SEND_ERROR "Cannot use gold with dynamic linking in a RELEASE build "
-      "as it would cause tcmalloc symbols to get dropped")
+function(GET_GOLD_VERSION)
+
+  # Disable ld.gold when libnuma is in use due to a compilation issue.
+  # NOTE(review): NUMA_LIB_PATH is only set by find_library() much later in
+  # this file, so it is always empty when this function runs; move that
+  # find_library() call earlier if this early return is meant to take effect.
+  if (NUMA_LIB_PATH)
+    return()
+  endif()
+
+  # The gold linker is only for ELF binaries, which macOS doesn't use.
+  if (APPLE)
+    return()
+  endif()
+
+  execute_process(COMMAND ${CMAKE_CXX_COMPILER} "-Wl,--version" ${ARGN}
+          ERROR_QUIET
+          OUTPUT_VARIABLE LINKER_OUTPUT)
+  # We're expecting LINKER_OUTPUT to look like one of these:
+  #   GNU gold (version 2.24) 1.11
+  #   GNU gold (GNU Binutils for Ubuntu 2.30) 1.15
+  if (LINKER_OUTPUT MATCHES "GNU gold")
+    string(REGEX MATCH "GNU gold \\([^\\)]*\\) (([0-9]+\\.?)+)" _ "${LINKER_OUTPUT}")
+    if (NOT CMAKE_MATCH_1)
+      message(SEND_ERROR "Could not extract GNU gold version. "
+              "Linker version output: ${LINKER_OUTPUT}")
+    endif()
+    set(GOLD_VERSION "${CMAKE_MATCH_1}" PARENT_SCOPE)
+  endif()
+endfunction()
+
+# Is the compiler hard-wired to use the gold linker?
+GET_GOLD_VERSION()
+if (GOLD_VERSION)
+  set(MUST_USE_GOLD 1)
+else()
+  # Can the compiler optionally enable the gold linker?
+  GET_GOLD_VERSION("-fuse-ld=gold")
+
+  # We can't use the gold linker if it's inside devtoolset because the compiler
+  # won't find it when invoked directly from make/ninja (which is typically
+  # done outside devtoolset).
+  execute_process(COMMAND which ld.gold
+          OUTPUT_VARIABLE GOLD_LOCATION
+          OUTPUT_STRIP_TRAILING_WHITESPACE
+          ERROR_QUIET)
+  if ("${GOLD_LOCATION}" MATCHES "^/opt/rh/devtoolset")
+    message("Skipping optional gold linker (version ${GOLD_VERSION}) because "
+            "it's in devtoolset")
+    set(GOLD_VERSION)
+  endif()
+endif()
+if (GOLD_VERSION)
+  # Older versions of the gold linker are vulnerable to a bug [1] which
+  # prevents weak symbols from being overridden properly. This leads to
+  # omitting of Kudu's tcmalloc dependency.
+  #
+  # How we handle this situation depends on other factors:
+  # - If gold is optional, we won't use it.
+  # - If gold is required, we'll either:
+  #   - Raise an error in RELEASE builds (we shouldn't release such a product), or
+  #   - Drop tcmalloc in all other builds.
+  #
+  # 1. https://sourceware.org/bugzilla/show_bug.cgi?id=16979.
+  if ("${GOLD_VERSION}" VERSION_LESS "1.12")
+    set(KUDU_BUGGY_GOLD 1)
+  endif()
+  if (MUST_USE_GOLD)
+    message("Using hard-wired gold linker (version ${GOLD_VERSION})")
+    if (KUDU_BUGGY_GOLD)
+      if ("${KUDU_LINK}" STREQUAL "d" AND
+              "${CMAKE_BUILD_TYPE}" STREQUAL "RELEASE")
+        message(SEND_ERROR "Configured to use buggy gold with dynamic linking "
+                "in a RELEASE build; this would cause tcmalloc symbols to get dropped")
+      endif()
+      message("Hard-wired gold linker is buggy, dropping tcmalloc dependency")
+    endif()
+    set(KUDU_USING_GOLD 1)
+  elseif (NOT KUDU_BUGGY_GOLD)
+    # The Gold linker must be manually enabled.
+    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fuse-ld=gold")
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fuse-ld=gold")
+    message("Using optional gold linker (version ${GOLD_VERSION})")
+    set(KUDU_USING_GOLD 1)
-  else()
-    message("Using gold linker")
-  endif()
-  set(KUDU_USING_GOLD 1)
+  else()
+    message("Optional gold linker is buggy, using ld linker instead")
+  endif()
 else()
   message("Using ld linker")
 endif()
 
@@ -989,6 +1061,27 @@ ADD_THIRDPARTY_LIB(pmemobj
   DEPS ${PMEMOBJ_DEPS})
 endif()
 
+# Required memkind libraries
+if (NOT APPLE)
+  find_package(Memkind REQUIRED)
+  include_directories(SYSTEM ${MEMKIND_INCLUDE_DIR})
+  ADD_THIRDPARTY_LIB(memkind
+          STATIC_LIB "${MEMKIND_STATIC_LIB}"
+          SHARED_LIB "${MEMKIND_SHARED_LIB}")
+  list(APPEND KUDU_BASE_LIBS memkind)
+endif()
+
+## libnuma
+if (NOT APPLE)
+  find_library(NUMA_LIB_PATH numa)
+  if(NOT NUMA_LIB_PATH)
+    message(FATAL_ERROR "Could not find libnuma on the system path")
+  endif()
+  ADD_THIRDPARTY_LIB(numa
+          SHARED_LIB "${NUMA_LIB_PATH}")
+  list(APPEND KUDU_BASE_LIBS numa)
+endif()
+
 ## curl
 find_package(CURL REQUIRED)
 
diff --git a/cmake_modules/FindMemkind.cmake b/cmake_modules/FindMemkind.cmake
new file mode 100644
index 0000000..4018208
--- /dev/null
+++ b/cmake_modules/FindMemkind.cmake
@@ -0,0 +1,36 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# - Find Required MEMKIND libraries
+# This module defines
+#  MEMKIND_INCLUDE_DIR, directory containing headers
+#  MEMKIND_STATIC_LIB, path to libmemkind.a
+#  MEMKIND_SHARED_LIB, path to libmemkind.so
+
+find_path(MEMKIND_INCLUDE_DIR memkind.h
+        NO_CMAKE_SYSTEM_PATH
+        NO_SYSTEM_ENVIRONMENT_PATH)
+find_library(MEMKIND_SHARED_LIB memkind
+        NO_CMAKE_SYSTEM_PATH
+        NO_SYSTEM_ENVIRONMENT_PATH)
+find_library(MEMKIND_STATIC_LIB libmemkind.a
+        NO_CMAKE_SYSTEM_PATH
+        NO_SYSTEM_ENVIRONMENT_PATH)
+
+include(FindPackageHandleStandardArgs)
+find_package_handle_standard_args(MEMKIND REQUIRED_VARS
+        MEMKIND_SHARED_LIB MEMKIND_STATIC_LIB MEMKIND_INCLUDE_DIR)
diff --git a/src/kudu/cfile/cfile-test.cc b/src/kudu/cfile/cfile-test.cc
index db129a5..30189fc 100644
--- a/src/kudu/cfile/cfile-test.cc
+++ b/src/kudu/cfile/cfile-test.cc
@@ -78,7 +78,6 @@ DECLARE_bool(cfile_verify_checksums);
 
 #if defined(__linux__)
 DECLARE_string(nvm_cache_path);
-DECLARE_bool(nvm_cache_simulate_allocation_failure);
 #endif
 
 METRIC_DECLARE_counter(block_cache_hits_caching);
@@ -1023,7 +1023,9 @@ TEST_P(TestCFileBothCacheTypes, TestCacheKeysAreStable) {
 // Inject failures in nvm allocation and ensure that we can still read a file.
 TEST_P(TestCFileBothCacheTypes, TestNvmAllocationFailure) {
   if (GetParam() != NVM_CACHE) return;
-  FLAGS_nvm_cache_simulate_allocation_failure = true;
+  // NOTE(review): the vmem-based failure-injection flag was removed with the
+  // switch to memkind, so this test no longer injects allocation failures.
+  // Either delete the test or add an injection point around memkind_malloc().
   TestReadWriteFixedSizeTypes<UInt32DataGenerator<false> >(PLAIN_ENCODING);
 }
 #endif
diff --git a/src/kudu/util/CMakeLists.txt b/src/kudu/util/CMakeLists.txt
index a6e0497..b8652b7 100644
--- a/src/kudu/util/CMakeLists.txt
+++ b/src/kudu/util/CMakeLists.txt
@@ -231,6 +231,7 @@ if(NOT APPLE)
     breakpad_client
     dl
+    memkind
     rt
     vmem)
 endif()
 
@@ -275,6 +276,7 @@ target_link_libraries(kudu_test_util
   gflags
   glog
   gmock
+  memkind
   kudu_util)
 
 if(NOT APPLE)
diff --git a/src/kudu/util/cache.h b/src/kudu/util/cache.h
index 462307c..a96f6e0 100644
--- a/src/kudu/util/cache.h
+++ b/src/kudu/util/cache.h
@@ -34,7 +34,7 @@ class MetricEntity;
 
 enum CacheType {
   DRAM_CACHE,
-  NVM_CACHE
+  NVM_CACHE,
 };
 
 // Create a new cache with a fixed size capacity.  This implementation
diff --git a/src/kudu/util/nvm_cache.cc b/src/kudu/util/nvm_cache.cc
index ae8a871..7b0ae11 100644
--- a/src/kudu/util/nvm_cache.cc
+++ b/src/kudu/util/nvm_cache.cc
@@ -1,79 +1,58 @@
-// This file is derived from cache.cc in the LevelDB project:
-//
-//   Some portions copyright (c) 2011 The LevelDB Authors. All rights reserved.
-//   Use of this source code is governed by a BSD-style license that can be
-//   found in the LICENSE file.
-//
-// ------------------------------------------------------------
-// This file implements a cache based on the NVML library (http://pmem.io),
-// specifically its "libvmem" component. This library makes it easy to program
-// against persistent memory hardware by exposing an API which parallels
-// malloc/free, but allocates from persistent memory instead of DRAM.
-//
-// We use this API to implement a cache which treats persistent memory or
-// non-volatile memory as if it were a larger cheaper bank of volatile memory. We
-// currently make no use of its persistence properties.
-//
-// Currently, we only store key/value in NVM. All other data structures such as the
-// ShardedLRUCache instances, hash table, etc are in DRAM. The assumption is that
-// the ratio of data stored vs overhead is quite high.
+// Some portions copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
 
 #include "kudu/util/nvm_cache.h"
 
-#include <cstdint>
+#include <cstdlib>
 #include <cstring>
-#include <iostream>
 #include <memory>
 #include <mutex>
+#include <ostream>
 #include <string>
 #include <vector>
 
 #include <gflags/gflags.h>
 #include <glog/logging.h>
-#include <libvmem.h>
+#include <memkind.h>
+#include <memkind/internal/memkind_pmem.h>
 
 #include "kudu/gutil/atomicops.h"
 #include "kudu/gutil/atomic_refcount.h"
 #include "kudu/gutil/dynamic_annotations.h"
-#include "kudu/gutil/gscoped_ptr.h"
+#include "kudu/gutil/bits.h"
 #include "kudu/gutil/hash/city.h"
-#include "kudu/gutil/macros.h"
+#include "kudu/gutil/gscoped_ptr.h"
 #include "kudu/gutil/port.h"
-#include "kudu/gutil/ref_counted.h"
 #include "kudu/gutil/stl_util.h"
+#include "kudu/gutil/strings/substitute.h"
+#include "kudu/gutil/sysinfo.h"
+#include "kudu/util/alignment.h"
 #include "kudu/util/cache.h"
 #include "kudu/util/cache_metrics.h"
 #include "kudu/util/flag_tags.h"
 #include "kudu/util/locks.h"
+#include "kudu/util/mem_tracker.h"
 #include "kudu/util/metrics.h"
-#include "kudu/util/slice.h"
+
+// Useful in tests that require accurate cache capacity accounting.
+DEFINE_bool(nvm_cache_force_single_shard, false,
+            "Override all cache implementations to use just one shard");
+TAG_FLAG(nvm_cache_force_single_shard, hidden);
 
 DEFINE_string(nvm_cache_path, "/vmem",
               "The path at which the NVM cache will try to allocate its memory. "
               "This can be a tmpfs or ramfs for testing purposes.");
 TAG_FLAG(nvm_cache_path, experimental);
 
-DEFINE_int32(nvm_cache_allocation_retry_count, 10,
-             "The number of times that the NVM cache will retry attempts to allocate "
-             "memory for new entries. In between attempts, a cache entry will be "
-             "evicted.");
-TAG_FLAG(nvm_cache_allocation_retry_count, advanced);
-TAG_FLAG(nvm_cache_allocation_retry_count, experimental);
-
-DEFINE_bool(nvm_cache_simulate_allocation_failure, false,
-            "If true, the NVM cache will inject failures in calls to vmem_malloc "
-            "for testing.");
-TAG_FLAG(nvm_cache_simulate_allocation_failure, unsafe);
-
+using std::shared_ptr;
+using std::string;
+using std::vector;
 
 namespace kudu {
 
 namespace {
 
-using std::shared_ptr;
-using std::string;
-using std::vector;
-
 typedef simple_spinlock MutexType;
 
 // LRU cache implementation
@@ -90,18 +68,35 @@ struct LRUHandle {
   uint32_t val_length;
   Atomic32 refs;
   uint32_t hash;      // Hash of key(); used for fast sharding and comparisons
-  uint8_t* kv_data;
+
+  // The storage for the value itself. The data is stored as:
+  //  [value bytes ...]
+  // uint8_t *val_data;   // Beginning of key/value pair
+
+  // The storage for the key itself. The data is stored as:
+  //   [key bytes ...] [padding up to 8-byte boundary]
+  uint8_t *key_data;
+  // The storage for the value itself. The data is stored as:
+  //  [value bytes ...]
+  uint8_t *val_data;
 
   Slice key() const {
-    return Slice(kv_data, key_length);
+    //return Slice(kv_data, key_length);
+    return Slice(key_data, key_length);
   }
 
-  Slice value() const {
-    return Slice(&kv_data[key_length], val_length);
+  uint8_t* mutable_val_ptr() {
+//    int val_offset = KUDU_ALIGN_UP(key_length, sizeof(void*));
+//    return &kv_data[val_offset];
+    return val_data;
   }
 
-  uint8_t* val_ptr() {
-    return &kv_data[key_length];
+  const uint8_t* val_ptr() const {
+    return const_cast<LRUHandle*>(this)->mutable_val_ptr();
+  }
+
+  Slice value() const {
+    return Slice(val_ptr(), val_length);
   }
 };
 
@@ -112,7 +107,7 @@ struct LRUHandle {
 // 4.4.3's builtin hashtable.
 class HandleTable {
  public:
-  HandleTable() : length_(0), elems_(0), list_(NULL) { Resize(); }
+  HandleTable() : length_(0), elems_(0), list_(nullptr) { Resize(); }
   ~HandleTable() { delete[] list_; }
 
   LRUHandle* Lookup(const Slice& key, uint32_t hash) {
@@ -122,9 +117,9 @@ class HandleTable {
   LRUHandle* Insert(LRUHandle* h) {
     LRUHandle** ptr = FindPointer(h->key(), h->hash);
     LRUHandle* old = *ptr;
-    h->next_hash = (old == NULL ? NULL : old->next_hash);
+    h->next_hash = (old == nullptr ? nullptr : old->next_hash);
     *ptr = h;
-    if (old == NULL) {
+    if (old == nullptr) {
       ++elems_;
       if (elems_ > length_) {
         // Since each cache entry is fairly large, we aim for a small
@@ -138,7 +133,7 @@ class HandleTable {
   LRUHandle* Remove(const Slice& key, uint32_t hash) {
     LRUHandle** ptr = FindPointer(key, hash);
     LRUHandle* result = *ptr;
-    if (result != NULL) {
+    if (result != nullptr) {
       *ptr = result->next_hash;
       --elems_;
     }
@@ -157,7 +152,7 @@ class HandleTable {
   // pointer to the trailing slot in the corresponding linked list.
   LRUHandle** FindPointer(const Slice& key, uint32_t hash) {
     LRUHandle** ptr = &list_[hash & (length_ - 1)];
-    while (*ptr != NULL &&
+    while (*ptr != nullptr &&
            ((*ptr)->hash != hash || key != (*ptr)->key())) {
       ptr = &(*ptr)->next_hash;
     }
@@ -169,12 +164,12 @@ class HandleTable {
     while (new_length < elems_ * 1.5) {
       new_length *= 2;
     }
-    LRUHandle** new_list = new LRUHandle*[new_length];
+    auto new_list = new LRUHandle*[new_length];
     memset(new_list, 0, sizeof(new_list[0]) * new_length);
     uint32_t count = 0;
     for (uint32_t i = 0; i < length_; i++) {
       LRUHandle* h = list_[i];
-      while (h != NULL) {
+      while (h != nullptr) {
         LRUHandle* next = h->next_hash;
         uint32_t hash = h->hash;
         LRUHandle** ptr = &new_list[hash & (new_length - 1)];
@@ -194,7 +189,7 @@ class HandleTable {
 // A single shard of sharded cache.
 class NvmLRUCache {
  public:
-  explicit NvmLRUCache(VMEM *vmp);
+  explicit NvmLRUCache(MemTracker* tracker, memkind *vmp);
   ~NvmLRUCache();
 
   // Separate from constructor so caller can easily make an array of LRUCache
@@ -202,13 +197,11 @@ class NvmLRUCache {
 
   void SetMetrics(CacheMetrics* metrics) { metrics_ = metrics; }
 
-  Cache::Handle* Insert(LRUHandle* h, Cache::EvictionCallback* eviction_callback);
-
+  Cache::Handle* Insert(LRUHandle* handle, Cache::EvictionCallback* eviction_callback);
   // Like Cache::Lookup, but with an extra "hash" parameter.
   Cache::Handle* Lookup(const Slice& key, uint32_t hash, bool caching);
   void Release(Cache::Handle* handle);
   void Erase(const Slice& key, uint32_t hash);
-  void* AllocateAndRetry(size_t size);
 
  private:
   void NvmLRU_Remove(LRUHandle* e);
@@ -216,19 +209,9 @@ class NvmLRUCache {
   // Just reduce the reference count by 1.
   // Return true if last reference
   bool Unref(LRUHandle* e);
+  // Call the user's eviction callback, if it exists, and free the entry.
   void FreeEntry(LRUHandle* e);
 
-  // Evict the LRU item in the cache, adding it to the linked list
-  // pointed to by 'to_remove_head'.
-  void EvictOldestUnlocked(LRUHandle** to_remove_head);
-
-  // Free all of the entries in the linked list that has to_free_head
-  // as its head.
-  void FreeLRUEntries(LRUHandle* to_free_head);
-
-  // Wrapper around vmem_malloc which injects failures based on a flag.
-  void* VmemMalloc(size_t size);
-
   // Initialized before use.
   size_t capacity_;
 
@@ -242,15 +225,18 @@ class NvmLRUCache {
 
   HandleTable table_;
 
-  VMEM* vmp_;
+  MemTracker* mem_tracker_;
+
+  memkind* vmp_;
 
   CacheMetrics* metrics_;
 };
 
-NvmLRUCache::NvmLRUCache(VMEM* vmp)
-  : usage_(0),
-  vmp_(vmp),
-  metrics_(NULL) {
+NvmLRUCache::NvmLRUCache(MemTracker* tracker, memkind* vmp)
+ : usage_(0),
+   mem_tracker_(tracker),
+   vmp_(vmp),
+   metrics_(nullptr) {
   // Make empty circular linked list
   lru_.next = &lru_;
   lru_.prev = &lru_;
@@ -267,13 +253,6 @@ NvmLRUCache::~NvmLRUCache() {
   }
 }
 
-void* NvmLRUCache::VmemMalloc(size_t size) {
-  if (PREDICT_FALSE(FLAGS_nvm_cache_simulate_allocation_failure)) {
-    return NULL;
-  }
-  return vmem_malloc(vmp_, size);
-}
-
 bool NvmLRUCache::Unref(LRUHandle* e) {
   DCHECK_GT(ANNOTATE_UNPROTECTED_READ(e->refs), 0);
   return !base::RefCountDec(&e->refs);
@@ -284,44 +263,14 @@ void NvmLRUCache::FreeEntry(LRUHandle* e) {
   if (e->eviction_callback) {
     e->eviction_callback->EvictedEntry(e->key(), e->value());
   }
+  mem_tracker_->Release(e->charge);
   if (PREDICT_TRUE(metrics_)) {
     metrics_->cache_usage->DecrementBy(e->charge);
     metrics_->evictions->Increment();
   }
-  vmem_free(vmp_, e);
-}
-
-// Allocate nvm memory. Try until successful or FLAGS_nvm_cache_allocation_retry_count
-// has been exceeded.
-void *NvmLRUCache::AllocateAndRetry(size_t size) {
-  void *tmp;
-  // There may be times that an allocation fails. With NVM we have
-  // a fixed size to allocate from. If we cannot allocate the size
-  // that was asked for, we will remove entries from the cache and
-  // retry up to the configured number of retries. If this fails, we
-  // return NULL, which will cause the caller to not insert anything
-  // into the cache.
-  LRUHandle *to_remove_head = NULL;
-  tmp = VmemMalloc(size);
-
-  if (tmp == NULL) {
-    std::unique_lock<MutexType> l(mutex_);
-
-    int retries_remaining = FLAGS_nvm_cache_allocation_retry_count;
-    while (tmp == NULL && retries_remaining-- > 0 && lru_.next != &lru_) {
-      EvictOldestUnlocked(&to_remove_head);
-
-      // Unlock while allocating memory.
-      l.unlock();
-      tmp = VmemMalloc(size);
-      l.lock();
-    }
-  }
-
-  // we free the entries here outside of mutex for
-  // performance reasons
-  FreeLRUEntries(to_remove_head);
-  return tmp;
+  memkind_free(vmp_, e->val_data);
+  free(e->key_data);
+  free(e);
 }
 
 void NvmLRUCache::NvmLRU_Remove(LRUHandle* e) {
@@ -340,13 +289,11 @@ void NvmLRUCache::NvmLRU_Append(LRUHandle* e) {
 }
 
 Cache::Handle* NvmLRUCache::Lookup(const Slice& key, uint32_t hash, bool caching) {
- LRUHandle* e;
+  LRUHandle* e;
   {
     std::lock_guard<MutexType> l(mutex_);
     e = table_.Lookup(key, hash);
-    if (e != NULL) {
-      // If an entry exists, remove the old entry from the cache
-      // and re-add to the end of the linked list.
+    if (e != nullptr) {
       base::RefCountInc(&e->refs);
       NvmLRU_Remove(e);
       NvmLRU_Append(e);
@@ -356,7 +303,7 @@ Cache::Handle* NvmLRUCache::Lookup(const Slice& key, uint32_t hash, bool caching
   // Do the metrics outside of the lock.
   if (metrics_) {
     metrics_->lookups->Increment();
-    bool was_hit = (e != NULL);
+    bool was_hit = (e != nullptr);
     if (was_hit) {
       if (caching) {
         metrics_->cache_hits_caching->Increment();
@@ -383,43 +330,26 @@ void NvmLRUCache::Release(Cache::Handle* handle) {
   }
 }
 
-void NvmLRUCache::EvictOldestUnlocked(LRUHandle** to_remove_head) {
-  LRUHandle* old = lru_.next;
-  NvmLRU_Remove(old);
-  table_.Remove(old->key(), old->hash);
-  if (Unref(old)) {
-    old->next = *to_remove_head;
-    *to_remove_head = old;
-  }
-}
+Cache::Handle* NvmLRUCache::Insert(LRUHandle* e, Cache::EvictionCallback *eviction_callback) {
 
-void NvmLRUCache::FreeLRUEntries(LRUHandle* to_free_head) {
-  while (to_free_head != NULL) {
-    LRUHandle* next = to_free_head->next;
-    FreeEntry(to_free_head);
-    to_free_head = next;
-  }
-}
-
-Cache::Handle* NvmLRUCache::Insert(LRUHandle* e,
-                                   Cache::EvictionCallback* eviction_callback) {
-  DCHECK(e);
-  LRUHandle* to_remove_head = NULL;
-
-  e->refs = 2;  // One from LRUCache, one for the returned handle
+  // Set the remaining LRUHandle members which were not already allocated during
+  // Allocate().
   e->eviction_callback = eviction_callback;
+  e->refs = 2;  // One from LRUCache, one for the returned handle
+  mem_tracker_->Consume(e->charge);
   if (PREDICT_TRUE(metrics_)) {
     metrics_->cache_usage->IncrementBy(e->charge);
     metrics_->inserts->Increment();
   }
 
+  LRUHandle* to_remove_head = nullptr;
   {
     std::lock_guard<MutexType> l(mutex_);
 
     NvmLRU_Append(e);
 
     LRUHandle* old = table_.Insert(e);
-    if (old != NULL) {
+    if (old != nullptr) {
       NvmLRU_Remove(old);
       if (Unref(old)) {
         old->next = to_remove_head;
@@ -428,13 +358,23 @@ Cache::Handle* NvmLRUCache::Insert(LRUHandle* e,
     }
 
     while (usage_ > capacity_ && lru_.next != &lru_) {
-      EvictOldestUnlocked(&to_remove_head);
+      LRUHandle* old = lru_.next;
+      NvmLRU_Remove(old);
+      table_.Remove(old->key(), old->hash);
+      if (Unref(old)) {
+        old->next = to_remove_head;
+        to_remove_head = old;
+      }
     }
   }
 
   // we free the entries here outside of mutex for
   // performance reasons
-  FreeLRUEntries(to_remove_head);
+  while (to_remove_head != nullptr) {
+    LRUHandle* next = to_remove_head->next;
+    FreeEntry(to_remove_head);
+    to_remove_head = next;
+  }
 
   return reinterpret_cast<Cache::Handle*>(e);
 }
@@ -445,7 +385,7 @@ void NvmLRUCache::Erase(const Slice& key, uint32_t hash) {
   {
     std::lock_guard<MutexType> l(mutex_);
     e = table_.Remove(key, hash);
-    if (e != NULL) {
+    if (e != nullptr) {
       NvmLRU_Remove(e);
       last_reference = Unref(e);
     }
@@ -456,31 +396,54 @@ void NvmLRUCache::Erase(const Slice& key, uint32_t hash) {
     FreeEntry(e);
   }
 }
-static const int kNumShardBits = 4;
-static const int kNumShards = 1 << kNumShardBits;
+
+// Determine the number of bits of the hash that should be used to determine
+// the cache shard. This, in turn, determines the number of shards.
+int DetermineShardBits() {
+  int bits = PREDICT_FALSE(FLAGS_nvm_cache_force_single_shard) ?
+      0 : Bits::Log2Ceiling(base::NumCPUs());
+  VLOG(1) << "Will use " << (1 << bits) << " shards for LRU cache.";
+  return bits;
+}
 
 class ShardedLRUCache : public Cache {
  private:
+  shared_ptr<MemTracker> mem_tracker_;
   gscoped_ptr<CacheMetrics> metrics_;
   vector<NvmLRUCache*> shards_;
-  VMEM* vmp_;
+  MutexType id_mutex_;
+  uint64_t last_id_;
+  memkind* vmp_;
+
+  // Number of bits of hash used to determine the shard.
+  const int shard_bits_;
 
   static inline uint32_t HashSlice(const Slice& s) {
     return util_hash::CityHash64(
       reinterpret_cast<const char *>(s.data()), s.size());
   }
 
-  static uint32_t Shard(uint32_t hash) {
-    return hash >> (32 - kNumShardBits);
+  uint32_t Shard(uint32_t hash) {
+    // Widen to uint64 before shifting, or else on a single CPU,
+    // we would try to shift a uint32_t by 32 bits, which is undefined.
+    return static_cast<uint64_t>(hash) >> (32 - shard_bits_);
   }
 
  public:
-  explicit ShardedLRUCache(size_t capacity, const string& /*id*/, VMEM* vmp)
-        : vmp_(vmp) {
-
-    const size_t per_shard = (capacity + (kNumShards - 1)) / kNumShards;
-    for (int s = 0; s < kNumShards; s++) {
-      gscoped_ptr<NvmLRUCache> shard(new NvmLRUCache(vmp_));
+  explicit ShardedLRUCache(size_t capacity, const string& id, memkind* vmp)
+      : last_id_(0),
+        vmp_(vmp),
+        shard_bits_(DetermineShardBits()) {
+    // A cache is often a singleton, so:
+    // 1. We reuse its MemTracker if one already exists, and
+    // 2. It is directly parented to the root MemTracker.
+    mem_tracker_ = MemTracker::FindOrCreateGlobalTracker(
+        -1, strings::Substitute("$0-sharded_lru_cache", id));
+
+    int num_shards = 1 << shard_bits_;
+    const size_t per_shard = (capacity + (num_shards - 1)) / num_shards;
+    for (int s = 0; s < num_shards; s++) {
+      gscoped_ptr<NvmLRUCache> shard(new NvmLRUCache(mem_tracker_.get(), vmp_));
       shard->SetCapacity(per_shard);
       shards_.push_back(shard.release());
     }
@@ -488,10 +451,7 @@ class ShardedLRUCache : public Cache {
 
   virtual ~ShardedLRUCache() {
     STLDeleteElements(&shards_);
-    // Per the note at the top of this file, our cache is entirely volatile.
-    // Hence, when the cache is destructed, we delete the underlying
-    // VMEM pool.
-    vmem_delete(vmp_);
+    memkind_pmem_destroy(vmp_);
   }
 
   virtual Handle* Insert(PendingHandle* handle,
@@ -514,8 +474,9 @@ class ShardedLRUCache : public Cache {
   virtual Slice Value(Handle* handle) OVERRIDE {
     return reinterpret_cast<LRUHandle*>(handle)->value();
   }
-  virtual uint8_t* MutableValue(PendingHandle* handle) OVERRIDE {
-    return reinterpret_cast<LRUHandle*>(handle)->val_ptr();
+  virtual uint64_t NewId() OVERRIDE {
+    std::lock_guard<MutexType> l(id_mutex_);
+    return ++(last_id_);
   }
 
   virtual void SetMetrics(const scoped_refptr<MetricEntity>& entity) OVERRIDE {
@@ -524,51 +485,56 @@
       cache->SetMetrics(metrics_.get());
     }
   }
+
   virtual PendingHandle* Allocate(Slice key, int val_len, int charge) OVERRIDE {
     int key_len = key.size();
     DCHECK_GE(key_len, 0);
     DCHECK_GE(val_len, 0);
-    LRUHandle* handle = nullptr;
+    LRUHandle* handle = static_cast<LRUHandle*>(malloc(sizeof(LRUHandle)));
+    handle->key_length = key_len;
+    handle->key_data = static_cast<uint8_t*>(malloc(key_len));
+    handle->val_length = val_len;
+    // The value lives in the pmem pool. memkind_malloc() returns null when
+    // the pool is exhausted; callers already handle a null PendingHandle, so
+    // fail gracefully instead of crashing on the memcpy below.
+    handle->val_data = static_cast<uint8_t*>(memkind_malloc(vmp_, val_len));
+    if (handle->val_data == nullptr) {
+      free(handle->key_data);
+      free(handle);
+      return nullptr;
+    }
+    handle->charge = charge;
+    handle->hash = HashSlice(key);
+    memcpy(handle->key_data, key.data(), key_len);
 
-    // Try allocating from each of the shards -- if vmem is tight,
-    // this can cause eviction, so we might have better luck in different
-    // shards.
-    for (NvmLRUCache* cache : shards_) {
-      uint8_t* buf = static_cast<uint8_t*>(cache->AllocateAndRetry(
-          sizeof(LRUHandle) + key_len + val_len));
-      if (buf) {
-        handle = reinterpret_cast<LRUHandle*>(buf);
-        handle->kv_data = &buf[sizeof(LRUHandle)];
-        handle->val_length = val_len;
-        handle->key_length = key_len;
-        handle->charge = charge + key.size();
-        handle->hash = HashSlice(key);
-        memcpy(handle->kv_data, key.data(), key.size());
-        return reinterpret_cast<PendingHandle*>(handle);
-      }
-    }
-    // TODO: increment a metric here on allocation failure.
-    return nullptr;
+    return reinterpret_cast<PendingHandle*>(handle);
   }
 
-  virtual void Free(PendingHandle* ph) OVERRIDE {
-    vmem_free(vmp_, ph);
+  virtual void Free(PendingHandle* h) OVERRIDE {
+    LRUHandle* e = reinterpret_cast<LRUHandle*>(h);
+    memkind_free(vmp_, e->val_data);
+    free(e->key_data);
+    free(e);
   }
+
+  virtual uint8_t* MutableValue(PendingHandle* h) OVERRIDE {
+    return reinterpret_cast<LRUHandle*>(h)->mutable_val_ptr();
+  }
+
 };
 
-} // end anonymous namespace
+}  // end anonymous namespace
 
 Cache* NewLRUNvmCache(size_t capacity, const std::string& id) {
-  // vmem_create() will fail if the capacity is too small, but with
-  // an inscrutable error. So, we'll check ourselves.
-  CHECK_GE(capacity, VMEM_MIN_POOL)
+  CHECK_GE(capacity, MEMKIND_PMEM_MIN_SIZE)
     << "configured capacity " << capacity << " bytes is less than "
-    << "the minimum capacity for an NVM cache: " << VMEM_MIN_POOL;
+    << "the minimum capacity for an NVM cache: " << MEMKIND_PMEM_MIN_SIZE;
 
-  VMEM* vmp = vmem_create(FLAGS_nvm_cache_path.c_str(), capacity);
+  memkind* vmp;
+  int err = memkind_create_pmem(FLAGS_nvm_cache_path.c_str(), capacity, &vmp);
   // If we cannot create the cache pool we should not retry.
-  PLOG_IF(FATAL, vmp == NULL) << "Could not initialize NVM cache library in path "
-                              << FLAGS_nvm_cache_path.c_str();
+  LOG_IF(FATAL, err) << "Could not initialize NVM cache library in path "
+                     << FLAGS_nvm_cache_path << ": memkind error " << err;
 
   return new ShardedLRUCache(capacity, id, vmp);
 }
diff --git a/thirdparty/build-definitions.sh b/thirdparty/build-definitions.sh
index e38d54a..88138c5 100644
--- a/thirdparty/build-definitions.sh
+++ b/thirdparty/build-definitions.sh
@@ -612,6 +612,24 @@ build_trace_viewer() {
   cp -a $TRACE_VIEWER_SOURCE/* $TP_DIR/../www/
 }
 
+build_memkind() {
+  MEMKIND_BDIR=$TP_BUILD_DIR/$MEMKIND_NAME$MODE_SUFFIX
+  mkdir -p $MEMKIND_BDIR $PREFIX/include/jemalloc $PREFIX/include/memkind/internal
+  pushd $MEMKIND_BDIR
+
+  # It doesn't appear possible to isolate source and build directories, so just
+  # prepopulate the latter using the former and build in place.
+  rsync -av --delete $MEMKIND_SOURCE/ .
+  ./build.sh "--prefix=$PREFIX"
+
+  # Manually install the required built artifacts out of the build directory.
+  cp -a $MEMKIND_BDIR/jemalloc/obj/include/jemalloc/jemalloc.h $PREFIX/include/jemalloc
+  cp -a $MEMKIND_BDIR/include/memkind/internal/* $PREFIX/include/memkind/internal
+  cp -a $MEMKIND_BDIR/include/* $PREFIX/include
+  cp -a $MEMKIND_BDIR/.libs/* $PREFIX/lib
+  popd
+}
+
 build_nvml() {
   NVML_BDIR=$TP_BUILD_DIR/$NVML_NAME$MODE_SUFFIX
   mkdir -p $NVML_BDIR
diff --git a/thirdparty/build-thirdparty.sh b/thirdparty/build-thirdparty.sh
index e239984..9bb6471 100755
--- a/thirdparty/build-thirdparty.sh
+++ b/thirdparty/build-thirdparty.sh
@@ -91,6 +91,7 @@ else
       "llvm")         F_LLVM=1 ;;
       "trace-viewer") F_TRACE_VIEWER=1 ;;
       "nvml")         F_NVML=1 ;;
+      "memkind")      F_MEMKIND=1 ;;
       "boost")        F_BOOST=1 ;;
       "breakpad")     F_BREAKPAD=1 ;;
       "sparsehash")   F_SPARSEHASH=1 ;;
@@ -300,6 +301,10 @@ if [ -n "$OS_LINUX" ] && [ -n "$F_UNINSTRUMENTED" -o -n "$F_NVML" ]; then
   build_nvml
 fi
 
+if [ -n "$OS_LINUX" ] && [ -n "$F_UNINSTRUMENTED" -o -n "$F_MEMKIND" ]; then
+  build_memkind
+fi
+
 restore_env
 
 ### Build C++ dependencies without instrumentation
@@ -451,6 +456,10 @@ if [ -n "$OS_LINUX" ] && [ -n "$F_TSAN" -o -n "$F_NVML" ]; then
   build_nvml
 fi
 
+if [ -n "$OS_LINUX" ] && [ -n "$F_TSAN" -o -n "$F_MEMKIND" ]; then
+  build_memkind
+fi
+
 restore_env
 
 ### Build C++ dependencies with TSAN instrumentation
diff --git a/thirdparty/download-thirdparty.sh b/thirdparty/download-thirdparty.sh
index 11d2025..d0cc910 100755
--- a/thirdparty/download-thirdparty.sh
+++ b/thirdparty/download-thirdparty.sh
@@ -293,6 +293,10 @@ if [ -n "$OS_LINUX" -a ! -d $NVML_SOURCE ]; then
   fetch_and_expand nvml-${NVML_VERSION}.tar.gz
 fi
 
+if [ -n "$OS_LINUX" -a ! -d $MEMKIND_SOURCE ]; then
+  $TP_DIR/install-memkind-from-git.sh
+fi
+
 BOOST_PATCHLEVEL=1
 delete_if_wrong_patchlevel $BOOST_SOURCE $BOOST_PATCHLEVEL
 if [ ! -d $BOOST_SOURCE ]; then
diff --git a/thirdparty/install-memkind-from-git.sh b/thirdparty/install-memkind-from-git.sh
new file mode 100755
index 0000000..c1d57fa
--- /dev/null
+++ b/thirdparty/install-memkind-from-git.sh
@@ -0,0 +1,33 @@
+#!/bin/bash
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+set -e
+
+TP_DIR=$(cd "$(dirname "$BASH_SOURCE")"; pwd)
+source $TP_DIR/vars.sh
+
+cd $TP_SOURCE_DIR
+
+if [ ! -d "$MEMKIND_NAME" ]; then
+  curl -L https://codeload.github.com/memkind/memkind/tar.gz/v$MEMKIND_VERSION -o $MEMKIND_NAME.tar.gz
+  tar -zxf $MEMKIND_NAME.tar.gz
+  rm $MEMKIND_NAME.tar.gz
+else
+  echo "$MEMKIND_NAME already exists. Not re-downloading."
+fi
diff --git a/thirdparty/vars.sh b/thirdparty/vars.sh
index e5995f6..ca6561e 100644
--- a/thirdparty/vars.sh
+++ b/thirdparty/vars.sh
@@ -166,6 +166,10 @@ NVML_VERSION=1.1
 NVML_NAME=nvml-$NVML_VERSION
 NVML_SOURCE=$TP_SOURCE_DIR/$NVML_NAME
 
+MEMKIND_VERSION=1.7.0
+MEMKIND_NAME=memkind-$MEMKIND_VERSION
+MEMKIND_SOURCE=$TP_SOURCE_DIR/$MEMKIND_NAME
+
 BOOST_VERSION=1_61_0
 BOOST_NAME=boost_$BOOST_VERSION
 BOOST_SOURCE=$TP_SOURCE_DIR/$BOOST_NAME
