/*****************************************************************************

Copyright (c) 2015, 2018, Oracle and/or its affiliates. All Rights Reserved.

This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License, version 2.0, as published by the
Free Software Foundation.

This program is also distributed with certain software (including but not
limited to OpenSSL) that is licensed under separate terms, as designated in a
particular file or component or in included license documentation. The authors
of MySQL hereby grant you an additional permission to link the program and
your derivative works with the separately licensed software that they have
included with MySQL.

This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE. See the GNU General Public License, version 2.0,
for more details.

You should have received a copy of the GNU General Public License along with
this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA

*****************************************************************************/

/** @file include/ut0lock_free_hash.h
Lock free hash implementation

Created Mar 16, 2015 Vasil Dimov
*******************************************************/

#ifndef ut0lock_free_hash_h
#define ut0lock_free_hash_h

#define BOOST_ATOMIC_NO_LIB

#include "univ.i"

#include <atomic>
#include <list>

#include "os0numa.h"  /* os_numa_*() */
#include "ut0mutex.h" /* ib_mutex_t */
#include "ut0new.h"   /* UT_NEW*(), UT_DELETE*() */
#include "ut0rnd.h"   /* ut_fold_ull() */

/** An interface class for a basic hash table, of which ut_lock_free_hash_t
is one implementation. */
class ut_hash_interface_t {
 public:
  /** The value that is returned when the searched for key is not
  found. */
  static const int64_t NOT_FOUND = INT64_MAX;

  /** Destructor. */
  virtual ~ut_hash_interface_t() {}

  /** Get the value mapped to a given key.
  @param[in]  key  key to look for
  @return the value that corresponds to key or NOT_FOUND. */
  virtual int64_t get(uint64_t key) const = 0;

  /** Set the value for a given key, either inserting a new (key, val)
  tuple or overwriting an existent value.
  @param[in]  key  key whose value to set
  @param[in]  val  value to be set */
  virtual void set(uint64_t key, int64_t val) = 0;

  /** Delete a (key, val) pair from the hash.
  @param[in]  key  key whose pair to delete */
  virtual void del(uint64_t key) = 0;

  /** Increment the value for a given key by 1 or insert a new tuple
  (key, 1).
  @param[in]  key  key whose value to increment or insert as 1 */
  virtual void inc(uint64_t key) = 0;

  /** Decrement the value of a given key by 1 or insert a new tuple
  (key, -1).
  @param[in]  key  key whose value to decrement */
  virtual void dec(uint64_t key) = 0;

#ifdef UT_HASH_IMPLEMENT_PRINT_STATS
  /** Print statistics about how many searches have been done on the hash
  and how many collisions. */
  virtual void print_stats() = 0;
#endif /* UT_HASH_IMPLEMENT_PRINT_STATS */
};
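
/* Example (added for illustration, not part of the original interface): a
minimal sketch of how a caller might use an implementation of this interface.
The function name is hypothetical and used only for the example.

  void example_usage(ut_hash_interface_t *h) {
    h->set(10, 100);  // (10, 100)
    h->inc(10);       // (10, 101)
    h->dec(10);       // (10, 100)

    if (h->get(10) != ut_hash_interface_t::NOT_FOUND) {
      h->del(10);     // a subsequent get(10) returns NOT_FOUND
    }
  }
*/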

/** Lock free counter. A counter class that uses a few counter variables
internally to improve performance on machines with lots of CPUs. The get()
method sums all the internal counters without taking any locks, so due to
concurrent modification of the counter, get() may return a number which
never was the sum of all the internal counters. */
class ut_lock_free_cnt_t {
 public:
  /** Constructor. */
  ut_lock_free_cnt_t() {
    /* It is possible that the machine has NUMA available for use, but the
    process/thread might not have sufficient permissions to use it. Hence,
    we would need to check whether the current thread has such permissions.
    Currently the use of NUMA is disabled here. */
    m_numa_available = false;

    if (m_numa_available) {
      m_cnt_size = os_numa_num_configured_cpus();
    } else {
      /* Just pick some number that is supposedly larger
      than the number of CPUs on the system or close to it.
      That many pointers and 64 bit integers will be
      allocated once in the hash table lifetime,
      ie 256 * (8 + 8) bytes = 4 KiB. */
      m_cnt_size = 256;
    }

    m_cnt = UT_NEW_ARRAY(std::atomic<int64_t> *, m_cnt_size,
                         mem_key_ut_lock_free_hash_t);

    for (size_t i = 0; i < m_cnt_size; i++) {
      const size_t s = sizeof(std::atomic<int64_t>);
      void *mem;

      if (m_numa_available) {
        const int node = os_numa_node_of_cpu(static_cast<int>(i));

        mem = os_numa_alloc_onnode(s, node);
      } else {
        mem = ut_malloc(s, mem_key_ut_lock_free_hash_t);
      }

      ut_a(mem != NULL);

      m_cnt[i] = new (mem) std::atomic<int64_t>;

      m_cnt[i]->store(0, std::memory_order_relaxed);
    }
  }

  /** Destructor. */
  ~ut_lock_free_cnt_t() {
    using namespace std;

    for (size_t i = 0; i < m_cnt_size; i++) {
      m_cnt[i]->~atomic<int64_t>();

      if (m_numa_available) {
        os_numa_free(m_cnt[i], sizeof(std::atomic<int64_t>));
      } else {
        ut_free(m_cnt[i]);
      }
    }

    UT_DELETE_ARRAY(m_cnt);
  }

  /** Increment the counter. */
  void inc() {
    const size_t i = n_cnt_index();

    m_cnt[i]->fetch_add(1, std::memory_order_relaxed);
  }

  /** Decrement the counter. */
  void dec() {
    const size_t i = n_cnt_index();

    m_cnt[i]->fetch_sub(1, std::memory_order_relaxed);
  }

  /** Get the value of the counter.
  @return counter's value */
  int64_t get() const {
    int64_t ret = 0;

    for (size_t i = 0; i < m_cnt_size; i++) {
      ret += m_cnt[i]->load(std::memory_order_relaxed);
    }

    return (ret);
  }

 private:
  /** Derive an appropriate index in m_cnt[] for the current thread.
  @return index in m_cnt[] for this thread to use */
  size_t n_cnt_index() const {
    size_t cpu;

#ifdef HAVE_OS_GETCPU
    cpu = static_cast<size_t>(os_getcpu());

    if (cpu >= m_cnt_size) {
      /* Could happen (rarely) if more CPUs get
      enabled after m_cnt_size is initialized. */
      cpu %= m_cnt_size;
    }
#else  /* HAVE_OS_GETCPU */
    cpu = static_cast<size_t>(ut_rnd_gen_ulint() % m_cnt_size);
#endif /* HAVE_OS_GETCPU */

    return (cpu);
  }

  /** Designate whether os_numa_*() functions can be used. */
  bool m_numa_available;

  /** The sum of all the counters in m_cnt[] designates the overall
  count. */
  std::atomic<int64_t> **m_cnt;
  size_t m_cnt_size;
};
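
/* Example (added for illustration, not part of the original header): the
counter spreads updates over several std::atomic slots and sums them in
get(), so a concurrent get() may observe a transient, approximate total.
The function below is a hypothetical sketch, not used anywhere in InnoDB.

  void example_cnt() {
    ut_lock_free_cnt_t cnt;

    cnt.inc();  // possibly applied to a different internal slot
    cnt.inc();  // than this one, depending on the current CPU
    cnt.dec();

    // With no concurrent modifications the sum is exact: 1.
    ut_a(cnt.get() == 1);
  }
*/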

/** A node in a linked list of arrays. The pointer to the next node is
atomically set (CAS) when the next node is allocated. */
template <typename T>
class ut_lock_free_list_node_t {
 public:
  typedef ut_lock_free_list_node_t<T> *next_t;

  /** Constructor.
  @param[in]  n_elements  number of elements to create */
  explicit ut_lock_free_list_node_t(size_t n_elements)
      : m_n_base_elements(n_elements), m_pending_free(false), m_next(NULL) {
    m_base = UT_NEW_ARRAY(T, m_n_base_elements, mem_key_ut_lock_free_hash_t);

    ut_ad(n_elements > 0);
  }

  /** Destructor. */
  ~ut_lock_free_list_node_t() { UT_DELETE_ARRAY(m_base); }

  /** Create and append a new array to this one and store a pointer
  to it in 'm_next'. This is done in a way that multiple threads can
  attempt this at the same time and only one will succeed. When this
  method returns, the caller can be sure that the job is done (either
  by this or another thread).
  @param[in]  deleted_val  the constant that designates that
  a value is deleted
  @param[out]  grown_by_this_thread  set to true if the next
  array was created and appended by this thread; set to false if
  created and appended by another thread.
  @return the next array, appended by this or another thread */
  next_t grow(int64_t deleted_val, bool *grown_by_this_thread) {
    size_t new_size;

    if (m_n_base_elements > 1024 &&
        n_deleted(deleted_val) > m_n_base_elements * 3 / 4) {
      /* If there are too many deleted elements (more than
      75%), then do not double the size. */
      new_size = m_n_base_elements;
    } else {
      new_size = m_n_base_elements * 2;
    }

    next_t new_arr = UT_NEW(ut_lock_free_list_node_t<T>(new_size),
                            mem_key_ut_lock_free_hash_t);

    /* Publish the allocated entry. If somebody did this in the
    meantime then just discard the allocated entry and do
    nothing. */
    next_t expected = NULL;
    if (!m_next.compare_exchange_strong(expected, new_arr,
                                        std::memory_order_relaxed)) {
      /* Somebody just did that. */
      UT_DELETE(new_arr);

      /* 'expected' has the current value which
      must be != NULL because the CAS failed. */
      ut_ad(expected != NULL);

      *grown_by_this_thread = false;

      return (expected);
    } else {
      *grown_by_this_thread = true;
      return (new_arr);
    }
  }

  /* This object is only ever destroyed after it is removed from the
  list of arrays in the hash table (which means that new threads cannot
  start using it) and the number of threads that use it has decreased
  to zero. */

  /** Mark the beginning of an access to this object. Used to prevent a
  destruction of this object while some threads may be accessing it.
  @retval true access is granted, the caller should invoke
  end_access() when done
  @retval false access is denied, this object is to be removed from
  the list and thus new access to it is not allowed. The caller should
  retry from the head of the list and need not call end_access(). */
  bool begin_access() {
    m_n_ref.inc();

    std::atomic_thread_fence(std::memory_order_acq_rel);

    if (m_pending_free.load(std::memory_order_acquire)) {
      /* Don't allow access if freeing is pending. Ie if
      another thread is waiting for readers to go away
      before it can free the m_base member of this
      object. */
      m_n_ref.dec();
      return (false);
    }

    return (true);
  }

  /** Mark the ending of an access to this object. */
  void end_access() {
    std::atomic_thread_fence(std::memory_order_release);

    m_n_ref.dec();
  }

  /** Get the number of threads that are accessing this object now.
  @return number of users (threads) of this object */
  int64_t n_ref() {
    int64_t ret = m_n_ref.get();

    std::atomic_thread_fence(std::memory_order_acq_rel);

    return (ret);
  }

  /** Base array. */
  T *m_base;

  /** Number of elements in 'm_base'. */
  size_t m_n_base_elements;

  /** Indicate whether some thread is waiting for readers to go away
  before it can free the memory occupied by the m_base member. */
  std::atomic_bool m_pending_free;

  /** Pointer to the next node if any or NULL. This begins its life
  as NULL and is only changed once to some real value. Never changed
  to another value after that. */
  std::atomic<next_t> m_next;

 private:
  /** Count the number of deleted elements. The value returned could
  be inaccurate because it is obtained without any locks.
  @param[in]  deleted_val  the constant that designates that
  a value is deleted
  @return the number of deleted elements */
  size_t n_deleted(int64_t deleted_val) const {
    size_t ret = 0;

    for (size_t i = 0; i < m_n_base_elements; i++) {
      const int64_t val = m_base[i].m_val.load(std::memory_order_relaxed);

      if (val == deleted_val) {
        ret++;
      }
    }

    return (ret);
  }

  /** Counter for the current number of readers and writers to this
  object. This object is destroyed only after it is removed from the
  list, so that no new readers or writers may arrive, and after this
  counter has dropped to zero. */
  ut_lock_free_cnt_t m_n_ref;
};
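
/* Example (added for illustration, not part of the original header): the
begin_access()/end_access() pair is how readers pin a node so that its m_base
array is not freed from under them. A hypothetical reader of a node that
stores plain atomic integers would look roughly like this:

  int64_t read_first(ut_lock_free_list_node_t<std::atomic<int64_t>> *node) {
    while (!node->begin_access()) {
      // Denied: the node is about to be freed. A real caller would
      // re-read the list head and retry on the new head node instead.
    }

    const int64_t v = node->m_base[0].load(std::memory_order_relaxed);

    node->end_access();

    return (v);
  }
*/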

/** Lock free hash table. It stores (key, value) pairs where both the key
and the value are of integer type.
* The possible keys are:
  * UNUSED: a (key = UNUSED, val) tuple means that it is empty/unused. This is
    the initial state of all keys.
  * AVOID: a (key = AVOID, val = any) tuple means that this tuple is disabled,
    does not contain real data and should be avoided by searches and
    inserts. This is used when migrating all elements of an array to the
    next array.
  * real key: anything other than the above means this is a real key,
    specified by the user.

* The possible values are:
  * NOT_FOUND: a (key, val = NOT_FOUND) tuple means that it is just being
    inserted and returning "not found" is ok. This is the initial state of all
    values.
  * DELETED: a (key, val = DELETED) tuple means that a tuple with this key
    existed before but was deleted at some point. Searches for this key return
    "not found" and inserts reuse the tuple, replacing the DELETED value with
    something else.
  * GOTO_NEXT_ARRAY: a (key, val = GOTO_NEXT_ARRAY) tuple means that the
    searches for this tuple (get and insert) should go to the next array and
    repeat the search there. Used when migrating all tuples from one array to
    the next.
  * real value: anything other than the above means this is a real value,
    specified by the user.

* Transitions for keys (a real key is anything other than UNUSED and AVOID):
  * UNUSED -> real key -- allowed
  * UNUSED -> AVOID -- allowed
  anything else is not allowed:
  * real key -> UNUSED -- not allowed
  * real key -> AVOID -- not allowed
  * real key -> another real key -- not allowed
  * AVOID -> UNUSED -- not allowed
  * AVOID -> real key -- not allowed

* Transitions for values (a real value is anything other than NOT_FOUND,
  DELETED and GOTO_NEXT_ARRAY):
  * NOT_FOUND -> real value -- allowed
  * NOT_FOUND -> DELETED -- allowed
  * real value -> another real value -- allowed
  * real value -> DELETED -- allowed
  * real value -> GOTO_NEXT_ARRAY -- allowed
  * DELETED -> real value -- allowed
  * DELETED -> GOTO_NEXT_ARRAY -- allowed
  anything else is not allowed:
  * NOT_FOUND -> GOTO_NEXT_ARRAY -- not allowed
  * real value -> NOT_FOUND -- not allowed
  * DELETED -> NOT_FOUND -- not allowed
  * GOTO_NEXT_ARRAY -> real value -- not allowed
  * GOTO_NEXT_ARRAY -> NOT_FOUND -- not allowed
  * GOTO_NEXT_ARRAY -> DELETED -- not allowed
*/
class ut_lock_free_hash_t : public ut_hash_interface_t {
 public:
  /** Constructor. Not thread safe.
  @param[in]  initial_size  number of elements to allocate
  initially. Must be a power of 2, greater than 0.
  @param[in]  del_when_zero  if true then automatically delete a
  tuple from the hash if due to increment or decrement its value becomes
  zero. */
  explicit ut_lock_free_hash_t(size_t initial_size, bool del_when_zero)
      : m_del_when_zero(del_when_zero)
#ifdef UT_HASH_IMPLEMENT_PRINT_STATS
        ,
        m_n_search(0),
        m_n_search_iterations(0)
#endif /* UT_HASH_IMPLEMENT_PRINT_STATS */
  {
    ut_a(initial_size > 0);
    ut_a(ut_is_2pow(initial_size));

    m_data.store(UT_NEW(arr_node_t(initial_size), mem_key_ut_lock_free_hash_t),
                 std::memory_order_relaxed);

    mutex_create(LATCH_ID_LOCK_FREE_HASH, &m_optimize_latch);

    m_hollow_objects =
        UT_NEW(hollow_t(hollow_alloc_t(mem_key_ut_lock_free_hash_t)),
               mem_key_ut_lock_free_hash_t);
  }

  /** Destructor. Not thread safe. */
  ~ut_lock_free_hash_t() {
    mutex_destroy(&m_optimize_latch);

    arr_node_t *arr = m_data.load(std::memory_order_relaxed);

    do {
      arr_node_t *next = arr->m_next.load(std::memory_order_relaxed);

      UT_DELETE(arr);

      arr = next;
    } while (arr != NULL);

    while (!m_hollow_objects->empty()) {
      UT_DELETE(m_hollow_objects->front());
      m_hollow_objects->pop_front();
    }
    UT_DELETE(m_hollow_objects);
  }

  /** Get the value mapped to a given key.
  @param[in]  key  key to look for
  @return the value that corresponds to key or NOT_FOUND. */
  int64_t get(uint64_t key) const {
    ut_ad(key != UNUSED);
    ut_ad(key != AVOID);

    arr_node_t *arr = m_data.load(std::memory_order_relaxed);

    for (;;) {
      const key_val_t *tuple = get_tuple(key, &arr);

      if (tuple == NULL) {
        return (NOT_FOUND);
      }

      /* Here if another thread is just setting this key
      for the first time, then the tuple could be
      (key, NOT_FOUND) (remember all vals are initialized
      to NOT_FOUND initially) in which case we will return
      NOT_FOUND below which is fine. */

      int64_t v = tuple->m_val.load(std::memory_order_relaxed);

      if (v == DELETED) {
        arr->end_access();
        return (NOT_FOUND);
      } else if (v != GOTO_NEXT_ARRAY) {
        arr->end_access();
        return (v);
      }

      /* Prevent reorder of the below m_next.load() with
      the above m_val.load().
      We want to be sure that if m_val is GOTO_NEXT_ARRAY,
      then the next array exists. It would be the same to
      m_val.load(memory_order_acquire)
      but that would impose the more expensive
      memory_order_acquire in all cases, whereas in the most
      common execution path m_val is not GOTO_NEXT_ARRAY and
      we return earlier, only using the cheaper
      memory_order_relaxed. */
      std::atomic_thread_fence(std::memory_order_acquire);

      arr_node_t *next = arr->m_next.load(std::memory_order_relaxed);

      ut_a(next != NULL);

      arr->end_access();

      arr = next;
    }
  }

  /** Set the value for a given key, either inserting a new (key, val)
  tuple or overwriting an existent value. If two threads call this
  method at the same time with the same key, but different val, then
  when both methods have finished executing the value will be one of
  the two, but it is nondeterministic which one. E.g.
  Thread 1: set(key, val_a)
  Thread 2: set(key, val_b)
  when both have finished, then a tuple with the given key will be
  present with value either val_a or val_b.
  @param[in]  key  key whose value to set
  @param[in]  val  value to be set */
  void set(uint64_t key, int64_t val) {
    ut_ad(key != UNUSED);
    ut_ad(key != AVOID);
    ut_ad(val != NOT_FOUND);
    ut_ad(val != DELETED);
    ut_ad(val != GOTO_NEXT_ARRAY);

    insert_or_update(key, val, false, m_data.load(std::memory_order_relaxed));
  }

  /** Delete a (key, val) pair from the hash.
  If this gets called concurrently with get(), inc(), dec() or set(),
  then to the caller it will look like the calls executed in isolation,
  the hash structure itself will not be damaged, but it is undefined in
  what order the calls will be executed. For example:
  Let this tuple exist in the hash: (key == 5, val == 10)
  Thread 1: inc(key == 5)
  Thread 2: del(key == 5)
  [1] If inc() executes first then the tuple will become
  (key == 5, val == 11) and then del() will make it
  (key == 5, val == DELETED), which get()s for key == 5 will return as
  NOT_FOUND.
  [2] If del() executes first then the tuple will become
  (key == 5, val == DELETED) and then inc() will change it to
  (key == 5, value == 1).
  It is undefined which one of [1] or [2] will happen. It is up to the
  caller to accept this behavior or prevent it at a higher level.
  @param[in]  key  key whose pair to delete */
  void del(uint64_t key) {
    ut_ad(key != UNUSED);
    ut_ad(key != AVOID);

    arr_node_t *arr = m_data.load(std::memory_order_relaxed);

    for (;;) {
      key_val_t *tuple = get_tuple(key, &arr);

      if (tuple == NULL) {
        /* Nothing to delete. */
        return;
      }

      int64_t v = tuple->m_val.load(std::memory_order_relaxed);

      for (;;) {
        if (v == GOTO_NEXT_ARRAY) {
          break;
        }

        if (tuple->m_val.compare_exchange_strong(v, DELETED,
                                                 std::memory_order_relaxed)) {
          arr->end_access();
          return;
        }

        /* CAS stored the most recent value of 'm_val'
        into 'v'. */
      }

      /* Prevent reorder of the below m_next.load() with the
      above m_val.load() or the load from
      m_val.compare_exchange_strong().
      We want to be sure that if m_val is GOTO_NEXT_ARRAY,
      then the next array exists. It would be the same to
      m_val.load(memory_order_acquire) or
      m_val.compare_exchange_strong(memory_order_acquire)
      but that would impose the more expensive
      memory_order_acquire in all cases, whereas in the most
      common execution path m_val is not GOTO_NEXT_ARRAY and
      we return earlier, only using the cheaper
      memory_order_relaxed. */
      std::atomic_thread_fence(std::memory_order_acquire);

      arr_node_t *next = arr->m_next.load(std::memory_order_relaxed);

      ut_a(next != NULL);

      arr->end_access();

      arr = next;
    }
  }

  /** Increment the value for a given key by 1 or insert a new tuple
  (key, 1).
  If two threads call this method at the same time with the same key,
  then it is guaranteed that when both calls have finished, the value
  will be incremented by 2.
  If two threads call this method and set() at the same time with the
  same key it is nondeterministic whether the value will be what was
  given to set() or what was given to set() + 1. E.g.
  Thread 1: set(key, val)
  Thread 2: inc(key)
  or
  Thread 1: inc(key)
  Thread 2: set(key, val)
  when both have finished the value will be either val or val + 1.
  @param[in]  key  key whose value to increment or insert as 1 */
  void inc(uint64_t key) {
    ut_ad(key != UNUSED);
    ut_ad(key != AVOID);

    insert_or_update(key, 1, true, m_data.load(std::memory_order_relaxed));
  }

  /** Decrement the value of a given key by 1 or insert a new tuple
  (key, -1).
  With respect to calling this together with set(), inc() or dec() the
  same applies as with inc(), see its comment. The only guarantee is
  that the calls will execute in isolation, but the order in which they
  will execute is nondeterministic.
  @param[in]  key  key whose value to decrement */
  void dec(uint64_t key) {
    ut_ad(key != UNUSED);
    ut_ad(key != AVOID);

    insert_or_update(key, -1, true, m_data.load(std::memory_order_relaxed));
  }

#ifdef UT_HASH_IMPLEMENT_PRINT_STATS
  /** Print statistics about how many searches have been done on the hash
  and how many collisions. */
  void print_stats() {
    const ulint n_search = m_n_search;
    const ulint n_search_iterations = m_n_search_iterations;

    ib::info info(ER_IB_MSG_LOCK_FREE_HASH_USAGE_STATS);
    info << "Lock free hash usage stats: number of searches=" << n_search
         << ", number of search iterations=" << n_search_iterations;
    if (n_search != 0) {
      info << "average iterations per search: "
           << (double)n_search_iterations / n_search;
    }
  }
#endif /* UT_HASH_IMPLEMENT_PRINT_STATS */

 private:
  /** A key == UNUSED designates that this cell in the array is empty. */
  static const uint64_t UNUSED = UINT64_MAX;

  /** A key == AVOID designates an unusable cell. This cell of the array
  has been empty (key == UNUSED), but was then marked as AVOID in order
  to prevent new inserts into it. Searches should treat this like
  UNUSED (ie if they encounter it before the key they are searching for
  then stop the search and declare 'not found'). */
  static const uint64_t AVOID = UNUSED - 1;

  /** A val == DELETED designates that this cell in the array has been
  used in the past, but it was deleted later. Searches should return
  NOT_FOUND when they encounter it. */
  static const int64_t DELETED = NOT_FOUND - 1;

  /** A val == GOTO_NEXT_ARRAY designates that this tuple (key, whatever)
  has been moved to the next array. The search for it should continue
  there. */
  static const int64_t GOTO_NEXT_ARRAY = DELETED - 1;

  /** (key, val) tuple type. */
  struct key_val_t {
    key_val_t() : m_key(UNUSED), m_val(NOT_FOUND) {}

    /** Key. */
    std::atomic<uint64_t> m_key;

    /** Value. */
    std::atomic<int64_t> m_val;
  };

  /** An array node in the hash. The hash table consists of a linked
  list of such nodes. */
  typedef ut_lock_free_list_node_t<key_val_t> arr_node_t;

  /** A hash function used to map a key to its suggested position in the
  array. A linear search to the right is done after this position to find
  the tuple with the given key or find a tuple with key == UNUSED or
  AVOID which means that the key is not present in the array.
  @param[in]  key  key to map into a position
  @param[in]  arr_size  number of elements in the array
  @return a position (index) in the array where the tuple is guessed
  to be */
  size_t guess_position(uint64_t key, size_t arr_size) const {
    /* Implement a better hashing function to map
    [0, UINT64_MAX] -> [0, arr_size - 1] if this one turns
    out to generate too many collisions. */

    /* arr_size is a power of 2. */
    return (static_cast<size_t>(ut_fold_ull(key) & (arr_size - 1)));
  }
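
  /* Worked example (added for illustration, not part of the original
  header): with arr_size == 8 the mask is arr_size - 1 == 0x7, so
  guess_position() keeps only the low 3 bits of the folded key:

    ut_fold_ull(key) == 0x...25  ->  0x25 & 0x7 == 5

  The linear probes in the callers then visit positions 5, 6, 7 and wrap
  around to 0, 1, ... via `i & (arr_size - 1)`, touching each cell at most
  once because the probe loop runs for arr_size iterations. */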

  /** Get the array cell of a key from a given array.
  @param[in]  arr  array to search into
  @param[in]  arr_size  number of elements in the array
  @param[in]  key  search for a tuple with this key
  @return pointer to the array cell or NULL if not found */
  key_val_t *get_tuple_from_array(key_val_t *arr, size_t arr_size,
                                  uint64_t key) const {
#ifdef UT_HASH_IMPLEMENT_PRINT_STATS
    ++m_n_search;
#endif /* UT_HASH_IMPLEMENT_PRINT_STATS */

    const size_t start = guess_position(key, arr_size);
    const size_t end = start + arr_size;

    for (size_t i = start; i < end; i++) {
#ifdef UT_HASH_IMPLEMENT_PRINT_STATS
      ++m_n_search_iterations;
#endif /* UT_HASH_IMPLEMENT_PRINT_STATS */

      /* arr_size is a power of 2. */
      const size_t cur_pos = i & (arr_size - 1);

      key_val_t *cur_tuple = &arr[cur_pos];

      const uint64_t cur_key = cur_tuple->m_key.load(std::memory_order_relaxed);

      if (cur_key == key) {
        return (cur_tuple);
      } else if (cur_key == UNUSED || cur_key == AVOID) {
        return (NULL);
      }
    }

    return (NULL);
  }

  /** Get the array cell of a key.
  @param[in]  key  key to search for
  @param[in,out]  arr  start the search from this array; when this
  method ends, *arr will point to the array in which the search
  ended (in which the returned key_val resides)
  @return pointer to the array cell or NULL if not found */
  key_val_t *get_tuple(uint64_t key, arr_node_t **arr) const {
    for (;;) {
      while (!(*arr)->begin_access()) {
        /* The array has been garbaged, restart
        the search from the beginning. */
        *arr = m_data.load(std::memory_order_relaxed);
      }

      key_val_t *t =
          get_tuple_from_array((*arr)->m_base, (*arr)->m_n_base_elements, key);

      if (t != NULL) {
        /* end_access() will be invoked by the
        caller. */
        return (t);
      }

      arr_node_t *next = (*arr)->m_next.load(std::memory_order_relaxed);

      (*arr)->end_access();

      if (next == NULL) {
        return (NULL);
      }

      *arr = next;
    }
  }

  /** Insert the given key into a given array or return its cell if
  already present.
  @param[in]  arr  array into which to search and insert
  @param[in]  arr_size  number of elements in the array
  @param[in]  key  key to insert or whose cell to retrieve
  @return a pointer to the inserted or previously existent tuple or NULL
  if a tuple with this key is not present in the array and the array is
  full, without any unused cells and thus insertion cannot be done into
  it. */
  key_val_t *insert_or_get_position_in_array(key_val_t *arr, size_t arr_size,
                                             uint64_t key) {
#ifdef UT_HASH_IMPLEMENT_PRINT_STATS
    ++m_n_search;
#endif /* UT_HASH_IMPLEMENT_PRINT_STATS */

    const size_t start = guess_position(key, arr_size);
    const size_t end = start + arr_size;

    for (size_t i = start; i < end; i++) {
#ifdef UT_HASH_IMPLEMENT_PRINT_STATS
      ++m_n_search_iterations;
#endif /* UT_HASH_IMPLEMENT_PRINT_STATS */

      /* arr_size is a power of 2. */
      const size_t cur_pos = i & (arr_size - 1);

      key_val_t *cur_tuple = &arr[cur_pos];

      const uint64_t cur_key = cur_tuple->m_key.load(std::memory_order_relaxed);

      if (cur_key == key) {
        return (cur_tuple);
      }

      if (cur_key == UNUSED) {
        uint64_t expected = UNUSED;
        if (cur_tuple->m_key.compare_exchange_strong(
                expected, key, std::memory_order_relaxed)) {
          return (cur_tuple);
        }

        /* CAS failed, which means that some other
        thread just changed the current key from UNUSED
        to something else (which is stored in
        'expected'). See if the new value is 'key'. */
        if (expected == key) {
          return (cur_tuple);
        }

        /* The current key, which was UNUSED, has been
        replaced with something else (!= key) by a
        concurrently executing insert. Keep searching
        for a free slot. */
      }

      /* Skip through tuples with key == AVOID. */
    }

    return (NULL);
  }

  /** Copy all used elements from one array to another. Flag the ones
  in the old array as 'go to the next array'.
  @param[in,out]  src_arr  array to copy from
  @param[in,out]  dst_arr  array to copy to */
  void copy_to_another_array(arr_node_t *src_arr, arr_node_t *dst_arr) {
    for (size_t i = 0; i < src_arr->m_n_base_elements; i++) {
      key_val_t *t = &src_arr->m_base[i];

      uint64_t k = t->m_key.load(std::memory_order_relaxed);

      /* Prevent further inserts into empty cells. */
      if (k == UNUSED && t->m_key.compare_exchange_strong(
                             k, AVOID, std::memory_order_relaxed)) {
        continue;
      }

      int64_t v = t->m_val.load(std::memory_order_relaxed);

      /* Insert (k, v) into the destination array. We know
      that nobody else will try this concurrently with this
      thread because:
      * this code is being executed by just one thread (the
      thread that managed to grow the list of arrays) and
      * other normal inserts/updates with (key == k) will
      pick the entry in src_arr. */

      /* If we copied the tuple to dst_arr once then we must
      keep trying to transfer its most recent value to
      dst_arr, even if its value becomes DELETED. Otherwise
      a delete operation on this tuple which executes
      concurrently with the loop below may be lost. */
      bool copied = false;

      for (;;) {
        if (v != DELETED || copied) {
          insert_or_update(k, v, false, dst_arr, false);
          copied = true;
        }

        /* Prevent any preceding memory operations (the
        stores from insert_or_update() in particular)
        to be reordered past the store from
        m_val.compare_exchange_strong() below. We want
        to be sure that if m_val is GOTO_NEXT_ARRAY,
        then the entry is indeed present in some of the
        next arrays (ie that insert_or_update() has
        completed and that its effects are visible to
        other threads). */
        std::atomic_thread_fence(std::memory_order_release);

        /* Now that we know (k, v) is present in some
        of the next arrays, try to CAS the tuple
        (k, v) to (k, GOTO_NEXT_ARRAY) in the current
        array. */

        if (t->m_val.compare_exchange_strong(v, GOTO_NEXT_ARRAY,
                                             std::memory_order_relaxed)) {
          break;
        }

        /* If CAS fails, this means that m_val has been
        changed in the meantime and the CAS will store
        m_val's most recent value in 'v'. Retry both
        operations (this time the insert_or_update()
        call will be an update, rather than an
        insert). */
      }
    }
  }

  /** Update the value of a given tuple.
  @param[in,out]  t  tuple whose value to update
  @param[in]  val_to_set  value to set or delta to apply
  @param[in]  is_delta  if true then set the new value to
  old + val, otherwise just set to val
  @retval true update succeeded
  @retval false update failed due to GOTO_NEXT_ARRAY
  @return whether the update succeeded or not */
  bool update_tuple(key_val_t *t, int64_t val_to_set, bool is_delta) {
    int64_t cur_val = t->m_val.load(std::memory_order_relaxed);

    for (;;) {
      if (cur_val == GOTO_NEXT_ARRAY) {
        return (false);
      }

      int64_t new_val;

      if (is_delta && cur_val != NOT_FOUND && cur_val != DELETED) {
        if (m_del_when_zero && cur_val + val_to_set == 0) {
          new_val = DELETED;
        } else {
          new_val = cur_val + val_to_set;
        }
      } else {
        new_val = val_to_set;
      }

      if (t->m_val.compare_exchange_strong(cur_val, new_val,
                                           std::memory_order_relaxed)) {
        return (true);
      }

      /* When CAS fails it sets the most recent value of
      m_val into cur_val. */
    }
  }
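
  /* Worked example (added for illustration, not part of the original
  header): with m_del_when_zero == true, a dec() on a tuple whose value is 1
  goes through update_tuple(t, -1, true). Since cur_val + val_to_set ==
  1 + (-1) == 0, new_val becomes DELETED, so the tuple is logically removed
  and a subsequent get() on that key returns NOT_FOUND. */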

  /** Optimize the hash table. Called after a new array is appended to
  the list of arrays (grow). Having more than one array in the list
  of arrays works, but is suboptimal because searches
  (for get/insert/update) have to search in more than one array. This
  method starts from the head of the list and migrates all tuples from
  this array to the next arrays. Then it removes the array from the
  head of the list, waits for readers to go away and frees its m_base
  member. */
  void optimize() {
    mutex_enter(&m_optimize_latch);

    for (;;) {
      arr_node_t *arr = m_data.load(std::memory_order_relaxed);

      arr_node_t *next = arr->m_next.load(std::memory_order_relaxed);

      if (next == NULL) {
        break;
      }

      /* begin_access() / end_access() for 'arr' and 'next'
      around copy_to_another_array() is not needed here
      because the only code that frees memory is below,
      serialized with a mutex. */

      copy_to_another_array(arr, next);

      arr->m_pending_free.store(true, std::memory_order_release);

      arr_node_t *expected = arr;

      /* Detach 'arr' from the list. Ie move the head of the
      list 'm_data' from 'arr' to 'arr->m_next'. */
      ut_a(m_data.compare_exchange_strong(expected, next,
                                          std::memory_order_relaxed));

      /* Spin/wait for all threads to stop looking at
      this array. If at some point this turns out to be
      sub-optimal (ie too long busy wait), then 'arr' could
      be added to some lazy deletion list
      arrays-awaiting-destruction-once-no-readers. */
      while (arr->n_ref() > 0) {
        ;
      }

      UT_DELETE_ARRAY(arr->m_base);
      /* The destructor of arr will call UT_DELETE_ARRAY()
      on m_base again. Make sure it is a noop and avoid
      double free. */
      arr->m_base = NULL;

      m_hollow_objects->push_back(arr);
    }

    mutex_exit(&m_optimize_latch);
  }

  /** Insert a new tuple or update an existent one. If a tuple with this
  key does not exist then a new one is inserted (key, val) and is_delta
  is ignored. If a tuple with this key exists and is_delta is true, then
  the current value is changed to be current value + val, otherwise it
  is overwritten to be val.
  @param[in]  key  key to insert or whose value
  to update
  @param[in]  val  value to set; if the tuple
  does not exist or if is_delta is false, then the new value is set
  to val, otherwise it is set to old + val
  @param[in]  is_delta  if true then set the new
  value to old + val, otherwise just set to val.
  @param[in]  arr  array to start the search from
  @param[in]  optimize_allowed  if true then call optimize()
  after an eventual grow(), if false, then never call optimize(). Used
  to prevent recursive optimize() call by insert_or_update() ->
  optimize() -> copy_to_another_array() -> insert_or_update() ->
  optimize(). */
  void insert_or_update(uint64_t key, int64_t val, bool is_delta,
                        arr_node_t *arr, bool optimize_allowed = true) {
    bool call_optimize = false;

    /* Loop through the arrays until we find a free slot to insert
    or until we find a tuple with the specified key and manage to
    update it. */
    for (;;) {
      while (!arr->begin_access()) {
        /* The array has been garbaged, try the next
        one. */
        arr = arr->m_next.load(std::memory_order_relaxed);
      }

      key_val_t *t = insert_or_get_position_in_array(
          arr->m_base, arr->m_n_base_elements, key);

      /* t == NULL means that the array is full, must expand
      and go to the next array. */

      /* update_tuple() returning false means that the value
      of the tuple is GOTO_NEXT_ARRAY, so we must go to the
      next array. */

      if (t != NULL && update_tuple(t, val, is_delta)) {
        arr->end_access();
        break;
      }

      arr_node_t *next = arr->m_next.load(std::memory_order_relaxed);

      if (next != NULL) {
        arr->end_access();
        arr = next;
        /* Prevent any subsequent memory operations
        (the reads from the next array in particular)
        to be reordered before the m_next.load()
        above. */
        std::atomic_thread_fence(std::memory_order_acquire);
        continue;
      }

      bool grown_by_this_thread;

      next = arr->grow(DELETED, &grown_by_this_thread);

      if (grown_by_this_thread) {
        call_optimize = true;
      }

      arr->end_access();

      arr = next;
    }

    if (optimize_allowed && call_optimize) {
      optimize();
    }
  }

  /** Storage for the (key, val) tuples. */
  std::atomic<arr_node_t *> m_data;

  typedef ut_allocator<arr_node_t *> hollow_alloc_t;
  typedef std::list<arr_node_t *, hollow_alloc_t> hollow_t;

  /** Container for hollow (semi-destroyed) objects that have been
  removed from the list that starts at m_data. Those objects have their
  m_base member freed and are entirely destroyed at the end of the hash
  table life time. The access to this member is protected by
  m_optimize_latch (adding of new elements) and the removal of elements
  is done in the destructor of the hash table. */
  hollow_t *m_hollow_objects;

  /** Concurrent copy-all-elements-to-the-next-array, removal of the
  head of the list and freeing of its m_base member are serialized with
  this latch. Those operations could be implemented without serialization,
  but this immensely increases the complexity of the code. Growing of the
  hash table is not a particularly hot operation and thus we chose
  simplicity and maintainability instead of top performance in this case.
  "Get" operations and "insert/update" ones that do not cause a grow still
  run concurrently even if this latch is locked. */
  ib_mutex_t m_optimize_latch;

  /** True if a tuple should be automatically deleted from the hash
  if its value becomes 0 after an increment or decrement. */
  bool m_del_when_zero;

#ifdef UT_HASH_IMPLEMENT_PRINT_STATS
  /* The atomic type gives correct results, but has a _huge_
  performance impact. The unprotected operation gives a significant
  skew, but has almost no performance impact. */

  /** Number of searches performed in this hash. */
  mutable std::atomic<uint64_t> m_n_search;

  /** Number of elements processed for all searches. */
  mutable std::atomic<uint64_t> m_n_search_iterations;
#endif /* UT_HASH_IMPLEMENT_PRINT_STATS */
};
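
/* Example (added for illustration, not part of the original header): a
minimal sketch of how this hash table could be used, assuming the InnoDB
memory and latch subsystems are already initialized. The function name is
hypothetical.

  void example_lock_free_hash() {
    // 1024 initial cells, delete tuples whose value reaches 0.
    ut_lock_free_hash_t *h =
        UT_NEW(ut_lock_free_hash_t(1024, true), mem_key_ut_lock_free_hash_t);

    h->set(123, 10);  // (123, 10)
    h->inc(123);      // (123, 11)
    h->dec(123);      // (123, 10)

    if (h->get(123) != ut_hash_interface_t::NOT_FOUND) {
      h->del(123);    // get(123) now returns NOT_FOUND
    }

    UT_DELETE(h);
  }
*/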

#endif /* ut0lock_free_hash_h */