/*****************************************************************************

Copyright (c) 1996, 2019, Oracle and/or its affiliates. All Rights Reserved.

This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License, version 2.0, as published by the
Free Software Foundation.

This program is also distributed with certain software (including but not
limited to OpenSSL) that is licensed under separate terms, as designated in a
particular file or component or in included license documentation. The authors
of MySQL hereby grant you an additional permission to link the program and
your derivative works with the separately licensed software that they have
included with MySQL.

This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE. See the GNU General Public License, version 2.0,
for more details.

You should have received a copy of the GNU General Public License along with
this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA

*****************************************************************************/

/** @file lock/lock0wait.cc
 The transaction lock system

 Created 25/5/2010 Sunny Bains
 *******************************************************/

#define LOCK_MODULE_IMPLEMENTATION

#include <mysql/service_thd_wait.h>
#include <sys/types.h>

#include "ha_prototypes.h"
#include "lock0lock.h"
#include "lock0priv.h"
#include "os0thread-create.h"
#include "que0que.h"
#include "row0mysql.h"
#include "srv0mon.h"
#include "srv0start.h"

#include "my_dbug.h"

/** Print the contents of the lock_sys_t::waiting_threads array. */
static void lock_wait_table_print(void) {
  ut_ad(lock_wait_mutex_own());

  const srv_slot_t *slot = lock_sys->waiting_threads;

  for (ulint i = 0; i < srv_max_n_threads; i++, ++slot) {
    fprintf(stderr,
            "Slot %lu: thread type %lu,"
            " in use %lu, susp %lu, timeout %lu, time %lu\n",
            (ulong)i, (ulong)slot->type, (ulong)slot->in_use,
            (ulong)slot->suspended, slot->wait_timeout,
            (ulong)(ut_time_monotonic() - slot->suspend_time));
  }
}

/** Release a slot in the lock_sys_t::waiting_threads. Adjust the array last
 pointer if there are empty slots towards the end of the table. */
static void lock_wait_table_release_slot(
    srv_slot_t *slot) /*!< in: slot to release */
{
#ifdef UNIV_DEBUG
  srv_slot_t *upper = lock_sys->waiting_threads + srv_max_n_threads;
#endif /* UNIV_DEBUG */

  lock_wait_mutex_enter();
  /* We omit trx_mutex_enter and lock_mutex_enter here, because we are only
  going to touch thr->slot, which is a member used only by lock0wait.cc and
  is sufficiently protected by lock_wait_mutex. Yes, there are readers who
  read thr->slot holding only trx->mutex and lock_sys->mutex, but they do so
  only when they are sure that we were not woken up yet, so our thread can't
  be here. See comments in lock_wait_release_thread_if_suspended() for more
  details. */

  ut_ad(slot->in_use);
  ut_ad(slot->thr != NULL);
  ut_ad(slot->thr->slot != NULL);
  ut_ad(slot->thr->slot == slot);

  /* Must be within the array boundaries. */
  ut_ad(slot >= lock_sys->waiting_threads);
  ut_ad(slot < upper);

  slot->thr->slot = NULL;
  slot->thr = NULL;
  slot->in_use = FALSE;

  /* This operation is guarded by lock_wait_mutex_enter/exit and we don't
  care about its relative ordering with other operations in this critical
  section. */
  lock_sys->n_waiting.fetch_sub(1, std::memory_order_relaxed);

  /* Scan backwards and adjust the last free slot pointer. */
  for (slot = lock_sys->last_slot;
       slot > lock_sys->waiting_threads && !slot->in_use; --slot) {
    /* No op */
  }

  /* Either the array is empty or the last scanned slot is in use. */
  ut_ad(slot->in_use || slot == lock_sys->waiting_threads);

  lock_sys->last_slot = slot + 1;

  /* The last slot is either outside of the array boundary or it's on an
  empty slot. */
  ut_ad(lock_sys->last_slot == upper || !lock_sys->last_slot->in_use);

  ut_ad(lock_sys->last_slot >= lock_sys->waiting_threads);
  ut_ad(lock_sys->last_slot <= upper);

  lock_wait_mutex_exit();
}
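/* Illustrative sketch (not actual InnoDB code): a model of the invariant
which lock_wait_table_release_slot() above re-establishes. last_slot points
one past the last slot in use, so every slot at or after last_slot is free
and scans of the table may stop at last_slot instead of visiting all
srv_max_n_threads entries. The helper name is hypothetical; it only uses
fields referenced above, and is valid only under lock_wait_mutex.

  static bool lock_wait_table_last_slot_invariant_holds() {
    ut_ad(lock_wait_mutex_own());
    const srv_slot_t *upper = lock_sys->waiting_threads + srv_max_n_threads;
    // every slot at or past last_slot must be free...
    for (const srv_slot_t *s = lock_sys->last_slot; s < upper; ++s) {
      if (s->in_use) return false;
    }
    // ...and last_slot is the array start, or is preceded by a used slot
    return lock_sys->last_slot == lock_sys->waiting_threads ||
           lock_sys->last_slot[-1].in_use;
  }
*/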
/** Counts the number of calls to lock_wait_table_reserve_slot. It is
protected by lock_wait_mutex. The current value of this counter is stored in
the slot a transaction has chosen for sleeping during suspension, and thus
serves as a "reservation number" which can be used to check if the owner of
the slot has changed (perhaps multiple times, in an "ABA" manner). */
static uint64_t lock_wait_table_reservations = 0;

/** Reserves a slot in the thread table for the current user OS thread.
 @return reserved slot */
static srv_slot_t *lock_wait_table_reserve_slot(
    que_thr_t *thr,     /*!< in: query thread associated with the user OS
                        thread */
    ulong wait_timeout) /*!< in: lock wait timeout value */
{
  srv_slot_t *slot;

  ut_ad(lock_wait_mutex_own());
  ut_ad(trx_mutex_own(thr_get_trx(thr)));

  slot = lock_sys->waiting_threads;

  for (ulint i = srv_max_n_threads; i--; ++slot) {
    if (!slot->in_use) {
      slot->reservation_no = lock_wait_table_reservations++;
      slot->in_use = TRUE;
      slot->thr = thr;
      slot->thr->slot = slot;

      if (slot->event == NULL) {
        slot->event = os_event_create(0);
        ut_a(slot->event);
      }

      os_event_reset(slot->event);
      slot->suspended = TRUE;
      slot->suspend_time = ut_time_monotonic();
      slot->wait_timeout = wait_timeout;

      if (slot == lock_sys->last_slot) {
        ++lock_sys->last_slot;
      }

      ut_ad(lock_sys->last_slot <=
            lock_sys->waiting_threads + srv_max_n_threads);

      /* We call lock_wait_request_check_for_cycles() because the node
      representing the `thr` only now becomes visible to the thread which
      analyzes the contents of lock_sys->waiting_threads. The edge itself was
      created by lock_create_wait_for_edge() during RecLock::add_to_waitq()
      or lock_table(), but at that moment the source of the edge was not yet
      in lock_sys->waiting_threads, so the node and the outgoing edge were
      not yet visible. I hope this explains why we do not waste time on
      calling lock_wait_request_check_for_cycles() from
      lock_create_wait_for_edge(). */
      lock_wait_request_check_for_cycles();

      return (slot);
    }
  }

  ib::error(ER_IB_MSG_646)
      << "There appear to be " << srv_max_n_threads
      << " user"
         " threads currently waiting inside InnoDB, which is the upper"
         " limit. Cannot continue operation. Before aborting, we print"
         " a list of waiting threads.";
  lock_wait_table_print();

  ut_error;
}

void lock_wait_request_check_for_cycles() { lock_set_timeout_event(); }
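/* Illustrative sketch (not actual InnoDB code): how the reservation_no set
above is meant to guard readers of the slot table against "ABA" reuse. A
plain `slot->thr == thr` check could pass even if the slot was released and
re-reserved in between; comparing the reservation number recorded at snapshot
time with the current one detects any such turnover, because
lock_wait_table_reservations increases monotonically under lock_wait_mutex.
This is the check performed later by lock_wait_trxs_are_still_in_slots();
the helper name here is hypothetical.

  static bool lock_wait_slot_unchanged_since(const srv_slot_t *slot,
                                             uint64_t seen_reservation_no) {
    ut_ad(lock_wait_mutex_own());
    // if the slot was freed, or freed and reused, reservation_no differs
    return slot->in_use && slot->reservation_no == seen_reservation_no;
  }
*/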
/** Puts a user OS thread to wait for a lock to be released. If an error
 occurs during the wait, trx->error_state associated with thr is != DB_SUCCESS
 when we return. DB_LOCK_WAIT_TIMEOUT and DB_DEADLOCK are possible errors.
 DB_DEADLOCK is returned if selective deadlock resolution chose this
 transaction as a victim. */
void lock_wait_suspend_thread(que_thr_t *thr) /*!< in: query thread associated
                                              with the user OS thread */
{
  srv_slot_t *slot;
  trx_t *trx;
  ibool was_declared_inside_innodb;
  ib_time_monotonic_ms_t start_time = 0;
  ulong lock_wait_timeout;

  trx = thr_get_trx(thr);

  if (trx->mysql_thd != 0) {
    DEBUG_SYNC_C("lock_wait_suspend_thread_enter");
  }

  /* InnoDB system transactions (such as the purge, and incomplete
  transactions that are being rolled back after crash recovery) will use the
  global value of innodb_lock_wait_timeout, because trx->mysql_thd == NULL. */
  lock_wait_timeout = trx_lock_wait_timeout_get(trx);

  lock_wait_mutex_enter();

  trx_mutex_enter(trx);

  trx->error_state = DB_SUCCESS;

  if (thr->state == QUE_THR_RUNNING) {
    ut_ad(thr->is_active);

    /* The lock has already been released or this transaction was chosen as
    a deadlock victim: no need to suspend */

    if (trx->lock.was_chosen_as_deadlock_victim) {
      trx->error_state = DB_DEADLOCK;
      trx->lock.was_chosen_as_deadlock_victim = false;
      ut_d(trx->lock.in_rollback = true);
    }

    lock_wait_mutex_exit();
    trx_mutex_exit(trx);
    return;
  }

  ut_ad(!thr->is_active);

  slot = lock_wait_table_reserve_slot(thr, lock_wait_timeout);

  if (thr->lock_state == QUE_THR_LOCK_ROW) {
    srv_stats.n_lock_wait_count.inc();
    srv_stats.n_lock_wait_current_count.inc();

    start_time = ut_time_monotonic_us();
  }

  /* This operation is guarded by lock_wait_mutex_enter/exit and we don't
  care about its relative ordering with other operations in this critical
  section. */
  lock_sys->n_waiting.fetch_add(1, std::memory_order_relaxed);

  lock_wait_mutex_exit();

  /* We hold trx->mutex here, which is required to call
  lock_set_lock_and_trx_wait. This means that the value in
  trx->lock.wait_lock_type which we are about to read comes from the latest
  call to lock_set_lock_and_trx_wait before we obtained the trx->mutex, which
  is precisely what we want for our stats */
  auto lock_type = trx->lock.wait_lock_type;
  trx_mutex_exit(trx);

  ulint had_dict_lock = trx->dict_operation_lock_mode;

  switch (had_dict_lock) {
    case 0:
      break;
    case RW_S_LATCH:
      /* Release foreign key check latch */
      row_mysql_unfreeze_data_dictionary(trx);

      DEBUG_SYNC_C("lock_wait_release_s_latch_before_sleep");
      break;
    case RW_X_LATCH:
      /* We may wait for rec lock in dd holding dict_operation_lock for
      creating FTS AUX table */
      ut_ad(!mutex_own(&dict_sys->mutex));
      rw_lock_x_unlock(dict_operation_lock);
      break;
  }

  /* Suspend this thread and wait for the event. */

  was_declared_inside_innodb = trx->declared_to_be_inside_innodb;

  if (was_declared_inside_innodb) {
    /* We must declare this OS thread to exit InnoDB, since a possible other
    thread holding a lock which this thread waits for must be allowed to
    enter, sooner or later */

    srv_conc_force_exit_innodb(trx);
  }

  ut_a(lock_type == LOCK_REC || lock_type == LOCK_TABLE);
  thd_wait_begin(trx->mysql_thd, lock_type == LOCK_REC ? THD_WAIT_ROW_LOCK
                                                       : THD_WAIT_TABLE_LOCK);

  DEBUG_SYNC_C("lock_wait_will_wait");

  os_event_wait(slot->event);

  DEBUG_SYNC_C("lock_wait_has_finished_waiting");

  thd_wait_end(trx->mysql_thd);

  /* After resuming, reacquire the data dictionary latch if necessary. */

  if (was_declared_inside_innodb) {
    /* Return back inside InnoDB */

    srv_conc_force_enter_innodb(trx);
  }

  if (had_dict_lock == RW_S_LATCH) {
    row_mysql_freeze_data_dictionary(trx);
  } else if (had_dict_lock == RW_X_LATCH) {
    rw_lock_x_lock(dict_operation_lock);
  }

  const auto wait_time = ut_time_monotonic() - slot->suspend_time;

  /* Release the slot for others to use */

  lock_wait_table_release_slot(slot);

  if (thr->lock_state == QUE_THR_LOCK_ROW) {
    const auto finish_time = ut_time_monotonic_us();

    const uint64_t diff_time =
        (finish_time > start_time) ? (uint64_t)(finish_time - start_time) : 0;

    srv_stats.n_lock_wait_current_count.dec();
    srv_stats.n_lock_wait_time.add(diff_time);

    /* Only update the variable if we successfully retrieved the start and
    finish times. See Bug#36819. */
    if (diff_time > lock_sys->n_lock_max_wait_time && start_time != 0) {
      lock_sys->n_lock_max_wait_time = diff_time;
    }

    /* Record the lock wait time for this thread */
    thd_set_lock_wait_time(trx->mysql_thd, diff_time);

    DBUG_EXECUTE_IF("lock_instrument_slow_query_log", os_thread_sleep(1000););
  }

  /* The transaction was chosen as a deadlock victim while it slept. */
  if (trx->error_state == DB_DEADLOCK) {
    ut_d(trx->lock.in_rollback = true);
    return;
  }

  if (lock_wait_timeout < 100000000 && wait_time > (double)lock_wait_timeout &&
      !trx_is_high_priority(trx)) {
    trx->error_state = DB_LOCK_WAIT_TIMEOUT;

    MONITOR_INC(MONITOR_TIMEOUT);
  }

  if (trx_is_interrupted(trx)) {
    trx->error_state = DB_INTERRUPTED;
  }
}
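/* Illustrative sketch (not actual InnoDB code): a condensed model of the
sleep/wake handshake between lock_wait_suspend_thread() above and
lock_wait_release_thread_if_suspended() below. The waiter re-checks
thr->state under trx->mutex before committing to sleep, and the waker
publishes QUE_THR_RUNNING (via que_thr_end_lock_wait()) and checks thr->slot
under the same mutex, so exactly one side observes the other's write and no
wakeup can be lost:

    waiter (suspend)                       waker (release)
    ----------------                       ---------------
    trx_mutex_enter(trx);                  trx_mutex_enter(trx);
    if (thr->state == QUE_THR_RUNNING)     thr->state = QUE_THR_RUNNING;
      return;  // already granted          if (thr->slot != NULL)
    reserve slot; thr->slot = slot;          os_event_set(thr->slot->event);
    trx_mutex_exit(trx);                   trx_mutex_exit(trx);
    os_event_wait(slot->event);
*/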
/** Releases a user OS thread waiting for a lock to be released, if the
thread is already suspended. Please do not call it directly, but rather use
the lock_reset_wait_and_release_thread_if_suspended() wrapper.
@param[in]  thr  query thread associated with the user OS thread */
static void lock_wait_release_thread_if_suspended(que_thr_t *thr) {
  auto trx = thr_get_trx(thr);
  /* We need a guarantee that for each time a thread is suspended there is at
  most one time it gets released - or more precisely: that there is at most
  one reason for it to be woken up. Otherwise it could happen that two
  different threads will think that they successfully woke up the
  transaction, and that the transaction understands that the reason it was
  woken up is the one they had in mind, say: one thread woke it up because of
  a deadlock and another because of a timeout. If the two reasons require
  different behavior after waking up, then we will be in trouble. The current
  implementation makes sure that we wake up a thread only once by observing
  several rules:
  1. the only way to wake up a trx is to call os_event_set
  2. the only call to os_event_set is in lock_wait_release_thread_if_suspended
  3. calls to lock_wait_release_thread_if_suspended are always performed
     after a call to lock_reset_lock_and_trx_wait(lock), and the sequence of
     the two is in a critical section guarded by lock_mutex_enter
  4. lock_reset_lock_and_trx_wait(lock) asserts that
     lock->trx->lock.wait_lock == lock and sets
     lock->trx->lock.wait_lock = NULL
  Together all these facts imply that it is impossible for a single trx to be
  woken up twice (unless it got to sleep again), because doing so requires
  resetting wait_lock to NULL.
  We now hold exclusive lock_sys latch. */
  ut_ad(lock_mutex_own());
  ut_ad(trx_mutex_own(trx));

  /* We don't need the lock_wait_mutex here, because we know that the thread
  had a reason to go to sleep (we have seen trx->lock.wait_lock != NULL), and
  we know that we are the first ones to wake it up (we are the thread which
  has changed trx->lock.wait_lock to NULL), so it either sleeps, or has not
  yet started the sleep. We hold the trx->mutex which is required to go to
  sleep. So, while holding the trx->mutex, we can check if thr->slot is
  already assigned, and if so, then we need to wake up the thread. If
  thr->slot is not yet assigned, then we know that the thread has not yet
  gone to sleep, and we know that before doing so it will need to acquire
  trx->mutex and will verify once more if it has to go to sleep, by checking
  if thr->state is QUE_THR_RUNNING, which we have indeed already set before
  calling lock_wait_release_thread_if_suspended - so we don't need to do
  anything in this case: the trx will simply not go to sleep. */

  ut_ad(thr->state == QUE_THR_RUNNING);
  ut_ad(trx->lock.wait_lock == NULL);

  if (thr->slot != NULL && thr->slot->in_use && thr->slot->thr == thr) {
    if (trx->lock.was_chosen_as_deadlock_victim) {
      trx->error_state = DB_DEADLOCK;
      trx->lock.was_chosen_as_deadlock_victim = false;
      ut_d(trx->lock.in_rollback = true);
    }

    os_event_set(thr->slot->event);
  }
}
void lock_reset_wait_and_release_thread_if_suspended(lock_t *lock) {
  ut_ad(lock_mutex_own());
  ut_ad(trx_mutex_own(lock->trx));
  ut_ad(lock->trx->lock.wait_lock == lock);

  /* Reset the wait flag and the back pointer to lock in trx */

  lock_reset_lock_and_trx_wait(lock);

  /* We clear blocking_trx here and not in lock_reset_lock_and_trx_wait(), as
  lock_reset_lock_and_trx_wait() is called also when the wait_lock is being
  moved from one page to another during B-tree reorganization, in which case
  blocking_trx should not change - in such cases a new wait lock is created
  and assigned to trx->lock.wait_lock, but the information about the blocking
  trx is not so easy to restore, so it is easier to simply not clear
  blocking_trx until we are 100% sure that we want to wake up the trx, which
  is now. Actually, clearing blocking_trx is not strictly required from a
  correctness perspective; it rather serves for:
  1. performance optimization, as lock_wait_snapshot_waiting_threads() can
     omit this trx when building the wait-for graph
  2. debugging, as resetting blocking_trx makes it easier to spot that it was
     not properly set on subsequent waits. */
  lock->trx->lock.blocking_trx.store(nullptr);

  /* We only release locks for which someone is waiting, and the trx which
  decided to wait for the lock should have already set trx->lock.que_state to
  TRX_QUE_LOCK_WAIT and called que_thr_stop() before releasing the lock-sys
  latch. */
  ut_ad(lock->trx_que_state() == TRX_QUE_LOCK_WAIT);

  /* The following function releases the trx from lock wait */

  que_thr_t *thr = que_thr_end_lock_wait(lock->trx);

  if (thr != nullptr) {
    lock_wait_release_thread_if_suspended(thr);
  }
}
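/* Illustrative sketch (not actual InnoDB code): the call order which the two
functions above rely on. Resetting trx->lock.wait_lock to NULL under the
exclusive lock_sys latch is what makes the subsequent wakeup unique: a second
would-be waker cannot get past the wait_lock != NULL precondition again. A
caller granting a lock is expected to follow this schematic pattern (the
surrounding steps are simplified):

    lock_mutex_enter();
    trx_mutex_enter(waiting_trx);
    // 1. wait_lock != NULL proves the trx still waits and nobody woke it yet
    if (waiting_trx->lock.wait_lock != NULL) {
      // 2. resets wait_lock to NULL and, if suspended, sets the slot event
      lock_reset_wait_and_release_thread_if_suspended(
          waiting_trx->lock.wait_lock);
    }
    trx_mutex_exit(waiting_trx);
    lock_mutex_exit();
*/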
/** Check if the thread lock wait has timed out. Release its locks if the
 wait has actually timed out. */
static void lock_wait_check_and_cancel(
    const srv_slot_t *slot) /*!< in: slot reserved by a user thread when
                            the wait started */
{
  trx_t *trx;

  const auto suspend_time = slot->suspend_time;

  ut_ad(lock_wait_mutex_own());

  ut_ad(slot->in_use);

  ut_ad(slot->suspended);

  const auto wait_time = ut_time_monotonic() - suspend_time;

  trx = thr_get_trx(slot->thr);

  if (trx_is_interrupted(trx) ||
      (slot->wait_timeout < 100000000 &&
       (wait_time > (int64_t)slot->wait_timeout || wait_time < 0))) {
    /* Timeout exceeded or a wrap-around in the system time counter: cancel
    the lock request queued by the transaction and release possible other
    transactions waiting behind; it is possible that the lock has already
    been granted: in that case do nothing */

    lock_mutex_enter();

    trx_mutex_enter(trx);

    if (trx->lock.wait_lock != NULL && !trx_is_high_priority(trx)) {
      ut_a(trx->lock.que_state == TRX_QUE_LOCK_WAIT);

      lock_cancel_waiting_and_release(trx->lock.wait_lock, false);
    }

    lock_mutex_exit();

    trx_mutex_exit(trx);
  }
}

/** A snapshot of information about a single slot which was in use at the
moment of taking the snapshot */
struct waiting_trx_info_t {
  /** The transaction which was using this slot. */
  trx_t *trx;
  /** The transaction the owner of the slot is waiting for. */
  trx_t *waits_for;
  /** The slot this info is about */
  srv_slot_t *slot;
  /** The slot->reservation_no at the moment of taking the snapshot */
  uint64_t reservation_no;
};

/** As we want to quickly find a given trx_t within the snapshot, we use a
sorting criterion which is based on trx only. We use the pointer address, as
any deterministic rule without ties will do. */
bool operator<(const waiting_trx_info_t &a, const waiting_trx_info_t &b) {
  return a.trx < b.trx;
}

/** Check all slots for user threads that are waiting on locks, and whether
 they have exceeded the time limit. */
static void lock_wait_check_slots_for_timeouts() {
  ut_ad(!lock_wait_mutex_own());

  lock_wait_mutex_enter();

  for (auto slot = lock_sys->waiting_threads; slot < lock_sys->last_slot;
       ++slot) {
    /* We are doing a read without the lock mutex and/or the trx mutex. This
    is OK because a slot can't be freed or reserved without the lock wait
    mutex. */
    if (slot->in_use) {
      lock_wait_check_and_cancel(slot);
    }
  }

  lock_wait_mutex_exit();
}
/** Takes a snapshot of the content of slots which are in use
@param[out]  infos  will contain the information about slots which are in use
*/
static void lock_wait_snapshot_waiting_threads(
    ut::vector<waiting_trx_info_t> &infos) {
  ut_ad(!lock_wait_mutex_own());
  infos.clear();
  lock_wait_mutex_enter();
  /* We own lock_wait_mutex, which protects lock_wait_table_reservations and
  reservation_no. We want to make a snapshot of the wait-for graph as quickly
  as possible, to not keep the lock_wait_mutex too long. Anything more fancy
  than push_back seems to impact performance.

  Note: one should be able to prove that we don't really need a "consistent"
  snapshot - the algorithm should still work if we split the loop into
  several smaller "chunks" snapshotted independently and stitched together.
  Care must be taken to "merge" duplicates, keeping the freshest version
  (reservation_no) of the slot for each trx. So, if (in the future) this loop
  turns out to be a bottleneck (say, by increasing congestion on
  lock_wait_mutex), one can try to release and reacquire the lock every X
  iterations, and modify lock_wait_build_wait_for_graph() to handle
  duplicates in a smart way. */
  for (auto slot = lock_sys->waiting_threads; slot < lock_sys->last_slot;
       ++slot) {
    if (slot->in_use) {
      auto from = thr_get_trx(slot->thr);
      auto to = from->lock.blocking_trx.load();
      if (to != nullptr) {
        infos.push_back({from, to, slot, slot->reservation_no});
      }
    }
  }
  lock_wait_mutex_exit();
}

/** Analyzes the content of the snapshot with information about slots in use,
and builds a (subset of) list of edges from waiting transactions to blocking
transactions, such that for each waiter we have one outgoing edge.
@param[in]   infos     information about all waiting transactions
@param[out]  outgoing  The outgoing[from] will contain either the index such
                       that infos[outgoing[from]].trx is the reason
                       infos[from].trx has to wait, or -1 if the reason for
                       waiting is not among transactions in infos[].trx */
static void lock_wait_build_wait_for_graph(
    ut::vector<waiting_trx_info_t> &infos, ut::vector<int> &outgoing) {
  /* We are going to use int and uint to store positions within infos */
  ut_ad(infos.size() < (1 << 20));
  const auto n = static_cast<uint>(infos.size());
  outgoing.clear();
  outgoing.resize(n, -1);
  /* This particular implementation sorts infos by ::trx, and then uses
  lower_bound to find the index in infos corresponding to ::waits_for, which
  has O(n lg n) complexity and modifies infos, but has a nice property of
  avoiding any allocations.

  An alternative O(n) approach would be to use a hash table to map
  infos[i].trx to i. Using unordered_map however causes too many
  (de)allocations, as its bucket chains are implemented as linked lists -
  overall it works much slower than sort. The fastest implementation was a
  custom hash table with open addressing and double hashing, with a
  statically allocated 2 * srv_max_n_threads buckets. This however did not
  increase transactions per second, so introducing a custom implementation
  seems unjustified here. */
  std::sort(infos.begin(), infos.end());
  waiting_trx_info_t needle{};
  for (uint from = 0; from < n; ++from) {
    ut_ad(from == 0 || infos[from - 1].trx < infos[from].trx);
    needle.trx = infos[from].waits_for;
    auto it = std::lower_bound(infos.begin(), infos.end(), needle);
    if (it == infos.end() || it->trx != needle.trx) {
      continue;
    }
    auto to = it - infos.begin();
    ut_ad(from != static_cast<uint>(to));
    outgoing[from] = static_cast<int>(to);
  }
}
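/* Illustrative sketch (not actual InnoDB code): a self-contained toy version
of the sort + lower_bound technique used by lock_wait_build_wait_for_graph()
above, with ints standing in for trx_t pointers. Sorting once and then
binary-searching each `waits_for` value resolves every edge in O(n lg n)
with no extra allocation:

  #include <algorithm>
  #include <vector>
  struct Edge { int trx; int waits_for; };
  inline bool operator<(const Edge &a, const Edge &b) { return a.trx < b.trx; }

  inline std::vector<int> build_outgoing(std::vector<Edge> edges) {
    std::sort(edges.begin(), edges.end());
    std::vector<int> outgoing(edges.size(), -1);
    for (size_t from = 0; from < edges.size(); ++from) {
      const Edge needle{edges[from].waits_for, 0};
      auto it = std::lower_bound(edges.begin(), edges.end(), needle);
      if (it != edges.end() && it->trx == needle.trx) {
        outgoing[from] = static_cast<int>(it - edges.begin());
      }
    }
    return outgoing;
  }
  // e.g. edges {1->2, 2->3, 3->1} yield outgoing {1, 2, 0}: a cycle
*/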
/** Notifies the chosen_victim that it should roll back
@param[in,out]  chosen_victim  the transaction that should be rolled back */
static void lock_wait_rollback_deadlock_victim(trx_t *chosen_victim) {
  ut_ad(!trx_mutex_own(chosen_victim));
  /* The call to lock_cancel_waiting_and_release requires an exclusive latch
  on the whole lock_sys in case of table locks. */
  ut_ad(lock_mutex_own());
  trx_mutex_enter(chosen_victim);

  chosen_victim->lock.was_chosen_as_deadlock_victim = true;

  ut_a(chosen_victim->lock.wait_lock);
  ut_a(chosen_victim->lock.que_state == TRX_QUE_LOCK_WAIT);

  lock_cancel_waiting_and_release(chosen_victim->lock.wait_lock, true);

  trx_mutex_exit(chosen_victim);
}

/** Given the `infos` about transactions and indexes in `infos` which form a
deadlock cycle, identifies the transaction with the largest `reservation_no`,
that is the one which was the latest to join the cycle.
@param[in]  cycle_ids  indexes in the `infos` array, of transactions forming
                       the deadlock cycle
@param[in]  infos      information about all waiting transactions
@return position in cycle_ids, such that
infos[cycle_ids[pos]].reservation_no is the largest */
static size_t lock_wait_find_latest_pos_on_cycle(
    const ut::vector<uint> &cycle_ids,
    const ut::vector<waiting_trx_info_t> &infos) {
  size_t latest_pos = 0;
  for (size_t pos = 1; pos < cycle_ids.size(); ++pos) {
    if (infos[cycle_ids[latest_pos]].reservation_no <
        infos[cycle_ids[pos]].reservation_no) {
      latest_pos = pos;
    }
  }
  return latest_pos;
}

/** Rotates the deadlock cycle so that it starts from the desired item.
@param[in]  first_pos  the position which should become first after rotation
@param[in]  cycle_ids  the array to rotate
@return a copy of cycle_ids rotated in such a way, that the element which was
at position `first_pos` is now the first */
static ut::vector<uint> lock_wait_rotate_so_pos_is_first(
    size_t first_pos, const ut::vector<uint> &cycle_ids) {
  auto rotated_ids = cycle_ids;
  std::rotate(rotated_ids.begin(), rotated_ids.begin() + first_pos,
              rotated_ids.end());
  return rotated_ids;
}

/** A helper which extracts transactions with given indexes from the `infos`
array.
@param[in]  ids    indexes of transactions in the `infos` array to extract
@param[in]  infos  information about all waiting transactions
@return an array formed by infos[ids[i]].trx */
template <typename T>
static ut::vector<trx_t *> lock_wait_map_ids_to_trxs(
    const ut::vector<T> &ids, const ut::vector<waiting_trx_info_t> &infos) {
  ut::vector<trx_t *> trxs;
  trxs.reserve(ids.size());
  for (auto id : ids) {
    trxs.push_back(infos[id].trx);
  }
  return trxs;
}

/** Orders the transactions from the deadlock cycle in a backward-compatible
way, from the point of view of the algorithm which picks the victim. From a
correctness point of view this could be a no-op, but we have test cases which
assume some determinism in which trx is selected as the victim. These tests
usually depend on the old behaviour, in which the order in which trxs
attempted to wait was meaningful. In the past we have only considered two
candidates for victim:
(a) the transaction which closed the cycle by adding the last wait-for edge
(b) the transaction which is waiting for (a)
and it was favored to pick (a) in case of ties. To make these tests pass, we
find the trx with the most recent reservation_no (a), and the one before it
in the cycle (b), and move them to the end of the collection, so that we
consider them in the order ...,...,...,(b),(a), resolving ties by picking the
latest one as the victim. In other words, we rotate the cycle so that the
most recent waiter becomes the last one.
@param[in]  cycle_ids  indexes in the `infos` array, of transactions forming
                       the deadlock cycle
@param[in]  infos      information about all waiting transactions
@return transactions from the deadlock cycle sorted in the optimal way for
choosing a victim */
static ut::vector<trx_t *> lock_wait_order_for_choosing_victim(
    const ut::vector<uint> &cycle_ids,
    const ut::vector<waiting_trx_info_t> &infos) {
  size_t latest_pos = lock_wait_find_latest_pos_on_cycle(cycle_ids, infos);
  size_t first_pos = (latest_pos + 1) % cycle_ids.size();
  return lock_wait_map_ids_to_trxs(
      lock_wait_rotate_so_pos_is_first(first_pos, cycle_ids), infos);
}
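/* Illustrative sketch (not actual InnoDB code): a small worked example of
the rotation performed by lock_wait_order_for_choosing_victim() above.
Suppose the cycle visits infos entries {3, 1, 2} in wait-for order, and entry
3 was the latest to join (largest reservation_no), so (a) = 3 and (b) = the
entry waiting for it, 2:

  #include <algorithm>
  #include <vector>
  std::vector<unsigned> ids{3, 1, 2};        // latest_pos == 0
  size_t first_pos = (0 + 1) % ids.size();   // == 1
  std::rotate(ids.begin(), ids.begin() + first_pos, ids.end());
  // ids == {1, 2, 3}: (b) = 2 and (a) = 3 are now the last two entries,
  // matching the ..., (b), (a) order described above, so ties are broken
  // in favor of the most recent waiter.
*/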
/** Given an array with information about all waiting transactions and
indexes in it which form a deadlock cycle, picks the transaction to roll
back.
@param[in]  cycle_ids  indexes in the `infos` array, of transactions forming
                       the deadlock cycle
@param[in]  infos      information about all waiting transactions
@return the transaction chosen as a victim */
static trx_t *lock_wait_choose_victim(
    const ut::vector<uint> &cycle_ids,
    const ut::vector<waiting_trx_info_t> &infos) {
  /* We are iterating over various transactions, comparing their
  trx_weight_ge, which is computed based on the number of locks held, thus we
  need an exclusive latch on the whole lock_sys. In theory the number of
  locks should not change while the transaction is waiting, but instead of
  proving that they cannot wake up, it is easier to assert that we hold the
  mutex */
  ut_ad(lock_mutex_own());
  ut_ad(!cycle_ids.empty());
  trx_t *chosen_victim = nullptr;
  auto sorted_trxs = lock_wait_order_for_choosing_victim(cycle_ids, infos);

  for (auto *trx : sorted_trxs) {
    if (chosen_victim == nullptr) {
      chosen_victim = trx;
      continue;
    }

    if (trx_is_high_priority(chosen_victim) || trx_is_high_priority(trx)) {
      auto victim = trx_arbitrate(trx, chosen_victim);

      if (victim != nullptr) {
        if (victim == trx) {
          chosen_victim = trx;
        } else {
          ut_a(victim == chosen_victim);
        }

        continue;
      }
    }

    if (trx_weight_ge(chosen_victim, trx)) {
      /* The joining transaction is 'smaller', choose it as the victim and
      roll it back. */

      chosen_victim = trx;
    }
  }

  ut_a(chosen_victim);
  return chosen_victim;
}
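/* Illustrative sketch (not actual InnoDB code): a toy model of the tie
breaking performed by the trx_weight_ge() fold in lock_wait_choose_victim()
above. Because the current choice is replaced whenever it weighs at least as
much as the next candidate, the scan keeps the lightest element, and among
equally light elements the one seen last wins - which is why the cycle was
rotated so that (b),(a) come last.

  #include <vector>
  // returns the index of the victim, mimicking the fold over sorted_trxs
  inline size_t choose_victim_index(const std::vector<int> &weights) {
    size_t chosen = 0;
    for (size_t i = 1; i < weights.size(); ++i) {
      if (weights[chosen] >= weights[i]) {  // trx_weight_ge(chosen, trx)
        chosen = i;
      }
    }
    return chosen;
  }
  // choose_victim_index({3, 1, 2, 1}) == 3: the lightest weight is 1, and
  // the later of the two candidates with weight 1 is picked.
*/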
/** Given an array with information about all waiting transactions and
indexes in it which form a deadlock cycle, checks if the transactions
allegedly forming the deadlock have actually stayed in their slots since
we've last checked, as opposed to, say, leaving the slot and/or re-entering
it (an "ABA" situation). This is done by comparing the current
reservation_no for each slot with the reservation_no from the `infos`
snapshot.
@param[in]  cycle_ids  indexes in the `infos` array, of transactions forming
                       the deadlock cycle
@param[in]  infos      information about all waiting transactions
@return true if all given transactions resided in their slots for the whole
time since the snapshot was taken, and in particular are still there right
now */
static bool lock_wait_trxs_are_still_in_slots(
    const ut::vector<uint> &cycle_ids,
    const ut::vector<waiting_trx_info_t> &infos) {
  ut_ad(lock_wait_mutex_own());
  for (auto id : cycle_ids) {
    const auto slot = infos[id].slot;
    if (!slot->in_use || slot->reservation_no != infos[id].reservation_no) {
      return false;
    }
    ut_ad(thr_get_trx(slot->thr) == infos[id].trx);
  }
  return true;
}

/** Given an array with information about all waiting transactions and
indexes in it which form a deadlock cycle, checks if the transactions
allegedly forming the deadlock are actually still waiting for a lock, as
opposed to having already been notified about the lock being granted or a
timeout, while still being present in the slot. This is done by checking
trx->lock.wait_lock under the lock_sys mutex.
@param[in]  cycle_ids  indexes in the `infos` array, of transactions forming
                       the deadlock cycle
@param[in]  infos      information about all waiting transactions
@return true if all given transactions are still waiting for locks */
static bool lock_wait_trxs_are_still_waiting(
    const ut::vector<uint> &cycle_ids,
    const ut::vector<waiting_trx_info_t> &infos) {
  ut_ad(lock_wait_mutex_own());
  /* We are iterating over various transactions which may have locks in
  different tables/rows, thus we need an exclusive latch on the whole
  lock_sys to make sure no one will wake them up (say, a high priority trx
  could abort them) */
  ut_ad(lock_mutex_own());
  for (auto id : cycle_ids) {
    const auto trx = infos[id].trx;
    if (trx->lock.wait_lock == nullptr) {
      /* trx is on its way to being woken up, so this cycle is a false
      positive. As this particular cycle will resolve itself, we ignore it. */
      return false;
    }
    ut_a(trx->lock.que_state == TRX_QUE_LOCK_WAIT);
  }
  return true;
}

/** A helper function which rotates the deadlock cycle, so that the order of
transactions in it is suitable for notification. From a correctness
perspective this could be a no-op, but we have test cases which assume some
determinism in the order in which trxs from the cycle are reported. These
tests usually depend on the old behaviour, in which the order in which
transactions attempted to wait was meaningful. In the past we reported:
- the transaction which closed the cycle by adding the last wait-for edge
  as (2)
- the transaction which is waiting for (2) as (1)
To make these tests pass, we find the trx with the most recent reservation_no
(2), and the one before it in the cycle (1), and move (1) to the beginning.
@param[in]  cycle_ids  indexes in the `infos` array, of transactions forming
                       the deadlock cycle
@param[in]  infos      information about all waiting transactions
@return the indexes from cycle_ids rotated in a backward-compatible way */
static ut::vector<uint> lock_wait_rotate_cycle_ids_for_notification(
    const ut::vector<uint> &cycle_ids,
    const ut::vector<waiting_trx_info_t> &infos) {
  size_t latest_pos = lock_wait_find_latest_pos_on_cycle(cycle_ids, infos);
  size_t previous_pos = (latest_pos - 1 + cycle_ids.size()) % cycle_ids.size();
  return lock_wait_rotate_so_pos_is_first(previous_pos, cycle_ids);
}

/** A helper function which rotates the deadlock cycle and maps it to
transactions, so that their order is suitable for notification. From a
correctness perspective this could be a no-op, but we have tests which depend
on deterministic output from such notifications, and we want to be backward
compatible.
@param[in]  cycle_ids  indexes in the `infos` array, of transactions forming
                       the deadlock cycle
@param[in]  infos      information about all waiting transactions
@return the transactions from cycle_ids rotated in a backward-compatible way
*/
static ut::vector<trx_t *> lock_wait_trxs_rotated_for_notification(
    const ut::vector<uint> &cycle_ids,
    const ut::vector<waiting_trx_info_t> &infos) {
  return lock_wait_map_ids_to_trxs(
      lock_wait_rotate_cycle_ids_for_notification(cycle_ids, infos), infos);
}
/** Handles a deadlock found, by notifying about it, and rolling back the
chosen victim.
@param[in,out]  chosen_victim  the transaction to roll back
@param[in]      cycle_ids      indexes in the `infos` array, of transactions
                               forming the deadlock cycle
@param[in]      infos          information about all waiting transactions */
static void lock_wait_handle_deadlock(
    trx_t *chosen_victim, const ut::vector<uint> &cycle_ids,
    const ut::vector<waiting_trx_info_t> &infos) {
  lock_notify_about_deadlock(
      lock_wait_trxs_rotated_for_notification(cycle_ids, infos),
      chosen_victim);

  lock_wait_rollback_deadlock_victim(chosen_victim);
}

/** Given an array with information about all waiting transactions and
indexes in it which form a deadlock cycle, checks if the transactions
allegedly forming the deadlock cycle indeed are still waiting, and if so,
chooses a victim and handles the deadlock.
@param[in]  cycle_ids  indexes in the `infos` array, of transactions forming
                       the deadlock cycle
@param[in]  infos      information about all waiting transactions
@return true if the cycle found was indeed a deadlock cycle, false if it was
a false positive */
static bool lock_wait_check_candidate_cycle(
    ut::vector<uint> &cycle_ids, const ut::vector<waiting_trx_info_t> &infos) {
  ut_ad(!lock_wait_mutex_own());
  ut_ad(!lock_mutex_own());
  lock_wait_mutex_enter();
  /* We have released all mutexes after we have built the `infos` snapshot
  and before we've got here. So, while it is true that the edges formed a
  cycle, it may also be true that some of these transactions were already
  rolled back, and the memory pointed to by infos[i].trx or
  infos[i].waits_for is no longer the trx it used to be (as we reuse trx_t
  objects). It may even segfault if we try to access it (because the trx_t
  object could be freed). So we need to somehow verify that the pointer is
  still valid without accessing it. We do that by checking if
  slot->reservation_no has changed since taking the snapshot. If it has not
  changed, then we know that the trx pointer still points to the same trx,
  as the trx is sleeping, and thus has not finished and wasn't freed.

  So, we start by first checking that the slots still contain the trxs we
  are interested in. This requires the lock_wait_mutex, but not the
  lock_mutex. */
  if (!lock_wait_trxs_are_still_in_slots(cycle_ids, infos)) {
    lock_wait_mutex_exit();
    return false;
  }
  /* At this point we are sure that we can access the memory pointed to by
  infos[i].trx, and that the transactions are still in their slots. (And, as
  `cycle_ids` is a cycle, we also know that infos[cycle_ids[i]].waits_for is
  equal to infos[cycle_ids[i+1]].trx, so infos[cycle_ids[i]].waits_for can
  also be safely accessed.)

  This however does not necessarily mean that they are still waiting. They
  might have already been notified that they should wake up (by a call to
  lock_wait_release_thread_if_suspended()), but have not yet had a chance to
  act upon it (it is the trx being woken up who is responsible for cleaning
  up the `slot` it used). So, the slot can still be in use and contain a
  transaction which, for example, was already decided to be rolled back.
  However, we can recognize this situation by looking at
  trx->lock.wait_lock, as each call to
  lock_wait_release_thread_if_suspended() is performed only after
  lock_reset_lock_and_trx_wait() resets trx->lock.wait_lock to NULL.
  Checking trx->lock.wait_lock must be done under lock_mutex. */
  lock_mutex_enter();
  if (!lock_wait_trxs_are_still_waiting(cycle_ids, infos)) {
    lock_mutex_exit();
    lock_wait_mutex_exit();
    return false;
  }

  /* We can now release the lock_wait_mutex, because:
  1. we have verified that trx->lock.wait_lock is not NULL for cycle_ids
  2. we hold lock_sys->mutex
  3. lock_sys->mutex is required to change trx->lock.wait_lock to NULL
  4. only after changing trx->lock.wait_lock to NULL can a trx finish
  So as long as we hold lock_sys->mutex we can access the trxs. */
  lock_wait_mutex_exit();

  trx_t *const chosen_victim = lock_wait_choose_victim(cycle_ids, infos);
  ut_a(chosen_victim);
  lock_wait_handle_deadlock(chosen_victim, cycle_ids, infos);
  lock_mutex_exit();
  return true;
}
/** Generates a list of `cycle_ids` by following the `outgoing` edges from
the `start`.
@param[in,out]  cycle_ids  will contain the list of ids on the cycle
@param[in]      start      the first id on the cycle
@param[in]      outgoing   the ids of edge endpoints: id -> outgoing[id] */
static void lock_wait_extract_cycle_ids(ut::vector<uint> &cycle_ids,
                                        const uint start,
                                        const ut::vector<int> &outgoing) {
  cycle_ids.clear();
  uint id = start;
  do {
    cycle_ids.push_back(id);
    id = outgoing[id];
  } while (id != start);
}

/** Assuming that `infos` contains information about all waiting
transactions, and `outgoing[i]` is the endpoint of the wait-for edge going
out of infos[i].trx, or -1 if the transaction is not waiting, identifies and
handles all cycles in the wait-for graph.
@param[in]  infos     information about all waiting transactions
@param[in]  outgoing  the ids of edge endpoints. If outgoing[id] == -1, then
                      there is no edge going out of id, otherwise
                      infos[id].trx waits for infos[outgoing[id]].trx */
static void lock_wait_find_and_handle_deadlocks(
    const ut::vector<waiting_trx_info_t> &infos,
    const ut::vector<int> &outgoing) {
  ut_ad(infos.size() == outgoing.size());
  /* We are going to use int and uint to store positions within infos */
  ut_ad(infos.size() < (1 << 20));
  const auto n = static_cast<uint>(infos.size());
  ut::vector<uint> cycle_ids;
  cycle_ids.clear();
  ut::vector<uint> colors;
  colors.clear();
  colors.resize(n, 0);
  uint current_color = 0;
  for (uint start = 0; start < n; ++start) {
    if (colors[start] != 0) {
      /* This node was already fully processed */
      continue;
    }
    ++current_color;
    for (int id = start; 0 <= id; id = outgoing[id]) {
      /* We don't expect a transaction to deadlock with itself only, and we
      do not handle cycles of length = 1 correctly */
      ut_ad(id != outgoing[id]);

      if (colors[id] == 0) {
        /* This node was never visited yet */
        colors[id] = current_color;
        continue;
      }
      /* This node was already visited:
      - either it has current_color, which means we've visited it during the
        current DFS descent, which means we have found a cycle, which we
        need to verify,
      - or it has a color used in a previous DFS, which means that the
        current DFS path merges into an already processed portion of the
        wait-for graph, so we can stop now */
      if (colors[id] == current_color) {
        /* found a candidate cycle! */
        lock_wait_extract_cycle_ids(cycle_ids, id, outgoing);

        if (lock_wait_check_candidate_cycle(cycle_ids, infos)) {
          MONITOR_INC(MONITOR_DEADLOCK);
        } else {
          MONITOR_INC(MONITOR_DEADLOCK_FALSE_POSITIVES);
        }
      }

      break;
    }
  }
  MONITOR_INC(MONITOR_DEADLOCK_ROUNDS);
  MONITOR_SET(MONITOR_LOCK_THREADS_WAITING, n);
}
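/* Illustrative sketch (not actual InnoDB code): a self-contained toy
walkthrough of the coloring scheme used by
lock_wait_find_and_handle_deadlocks() above. Each outer iteration picks a
fresh color and follows outgoing edges; hitting the current color means we
closed a cycle, hitting an older color means we merged into a path explored
before:

  #include <vector>
  // returns the ids of one detected cycle, or an empty vector
  inline std::vector<int> find_cycle(const std::vector<int> &outgoing) {
    std::vector<unsigned> colors(outgoing.size(), 0);
    unsigned current_color = 0;
    for (size_t start = 0; start < outgoing.size(); ++start) {
      if (colors[start] != 0) continue;
      ++current_color;
      for (int id = static_cast<int>(start); 0 <= id; id = outgoing[id]) {
        if (colors[id] == 0) {
          colors[id] = current_color;
          continue;
        }
        if (colors[id] == current_color) {  // closed a cycle on this path
          std::vector<int> cycle;
          for (int c = id; cycle.empty() || c != id; c = outgoing[c])
            cycle.push_back(c);
          return cycle;
        }
        break;  // merged into an already-colored region
      }
    }
    return {};
  }
  // find_cycle({1, 2, 0, 1}) == {0, 1, 2}: nodes 0, 1, 2 wait in a ring,
  // while node 3 merges into it and is not part of the cycle.
*/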
/** Takes a snapshot of the transactions currently waiting in slots, searches
for deadlocks among them, and resolves them. */
static void lock_wait_check_for_deadlocks() {
  /* Note: I was tempted to declare `infos` as `static`, or at least to
  declare it in lock_wait_timeout_thread() and reuse the same instance over
  and over again, to avoid the allocator calls caused by the push_back()
  calls inside lock_wait_snapshot_waiting_threads() while we hold
  lock_sys->lock_wait_mutex. I was afraid that the allocator might need to
  block on some internal mutex in order to synchronize with other threads
  using the allocator, and this could in turn cause contention on
  lock_wait_mutex.

  I hoped that, since vectors never shrink and only grow, keeping a single
  instance of `infos` alive for the whole lifetime of the thread should
  increase performance, because after some initial period of growing, the
  allocations would never have to occur again. But I've run many various
  experiments, with/without static, with infos declared outside, with
  reserve(n) using various values of n (128, srv_max_n_threads, even a
  simple ML predictor), and nothing, NOTHING, was faster than just using a
  local vector as we do here (at least on tetra01, tetra02, when comparing
  ~70 runs of each algorithm on uniform, pareto, 128 and 1024 usrs). This
  was counter-intuitive to me, so I've checked how malloc is implemented,
  and it turns out that modern malloc is a variant of ptmalloc2 and uses a
  large number of independent arenas (larger than the number of threads),
  and various smart heuristics to map threads to these arenas in a way which
  avoids blocking. So, before changing the way we handle allocation of
  memory in these lock_wait_* routines, please, PLEASE, first check
  empirically if it really helps, by running many tests and checking for
  statistical significance, as the variance seems to be large. (And the
  downside of using `static` is that it is not very obvious how our custom
  deallocator, which tries to update some stats, would behave during
  shutdown.)

  Anyway, in case some tests indicate that taking the snapshot is a
  bottleneck, one can check if declaring these vectors as static solves the
  issue. */
  if (innobase_deadlock_detect) {
    ut::vector<waiting_trx_info_t> infos;
    ut::vector<int> outgoing;

    lock_wait_snapshot_waiting_threads(infos);
    lock_wait_build_wait_for_graph(infos, outgoing);
    lock_wait_find_and_handle_deadlocks(infos, outgoing);
  }
}

/** A thread which wakes up threads whose lock wait may have lasted too long,
analyzes wait-for-graph changes, and checks for deadlocks and resolves them. */
void lock_wait_timeout_thread() {
  int64_t sig_count = 0;
  os_event_t event = lock_sys->timeout_event;

  ut_ad(!srv_read_only_mode);

  /* The last time we've checked for timeouts. */
  auto last_checked_for_timeouts_at = ut_time();
  do {
    auto now = ut_time();
    if (0.5 < ut_difftime(now, last_checked_for_timeouts_at)) {
      last_checked_for_timeouts_at = now;
      lock_wait_check_slots_for_timeouts();
    }

    lock_wait_check_for_deadlocks();

    /* When someone is waiting for a lock, we wake up every second (at worst)
    and check if a timeout has passed for a lock wait */
    os_event_wait_time_low(event, 1000000, sig_count);
    sig_count = os_event_reset(event);
  } while (srv_shutdown_state.load() == SRV_SHUTDOWN_NONE);
}