/*****************************************************************************

Copyright (c) 1996, 2019, Oracle and/or its affiliates. All Rights Reserved.

This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License, version 2.0, as published by the
Free Software Foundation.

This program is also distributed with certain software (including but not
limited to OpenSSL) that is licensed under separate terms, as designated in a
particular file or component or in included license documentation. The authors
of MySQL hereby grant you an additional permission to link the program and
your derivative works with the separately licensed software that they have
included with MySQL.

This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE. See the GNU General Public License, version 2.0,
for more details.

You should have received a copy of the GNU General Public License along with
this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA

*****************************************************************************/

/** @file lock/lock0wait.cc
 The transaction lock system

 Created 25/5/2010 Sunny Bains
 *******************************************************/

#define LOCK_MODULE_IMPLEMENTATION

#include <mysql/service_thd_wait.h>
#include <sys/types.h>

#include "ha_prototypes.h"
#include "lock0lock.h"
#include "lock0priv.h"
#include "os0thread-create.h"
#include "que0que.h"
#include "row0mysql.h"
#include "srv0mon.h"
#include "srv0start.h"

#include "my_dbug.h"

/** Print the contents of the lock_sys_t::waiting_threads array. */
static void lock_wait_table_print(void) {
  ut_ad(lock_wait_mutex_own());

  const srv_slot_t *slot = lock_sys->waiting_threads;

  for (ulint i = 0; i < srv_max_n_threads; i++, ++slot) {
    fprintf(stderr,
            "Slot %lu: thread type %lu,"
            " in use %lu, susp %lu, timeout %lu, time %lu\n",
            (ulong)i, (ulong)slot->type, (ulong)slot->in_use,
            (ulong)slot->suspended, slot->wait_timeout,
            (ulong)(ut_time_monotonic() - slot->suspend_time));
  }
}

/** Release a slot in the lock_sys_t::waiting_threads. Adjust the array last
 pointer if there are empty slots towards the end of the table. */
static void lock_wait_table_release_slot(
    srv_slot_t *slot) /*!< in: slot to release */
{
#ifdef UNIV_DEBUG
  srv_slot_t *upper = lock_sys->waiting_threads + srv_max_n_threads;
#endif /* UNIV_DEBUG */

  lock_wait_mutex_enter();
  /* We omit trx_mutex_enter and lock_mutex_enter here, because we are only
  going to touch thr->slot, which is a member used only by lock0wait.cc and
  is sufficiently protected by lock_wait_mutex. Yes, there are readers who
  read thr->slot holding only trx->mutex and lock_sys->mutex, but they do so
  only when they are sure that we were not woken up yet, so our thread can't
  be here. See comments in lock_wait_release_thread_if_suspended() for more
  details. */

  ut_ad(slot->in_use);
  ut_ad(slot->thr != NULL);
  ut_ad(slot->thr->slot != NULL);
  ut_ad(slot->thr->slot == slot);

  /* Must be within the array boundaries. */
  ut_ad(slot >= lock_sys->waiting_threads);
  ut_ad(slot < upper);

  slot->thr->slot = NULL;
  slot->thr = NULL;
  slot->in_use = FALSE;

  /* This operation is guarded by lock_wait_mutex_enter/exit and we don't
  care about its relative ordering with other operations in this critical
  section. */
  lock_sys->n_waiting.fetch_sub(1, std::memory_order_relaxed);

  /* Scan backwards and adjust the last free slot pointer. */
  for (slot = lock_sys->last_slot;
       slot > lock_sys->waiting_threads && !slot->in_use; --slot) {
    /* No op */
  }

  /* Either the array is empty or the last scanned slot is in use. */
  ut_ad(slot->in_use || slot == lock_sys->waiting_threads);

  lock_sys->last_slot = slot + 1;

  /* The last slot is either outside of the array boundary or it's on an
  empty slot. */
  ut_ad(lock_sys->last_slot == upper || !lock_sys->last_slot->in_use);

  ut_ad(lock_sys->last_slot >= lock_sys->waiting_threads);
  ut_ad(lock_sys->last_slot <= upper);

  lock_wait_mutex_exit();
}
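/* Illustrative sketch (not actual InnoDB code): a model of the invariant
which lock_wait_table_release_slot() above re-establishes. last_slot points
one past the last slot in use, so every slot at or after last_slot is free
and scans of the table may stop at last_slot instead of visiting all
srv_max_n_threads entries. The helper name is hypothetical; it only uses
fields referenced above, and is valid only under lock_wait_mutex.

  static bool lock_wait_table_last_slot_invariant_holds() {
    ut_ad(lock_wait_mutex_own());
    const srv_slot_t *upper = lock_sys->waiting_threads + srv_max_n_threads;
    // every slot at or past last_slot must be free...
    for (const srv_slot_t *s = lock_sys->last_slot; s < upper; ++s) {
      if (s->in_use) return false;
    }
    // ...and last_slot is the array start, or is preceded by a used slot
    return lock_sys->last_slot == lock_sys->waiting_threads ||
           lock_sys->last_slot[-1].in_use;
  }
*/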
/** Counts the number of calls to lock_wait_table_reserve_slot. It is
protected by lock_wait_mutex. The current value of this counter is stored in
the slot a transaction has chosen for sleeping during suspension, and thus
serves as a "reservation number" which can be used to check if the owner of
the slot has changed (perhaps multiple times, in an "ABA" manner). */
static uint64_t lock_wait_table_reservations = 0;

/** Reserves a slot in the thread table for the current user OS thread.
 @return reserved slot */
static srv_slot_t *lock_wait_table_reserve_slot(
    que_thr_t *thr,     /*!< in: query thread associated with the user OS
                        thread */
    ulong wait_timeout) /*!< in: lock wait timeout value */
{
  srv_slot_t *slot;

  ut_ad(lock_wait_mutex_own());
  ut_ad(trx_mutex_own(thr_get_trx(thr)));

  slot = lock_sys->waiting_threads;

  for (ulint i = srv_max_n_threads; i--; ++slot) {
    if (!slot->in_use) {
      slot->reservation_no = lock_wait_table_reservations++;
      slot->in_use = TRUE;
      slot->thr = thr;
      slot->thr->slot = slot;

      if (slot->event == NULL) {
        slot->event = os_event_create(0);
        ut_a(slot->event);
      }

      os_event_reset(slot->event);
      slot->suspended = TRUE;
      slot->suspend_time = ut_time_monotonic();
      slot->wait_timeout = wait_timeout;

      if (slot == lock_sys->last_slot) {
        ++lock_sys->last_slot;
      }

      ut_ad(lock_sys->last_slot <=
            lock_sys->waiting_threads + srv_max_n_threads);

      /* We call lock_wait_request_check_for_cycles() because the node
      representing the `thr` only now becomes visible to the thread which
      analyzes the contents of lock_sys->waiting_threads. The edge itself was
      created by lock_create_wait_for_edge() during RecLock::add_to_waitq()
      or lock_table(), but at that moment the source of the edge was not yet
      in lock_sys->waiting_threads, so the node and the outgoing edge were
      not yet visible. I hope this explains why we do not waste time on
      calling lock_wait_request_check_for_cycles() from
      lock_create_wait_for_edge(). */
      lock_wait_request_check_for_cycles();

      return (slot);
    }
  }

  ib::error(ER_IB_MSG_646)
      << "There appear to be " << srv_max_n_threads
      << " user"
         " threads currently waiting inside InnoDB, which is the upper"
         " limit. Cannot continue operation. Before aborting, we print"
         " a list of waiting threads.";
  lock_wait_table_print();

  ut_error;
}

void lock_wait_request_check_for_cycles() { lock_set_timeout_event(); }
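/* Illustrative sketch (not actual InnoDB code): how the reservation_no set
above is meant to guard readers of the slot table against "ABA" reuse. A
plain `slot->thr == thr` check could pass even if the slot was released and
re-reserved in between; comparing the reservation number recorded at snapshot
time with the current one detects any such turnover, because
lock_wait_table_reservations increases monotonically under lock_wait_mutex.
This is the check performed later by lock_wait_trxs_are_still_in_slots();
the helper name here is hypothetical.

  static bool lock_wait_slot_unchanged_since(const srv_slot_t *slot,
                                             uint64_t seen_reservation_no) {
    ut_ad(lock_wait_mutex_own());
    // if the slot was freed, or freed and reused, reservation_no differs
    return slot->in_use && slot->reservation_no == seen_reservation_no;
  }
*/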
/** Puts a user OS thread to wait for a lock to be released. If an error
 occurs during the wait, trx->error_state associated with thr is != DB_SUCCESS
 when we return. DB_LOCK_WAIT_TIMEOUT and DB_DEADLOCK are possible errors.
 DB_DEADLOCK is returned if selective deadlock resolution chose this
 transaction as a victim. */
void lock_wait_suspend_thread(que_thr_t *thr) /*!< in: query thread associated
                                              with the user OS thread */
{
  srv_slot_t *slot;
  trx_t *trx;
  ibool was_declared_inside_innodb;
  ib_time_monotonic_ms_t start_time = 0;
  ulong lock_wait_timeout;

  trx = thr_get_trx(thr);

  if (trx->mysql_thd != 0) {
    DEBUG_SYNC_C("lock_wait_suspend_thread_enter");
  }

  /* InnoDB system transactions (such as the purge, and incomplete
  transactions that are being rolled back after crash recovery) will use the
  global value of innodb_lock_wait_timeout, because trx->mysql_thd == NULL. */
  lock_wait_timeout = trx_lock_wait_timeout_get(trx);

  lock_wait_mutex_enter();

  trx_mutex_enter(trx);

  trx->error_state = DB_SUCCESS;

  if (thr->state == QUE_THR_RUNNING) {
    ut_ad(thr->is_active);

    /* The lock has already been released or this transaction was chosen as
    a deadlock victim: no need to suspend */

    if (trx->lock.was_chosen_as_deadlock_victim) {
      trx->error_state = DB_DEADLOCK;
      trx->lock.was_chosen_as_deadlock_victim = false;
      ut_d(trx->lock.in_rollback = true);
    }

    lock_wait_mutex_exit();
    trx_mutex_exit(trx);
    return;
  }

  ut_ad(!thr->is_active);

  slot = lock_wait_table_reserve_slot(thr, lock_wait_timeout);

  if (thr->lock_state == QUE_THR_LOCK_ROW) {
    srv_stats.n_lock_wait_count.inc();
    srv_stats.n_lock_wait_current_count.inc();

    start_time = ut_time_monotonic_us();
  }

  /* This operation is guarded by lock_wait_mutex_enter/exit and we don't
  care about its relative ordering with other operations in this critical
  section. */
  lock_sys->n_waiting.fetch_add(1, std::memory_order_relaxed);

  lock_wait_mutex_exit();

  /* We hold trx->mutex here, which is required to call
  lock_set_lock_and_trx_wait. This means that the value in
  trx->lock.wait_lock_type which we are about to read comes from the latest
  call to lock_set_lock_and_trx_wait before we obtained the trx->mutex, which
  is precisely what we want for our stats */
  auto lock_type = trx->lock.wait_lock_type;
  trx_mutex_exit(trx);

  ulint had_dict_lock = trx->dict_operation_lock_mode;

  switch (had_dict_lock) {
    case 0:
      break;
    case RW_S_LATCH:
      /* Release foreign key check latch */
      row_mysql_unfreeze_data_dictionary(trx);

      DEBUG_SYNC_C("lock_wait_release_s_latch_before_sleep");
      break;
    case RW_X_LATCH:
      /* We may wait for rec lock in dd holding dict_operation_lock for
      creating FTS AUX table */
      ut_ad(!mutex_own(&dict_sys->mutex));
      rw_lock_x_unlock(dict_operation_lock);
      break;
  }

  /* Suspend this thread and wait for the event. */

  was_declared_inside_innodb = trx->declared_to_be_inside_innodb;

  if (was_declared_inside_innodb) {
    /* We must declare this OS thread to exit InnoDB, since a possible other
    thread holding a lock which this thread waits for must be allowed to
    enter, sooner or later */

    srv_conc_force_exit_innodb(trx);
  }

  ut_a(lock_type == LOCK_REC || lock_type == LOCK_TABLE);
  thd_wait_begin(trx->mysql_thd, lock_type == LOCK_REC ? THD_WAIT_ROW_LOCK
                                                       : THD_WAIT_TABLE_LOCK);

  DEBUG_SYNC_C("lock_wait_will_wait");

  os_event_wait(slot->event);

  DEBUG_SYNC_C("lock_wait_has_finished_waiting");

  thd_wait_end(trx->mysql_thd);

  /* After resuming, reacquire the data dictionary latch if necessary. */

  if (was_declared_inside_innodb) {
    /* Return back inside InnoDB */

    srv_conc_force_enter_innodb(trx);
  }

  if (had_dict_lock == RW_S_LATCH) {
    row_mysql_freeze_data_dictionary(trx);
  } else if (had_dict_lock == RW_X_LATCH) {
    rw_lock_x_lock(dict_operation_lock);
  }

  const auto wait_time = ut_time_monotonic() - slot->suspend_time;

  /* Release the slot for others to use */

  lock_wait_table_release_slot(slot);

  if (thr->lock_state == QUE_THR_LOCK_ROW) {
    const auto finish_time = ut_time_monotonic_us();

    const uint64_t diff_time =
        (finish_time > start_time) ? (uint64_t)(finish_time - start_time) : 0;

    srv_stats.n_lock_wait_current_count.dec();
    srv_stats.n_lock_wait_time.add(diff_time);

    /* Only update the variable if we successfully retrieved the start and
    finish times. See Bug#36819. */
    if (diff_time > lock_sys->n_lock_max_wait_time && start_time != 0) {
      lock_sys->n_lock_max_wait_time = diff_time;
    }

    /* Record the lock wait time for this thread */
    thd_set_lock_wait_time(trx->mysql_thd, diff_time);

    DBUG_EXECUTE_IF("lock_instrument_slow_query_log", os_thread_sleep(1000););
  }

  /* The transaction was chosen as a deadlock victim while it slept. */
  if (trx->error_state == DB_DEADLOCK) {
    ut_d(trx->lock.in_rollback = true);
    return;
  }

  if (lock_wait_timeout < 100000000 && wait_time > (double)lock_wait_timeout &&
      !trx_is_high_priority(trx)) {
    trx->error_state = DB_LOCK_WAIT_TIMEOUT;

    MONITOR_INC(MONITOR_TIMEOUT);
  }

  if (trx_is_interrupted(trx)) {
    trx->error_state = DB_INTERRUPTED;
  }
}
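/* Illustrative sketch (not actual InnoDB code): a condensed model of the
sleep/wake handshake between lock_wait_suspend_thread() above and
lock_wait_release_thread_if_suspended() below. The waiter re-checks
thr->state under trx->mutex before committing to sleep, and the waker
publishes QUE_THR_RUNNING (via que_thr_end_lock_wait()) and checks thr->slot
under the same mutex, so exactly one side observes the other's write and no
wakeup can be lost:

    waiter (suspend)                       waker (release)
    ----------------                       ---------------
    trx_mutex_enter(trx);                  trx_mutex_enter(trx);
    if (thr->state == QUE_THR_RUNNING)     thr->state = QUE_THR_RUNNING;
      return;  // already granted          if (thr->slot != NULL)
    reserve slot; thr->slot = slot;          os_event_set(thr->slot->event);
    trx_mutex_exit(trx);                   trx_mutex_exit(trx);
    os_event_wait(slot->event);
*/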
/** Releases a user OS thread waiting for a lock to be released, if the
thread is already suspended. Please do not call it directly, but rather use
the lock_reset_wait_and_release_thread_if_suspended() wrapper.
@param[in]  thr  query thread associated with the user OS thread */
static void lock_wait_release_thread_if_suspended(que_thr_t *thr) {
  auto trx = thr_get_trx(thr);
  /* We need a guarantee that for each time a thread is suspended there is at
  most one time it gets released - or more precisely: that there is at most
  one reason for it to be woken up. Otherwise it could happen that two
  different threads will think that they successfully woke up the
  transaction, and that the transaction understands that the reason it was
  woken up is the one they had in mind, say: one thread woke it up because of
  a deadlock and another because of a timeout. If the two reasons require
  different behavior after waking up, then we will be in trouble. The current
  implementation makes sure that we wake up a thread only once by observing
  several rules:
  1. the only way to wake up a trx is to call os_event_set
  2. the only call to os_event_set is in lock_wait_release_thread_if_suspended
  3. calls to lock_wait_release_thread_if_suspended are always performed
     after a call to lock_reset_lock_and_trx_wait(lock), and the sequence of
     the two is in a critical section guarded by lock_mutex_enter
  4. lock_reset_lock_and_trx_wait(lock) asserts that
     lock->trx->lock.wait_lock == lock and sets
     lock->trx->lock.wait_lock = NULL
  Together all these facts imply that it is impossible for a single trx to be
  woken up twice (unless it got to sleep again), because doing so requires
  resetting wait_lock to NULL.
  We now hold exclusive lock_sys latch. */
  ut_ad(lock_mutex_own());
  ut_ad(trx_mutex_own(trx));

  /* We don't need the lock_wait_mutex here, because we know that the thread
  had a reason to go to sleep (we have seen trx->lock.wait_lock != NULL), and
  we know that we are the first ones to wake it up (we are the thread which
  has changed trx->lock.wait_lock to NULL), so it either sleeps, or has not
  yet started the sleep. We hold the trx->mutex which is required to go to
  sleep. So, while holding the trx->mutex, we can check if thr->slot is
  already assigned, and if so, then we need to wake up the thread. If
  thr->slot is not yet assigned, then we know that the thread has not yet
  gone to sleep, and we know that before doing so it will need to acquire
  trx->mutex and will verify once more if it has to go to sleep, by checking
  if thr->state is QUE_THR_RUNNING, which we have indeed already set before
  calling lock_wait_release_thread_if_suspended - so we don't need to do
  anything in this case: the trx will simply not go to sleep. */

  ut_ad(thr->state == QUE_THR_RUNNING);
  ut_ad(trx->lock.wait_lock == NULL);

  if (thr->slot != NULL && thr->slot->in_use && thr->slot->thr == thr) {
    if (trx->lock.was_chosen_as_deadlock_victim) {
      trx->error_state = DB_DEADLOCK;
      trx->lock.was_chosen_as_deadlock_victim = false;
      ut_d(trx->lock.in_rollback = true);
    }

    os_event_set(thr->slot->event);
  }
}
void lock_reset_wait_and_release_thread_if_suspended(lock_t *lock) {
  ut_ad(lock_mutex_own());
  ut_ad(trx_mutex_own(lock->trx));
  ut_ad(lock->trx->lock.wait_lock == lock);

  /* Reset the wait flag and the back pointer to lock in trx */

  lock_reset_lock_and_trx_wait(lock);

  /* We clear blocking_trx here and not in lock_reset_lock_and_trx_wait(), as
  lock_reset_lock_and_trx_wait() is called also when the wait_lock is being
  moved from one page to another during B-tree reorganization, in which case
  blocking_trx should not change - in such cases a new wait lock is created
  and assigned to trx->lock.wait_lock, but the information about the blocking
  trx is not so easy to restore, so it is easier to simply not clear
  blocking_trx until we are 100% sure that we want to wake up the trx, which
  is now. Actually, clearing blocking_trx is not strictly required from a
  correctness perspective; it rather serves for:
  1. performance optimization, as lock_wait_snapshot_waiting_threads() can
     omit this trx when building the wait-for graph
  2. debugging, as resetting blocking_trx makes it easier to spot that it was
     not properly set on subsequent waits. */
  lock->trx->lock.blocking_trx.store(nullptr);

  /* We only release locks for which someone is waiting, and the trx which
  decided to wait for the lock should have already set trx->lock.que_state to
  TRX_QUE_LOCK_WAIT and called que_thr_stop() before releasing the lock-sys
  latch. */
  ut_ad(lock->trx_que_state() == TRX_QUE_LOCK_WAIT);

  /* The following function releases the trx from lock wait */

  que_thr_t *thr = que_thr_end_lock_wait(lock->trx);

  if (thr != nullptr) {
    lock_wait_release_thread_if_suspended(thr);
  }
}
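/* Illustrative sketch (not actual InnoDB code): the call order which the two
functions above rely on. Resetting trx->lock.wait_lock to NULL under the
exclusive lock_sys latch is what makes the subsequent wakeup unique: a second
would-be waker cannot get past the wait_lock != NULL precondition again. A
caller granting a lock is expected to follow this schematic pattern (the
surrounding steps are simplified):

    lock_mutex_enter();
    trx_mutex_enter(waiting_trx);
    // 1. wait_lock != NULL proves the trx still waits and nobody woke it yet
    if (waiting_trx->lock.wait_lock != NULL) {
      // 2. resets wait_lock to NULL and, if suspended, sets the slot event
      lock_reset_wait_and_release_thread_if_suspended(
          waiting_trx->lock.wait_lock);
    }
    trx_mutex_exit(waiting_trx);
    lock_mutex_exit();
*/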
/** Check if the thread lock wait has timed out. Release its locks if the
 wait has actually timed out. */
static void lock_wait_check_and_cancel(
    const srv_slot_t *slot) /*!< in: slot reserved by a user thread when
                            the wait started */
{
  trx_t *trx;

  const auto suspend_time = slot->suspend_time;

  ut_ad(lock_wait_mutex_own());

  ut_ad(slot->in_use);

  ut_ad(slot->suspended);

  const auto wait_time = ut_time_monotonic() - suspend_time;

  trx = thr_get_trx(slot->thr);

  if (trx_is_interrupted(trx) ||
      (slot->wait_timeout < 100000000 &&
       (wait_time > (int64_t)slot->wait_timeout || wait_time < 0))) {
    /* Timeout exceeded or a wrap-around in the system time counter: cancel
    the lock request queued by the transaction and release possible other
    transactions waiting behind; it is possible that the lock has already
    been granted: in that case do nothing */

    lock_mutex_enter();

    trx_mutex_enter(trx);

    if (trx->lock.wait_lock != NULL && !trx_is_high_priority(trx)) {
      ut_a(trx->lock.que_state == TRX_QUE_LOCK_WAIT);

      lock_cancel_waiting_and_release(trx->lock.wait_lock, false);
    }

    lock_mutex_exit();

    trx_mutex_exit(trx);
  }
}

/** A snapshot of information about a single slot which was in use at the
moment of taking the snapshot */
struct waiting_trx_info_t {
  /** The transaction which was using this slot. */
  trx_t *trx;
  /** The transaction the owner of the slot is waiting for. */
  trx_t *waits_for;
  /** The slot this info is about */
  srv_slot_t *slot;
  /** The slot->reservation_no at the moment of taking the snapshot */
  uint64_t reservation_no;
};

/** As we want to quickly find a given trx_t within the snapshot, we use a
sorting criterion which is based on trx only. We use the pointer address, as
any deterministic rule without ties will do. */
bool operator<(const waiting_trx_info_t &a, const waiting_trx_info_t &b) {
  return a.trx < b.trx;
}

/** Check all slots for user threads that are waiting on locks, and whether
 they have exceeded the time limit. */
static void lock_wait_check_slots_for_timeouts() {
  ut_ad(!lock_wait_mutex_own());

  lock_wait_mutex_enter();

  for (auto slot = lock_sys->waiting_threads; slot < lock_sys->last_slot;
       ++slot) {
    /* We are doing a read without the lock mutex and/or the trx mutex. This
    is OK because a slot can't be freed or reserved without the lock wait
    mutex. */
    if (slot->in_use) {
      lock_wait_check_and_cancel(slot);
    }
  }

  lock_wait_mutex_exit();
}
/** Takes a snapshot of the content of slots which are in use
@param[out]  infos  will contain the information about slots which are in use
*/
static void lock_wait_snapshot_waiting_threads(
    ut::vector<waiting_trx_info_t> &infos) {
  ut_ad(!lock_wait_mutex_own());
  infos.clear();
  lock_wait_mutex_enter();
  /* We own lock_wait_mutex, which protects lock_wait_table_reservations and
  reservation_no. We want to make a snapshot of the wait-for graph as quickly
  as possible, to not keep the lock_wait_mutex too long. Anything more fancy
  than push_back seems to impact performance.

  Note: one should be able to prove that we don't really need a "consistent"
  snapshot - the algorithm should still work if we split the loop into
  several smaller "chunks" snapshotted independently and stitched together.
  Care must be taken to "merge" duplicates, keeping the freshest version
  (reservation_no) of the slot for each trx. So, if (in the future) this loop
  turns out to be a bottleneck (say, by increasing congestion on
  lock_wait_mutex), one can try to release and reacquire the lock every X
  iterations, and modify lock_wait_build_wait_for_graph() to handle
  duplicates in a smart way. */
  for (auto slot = lock_sys->waiting_threads; slot < lock_sys->last_slot;
       ++slot) {
    if (slot->in_use) {
      auto from = thr_get_trx(slot->thr);
      auto to = from->lock.blocking_trx.load();
      if (to != nullptr) {
        infos.push_back({from, to, slot, slot->reservation_no});
      }
    }
  }
  lock_wait_mutex_exit();
}

/** Analyzes the content of the snapshot with information about slots in use,
and builds a (subset of) list of edges from waiting transactions to blocking
transactions, such that for each waiter we have one outgoing edge.
@param[in]   infos     information about all waiting transactions
@param[out]  outgoing  The outgoing[from] will contain either the index such
                       that infos[outgoing[from]].trx is the reason
                       infos[from].trx has to wait, or -1 if the reason for
                       waiting is not among transactions in infos[].trx */
static void lock_wait_build_wait_for_graph(
    ut::vector<waiting_trx_info_t> &infos, ut::vector<int> &outgoing) {
  /* We are going to use int and uint to store positions within infos */
  ut_ad(infos.size() < (1 << 20));
  const auto n = static_cast<uint>(infos.size());
  outgoing.clear();
  outgoing.resize(n, -1);
  /* This particular implementation sorts infos by ::trx, and then uses
  lower_bound to find the index in infos corresponding to ::waits_for, which
  has O(n lg n) complexity and modifies infos, but has a nice property of
  avoiding any allocations.

  An alternative O(n) approach would be to use a hash table to map
  infos[i].trx to i. Using unordered_map however causes too many
  (de)allocations, as its bucket chains are implemented as linked lists -
  overall it works much slower than sort. The fastest implementation was a
  custom hash table with open addressing and double hashing, with a
  statically allocated 2 * srv_max_n_threads buckets. This however did not
  increase transactions per second, so introducing a custom implementation
  seems unjustified here. */
  std::sort(infos.begin(), infos.end());
  waiting_trx_info_t needle{};
  for (uint from = 0; from < n; ++from) {
    ut_ad(from == 0 || infos[from - 1].trx < infos[from].trx);
    needle.trx = infos[from].waits_for;
    auto it = std::lower_bound(infos.begin(), infos.end(), needle);
    if (it == infos.end() || it->trx != needle.trx) {
      continue;
    }
    auto to = it - infos.begin();
    ut_ad(from != static_cast<uint>(to));
    outgoing[from] = static_cast<int>(to);
  }
}
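/* Illustrative sketch (not actual InnoDB code): a self-contained toy version
of the sort + lower_bound technique used by lock_wait_build_wait_for_graph()
above, with ints standing in for trx_t pointers. Sorting once and then
binary-searching each `waits_for` value resolves every edge in O(n lg n)
with no extra allocation:

  #include <algorithm>
  #include <vector>
  struct Edge { int trx; int waits_for; };
  inline bool operator<(const Edge &a, const Edge &b) { return a.trx < b.trx; }

  inline std::vector<int> build_outgoing(std::vector<Edge> edges) {
    std::sort(edges.begin(), edges.end());
    std::vector<int> outgoing(edges.size(), -1);
    for (size_t from = 0; from < edges.size(); ++from) {
      const Edge needle{edges[from].waits_for, 0};
      auto it = std::lower_bound(edges.begin(), edges.end(), needle);
      if (it != edges.end() && it->trx == needle.trx) {
        outgoing[from] = static_cast<int>(it - edges.begin());
      }
    }
    return outgoing;
  }
  // e.g. edges {1->2, 2->3, 3->1} yield outgoing {1, 2, 0}: a cycle
*/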
/** Notifies the chosen_victim that it should roll back
@param[in,out]  chosen_victim  the transaction that should be rolled back */
static void lock_wait_rollback_deadlock_victim(trx_t *chosen_victim) {
  ut_ad(!trx_mutex_own(chosen_victim));
  /* The call to lock_cancel_waiting_and_release requires an exclusive latch
  on the whole lock_sys in case of table locks. */
  ut_ad(lock_mutex_own());
  trx_mutex_enter(chosen_victim);

  chosen_victim->lock.was_chosen_as_deadlock_victim = true;

  ut_a(chosen_victim->lock.wait_lock);
  ut_a(chosen_victim->lock.que_state == TRX_QUE_LOCK_WAIT);

  lock_cancel_waiting_and_release(chosen_victim->lock.wait_lock, true);

  trx_mutex_exit(chosen_victim);
}

/** Given the `infos` about transactions and indexes in `infos` which form a
deadlock cycle, identifies the transaction with the largest `reservation_no`,
that is the one which was the latest to join the cycle.
@param[in]  cycle_ids  indexes in the `infos` array, of transactions forming
                       the deadlock cycle
@param[in]  infos      information about all waiting transactions
@return position in cycle_ids, such that
infos[cycle_ids[pos]].reservation_no is the largest */
static size_t lock_wait_find_latest_pos_on_cycle(
    const ut::vector<uint> &cycle_ids,
    const ut::vector<waiting_trx_info_t> &infos) {
  size_t latest_pos = 0;
  for (size_t pos = 1; pos < cycle_ids.size(); ++pos) {
    if (infos[cycle_ids[latest_pos]].reservation_no <
        infos[cycle_ids[pos]].reservation_no) {
      latest_pos = pos;
    }
  }
  return latest_pos;
}

/** Rotates the deadlock cycle so that it starts from the desired item.
@param[in]  first_pos  the position which should become first after rotation
@param[in]  cycle_ids  the array to rotate
@return a copy of cycle_ids rotated in such a way, that the element which was
at position `first_pos` is now the first */
static ut::vector<uint> lock_wait_rotate_so_pos_is_first(
    size_t first_pos, const ut::vector<uint> &cycle_ids) {
  auto rotated_ids = cycle_ids;
  std::rotate(rotated_ids.begin(), rotated_ids.begin() + first_pos,
              rotated_ids.end());
  return rotated_ids;
}

/** A helper which extracts transactions with given indexes from the `infos`
array.
@param[in]  ids    indexes of transactions in the `infos` array to extract
@param[in]  infos  information about all waiting transactions
@return an array formed by infos[ids[i]].trx */
template <typename T>
static ut::vector<trx_t *> lock_wait_map_ids_to_trxs(
    const ut::vector<T> &ids, const ut::vector<waiting_trx_info_t> &infos) {
  ut::vector<trx_t *> trxs;
  trxs.reserve(ids.size());
  for (auto id : ids) {
    trxs.push_back(infos[id].trx);
  }
  return trxs;
}

/** Orders the transactions from the deadlock cycle in a backward-compatible
way, from the point of view of the algorithm which picks the victim. From a
correctness point of view this could be a no-op, but we have test cases which
assume some determinism in which trx is selected as the victim. These tests
usually depend on the old behaviour, in which the order in which trxs
attempted to wait was meaningful. In the past we have only considered two
candidates for victim:
(a) the transaction which closed the cycle by adding the last wait-for edge
(b) the transaction which is waiting for (a)
and it was favored to pick (a) in case of ties. To make these tests pass, we
find the trx with the most recent reservation_no (a), and the one before it
in the cycle (b), and move them to the end of the collection, so that we
consider them in the order ...,...,...,(b),(a), resolving ties by picking the
latest one as the victim. In other words, we rotate the cycle so that the
most recent waiter becomes the last one.
@param[in]  cycle_ids  indexes in the `infos` array, of transactions forming
                       the deadlock cycle
@param[in]  infos      information about all waiting transactions
@return transactions from the deadlock cycle sorted in the optimal way for
choosing a victim */
static ut::vector<trx_t *> lock_wait_order_for_choosing_victim(
    const ut::vector<uint> &cycle_ids,
    const ut::vector<waiting_trx_info_t> &infos) {
  size_t latest_pos = lock_wait_find_latest_pos_on_cycle(cycle_ids, infos);
  size_t first_pos = (latest_pos + 1) % cycle_ids.size();
  return lock_wait_map_ids_to_trxs(
      lock_wait_rotate_so_pos_is_first(first_pos, cycle_ids), infos);
}
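/* Illustrative sketch (not actual InnoDB code): a small worked example of
the rotation performed by lock_wait_order_for_choosing_victim() above.
Suppose the cycle visits infos entries {3, 1, 2} in wait-for order, and entry
3 was the latest to join (largest reservation_no), so (a) = 3 and (b) = the
entry waiting for it, 2:

  #include <algorithm>
  #include <vector>
  std::vector<unsigned> ids{3, 1, 2};        // latest_pos == 0
  size_t first_pos = (0 + 1) % ids.size();   // == 1
  std::rotate(ids.begin(), ids.begin() + first_pos, ids.end());
  // ids == {1, 2, 3}: (b) = 2 and (a) = 3 are now the last two entries,
  // matching the ..., (b), (a) order described above, so ties are broken
  // in favor of the most recent waiter.
*/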
/** Given an array with information about all waiting transactions and
indexes in it which form a deadlock cycle, picks the transaction to roll
back.
@param[in]  cycle_ids  indexes in the `infos` array, of transactions forming
                       the deadlock cycle
@param[in]  infos      information about all waiting transactions
@return the transaction chosen as a victim */
static trx_t *lock_wait_choose_victim(
    const ut::vector<uint> &cycle_ids,
    const ut::vector<waiting_trx_info_t> &infos) {
  /* We are iterating over various transactions, comparing their
  trx_weight_ge, which is computed based on the number of locks held, thus we
  need an exclusive latch on the whole lock_sys. In theory the number of
  locks should not change while the transaction is waiting, but instead of
  proving that they cannot wake up, it is easier to assert that we hold the
  mutex */
  ut_ad(lock_mutex_own());
  ut_ad(!cycle_ids.empty());
  trx_t *chosen_victim = nullptr;
  auto sorted_trxs = lock_wait_order_for_choosing_victim(cycle_ids, infos);

  for (auto *trx : sorted_trxs) {
    if (chosen_victim == nullptr) {
      chosen_victim = trx;
      continue;
    }

    if (trx_is_high_priority(chosen_victim) || trx_is_high_priority(trx)) {
      auto victim = trx_arbitrate(trx, chosen_victim);

      if (victim != nullptr) {
        if (victim == trx) {
          chosen_victim = trx;
        } else {
          ut_a(victim == chosen_victim);
        }

        continue;
      }
    }

    if (trx_weight_ge(chosen_victim, trx)) {
      /* The joining transaction is 'smaller', choose it as the victim and
      roll it back. */

      chosen_victim = trx;
    }
  }

  ut_a(chosen_victim);
  return chosen_victim;
}
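/* Illustrative sketch (not actual InnoDB code): a toy model of the tie
breaking performed by the trx_weight_ge() fold in lock_wait_choose_victim()
above. Because the current choice is replaced whenever it weighs at least as
much as the next candidate, the scan keeps the lightest element, and among
equally light elements the one seen last wins - which is why the cycle was
rotated so that (b),(a) come last.

  #include <vector>
  // returns the index of the victim, mimicking the fold over sorted_trxs
  inline size_t choose_victim_index(const std::vector<int> &weights) {
    size_t chosen = 0;
    for (size_t i = 1; i < weights.size(); ++i) {
      if (weights[chosen] >= weights[i]) {  // trx_weight_ge(chosen, trx)
        chosen = i;
      }
    }
    return chosen;
  }
  // choose_victim_index({3, 1, 2, 1}) == 3: the lightest weight is 1, and
  // the later of the two candidates with weight 1 is picked.
*/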
/** Given an array with information about all waiting transactions and
indexes in it which form a deadlock cycle, checks if the transactions
allegedly forming the deadlock have actually stayed in their slots since
we've last checked, as opposed to, say, leaving the slot and/or re-entering
it (an "ABA" situation). This is done by comparing the current
reservation_no for each slot with the reservation_no from the `infos`
snapshot.
@param[in]  cycle_ids  indexes in the `infos` array, of transactions forming
                       the deadlock cycle
@param[in]  infos      information about all waiting transactions
@return true if all given transactions resided in their slots for the whole
time since the snapshot was taken, and in particular are still there right
now */
static bool lock_wait_trxs_are_still_in_slots(
    const ut::vector<uint> &cycle_ids,
    const ut::vector<waiting_trx_info_t> &infos) {
  ut_ad(lock_wait_mutex_own());
  for (auto id : cycle_ids) {
    const auto slot = infos[id].slot;
    if (!slot->in_use || slot->reservation_no != infos[id].reservation_no) {
      return false;
    }
    ut_ad(thr_get_trx(slot->thr) == infos[id].trx);
  }
  return true;
}

/** Given an array with information about all waiting transactions and
indexes in it which form a deadlock cycle, checks if the transactions
allegedly forming the deadlock are actually still waiting for a lock, as
opposed to having already been notified about the lock being granted or a
timeout, while still being present in the slot. This is done by checking
trx->lock.wait_lock under the lock_sys mutex.
@param[in]  cycle_ids  indexes in the `infos` array, of transactions forming
                       the deadlock cycle
@param[in]  infos      information about all waiting transactions
@return true if all given transactions are still waiting for locks */
static bool lock_wait_trxs_are_still_waiting(
    const ut::vector<uint> &cycle_ids,
    const ut::vector<waiting_trx_info_t> &infos) {
  ut_ad(lock_wait_mutex_own());
  /* We are iterating over various transactions which may have locks in
  different tables/rows, thus we need an exclusive latch on the whole
  lock_sys to make sure no one will wake them up (say, a high priority trx
  could abort them) */
  ut_ad(lock_mutex_own());
  for (auto id : cycle_ids) {
    const auto trx = infos[id].trx;
    if (trx->lock.wait_lock == nullptr) {
      /* trx is on its way to being woken up, so this cycle is a false
      positive. As this particular cycle will resolve itself, we ignore it. */
      return false;
    }
    ut_a(trx->lock.que_state == TRX_QUE_LOCK_WAIT);
  }
  return true;
}

/** A helper function which rotates the deadlock cycle, so that the order of
transactions in it is suitable for notification. From a correctness
perspective this could be a no-op, but we have test cases which assume some
determinism in the order in which trxs from the cycle are reported. These
tests usually depend on the old behaviour, in which the order in which
transactions attempted to wait was meaningful. In the past we reported:
- the transaction which closed the cycle by adding the last wait-for edge
  as (2)
- the transaction which is waiting for (2) as (1)
To make these tests pass, we find the trx with the most recent reservation_no
(2), and the one before it in the cycle (1), and move (1) to the beginning.
@param[in]  cycle_ids  indexes in the `infos` array, of transactions forming
                       the deadlock cycle
@param[in]  infos      information about all waiting transactions
@return the indexes from cycle_ids rotated in a backward-compatible way */
static ut::vector<uint> lock_wait_rotate_cycle_ids_for_notification(
    const ut::vector<uint> &cycle_ids,
    const ut::vector<waiting_trx_info_t> &infos) {
  size_t latest_pos = lock_wait_find_latest_pos_on_cycle(cycle_ids, infos);
  size_t previous_pos = (latest_pos - 1 + cycle_ids.size()) % cycle_ids.size();
  return lock_wait_rotate_so_pos_is_first(previous_pos, cycle_ids);
}

/** A helper function which rotates the deadlock cycle and maps it to
transactions, so that their order is suitable for notification. From a
correctness perspective this could be a no-op, but we have tests which depend
on deterministic output from such notifications, and we want to be backward
compatible.
@param[in]  cycle_ids  indexes in the `infos` array, of transactions forming
                       the deadlock cycle
@param[in]  infos      information about all waiting transactions
@return the transactions from cycle_ids rotated in a backward-compatible way
*/
static ut::vector<trx_t *> lock_wait_trxs_rotated_for_notification(
    const ut::vector<uint> &cycle_ids,
    const ut::vector<waiting_trx_info_t> &infos) {
  return lock_wait_map_ids_to_trxs(
      lock_wait_rotate_cycle_ids_for_notification(cycle_ids, infos), infos);
}
/** Handles a deadlock found, by notifying about it, and rolling back the
chosen victim.
@param[in,out]  chosen_victim  the transaction to roll back
@param[in]      cycle_ids      indexes in the `infos` array, of transactions
                               forming the deadlock cycle
@param[in]      infos          information about all waiting transactions */
static void lock_wait_handle_deadlock(
    trx_t *chosen_victim, const ut::vector<uint> &cycle_ids,
    const ut::vector<waiting_trx_info_t> &infos) {
  lock_notify_about_deadlock(
      lock_wait_trxs_rotated_for_notification(cycle_ids, infos),
      chosen_victim);

  lock_wait_rollback_deadlock_victim(chosen_victim);
}

/** Given an array with information about all waiting transactions and
indexes in it which form a deadlock cycle, checks if the transactions
allegedly forming the deadlock cycle indeed are still waiting, and if so,
chooses a victim and handles the deadlock.
@param[in]  cycle_ids  indexes in the `infos` array, of transactions forming
                       the deadlock cycle
@param[in]  infos      information about all waiting transactions
@return true if the cycle found was indeed a deadlock cycle, false if it was
a false positive */
static bool lock_wait_check_candidate_cycle(
    ut::vector<uint> &cycle_ids, const ut::vector<waiting_trx_info_t> &infos) {
  ut_ad(!lock_wait_mutex_own());
  ut_ad(!lock_mutex_own());
  lock_wait_mutex_enter();
  /* We have released all mutexes after we have built the `infos` snapshot
  and before we've got here. So, while it is true that the edges formed a
  cycle, it may also be true that some of these transactions were already
  rolled back, and the memory pointed to by infos[i].trx or
  infos[i].waits_for is no longer the trx it used to be (as we reuse trx_t
  objects). It may even segfault if we try to access it (because the trx_t
  object could be freed). So we need to somehow verify that the pointer is
  still valid without accessing it. We do that by checking if
  slot->reservation_no has changed since taking the snapshot. If it has not
  changed, then we know that the trx pointer still points to the same trx,
  as the trx is sleeping, and thus has not finished and wasn't freed.

  So, we start by first checking that the slots still contain the trxs we
  are interested in. This requires the lock_wait_mutex, but not the
  lock_mutex. */
  if (!lock_wait_trxs_are_still_in_slots(cycle_ids, infos)) {
    lock_wait_mutex_exit();
    return false;
  }
  /* At this point we are sure that we can access the memory pointed to by
  infos[i].trx, and that the transactions are still in their slots. (And, as
  `cycle_ids` is a cycle, we also know that infos[cycle_ids[i]].waits_for is
  equal to infos[cycle_ids[i+1]].trx, so infos[cycle_ids[i]].waits_for can
  also be safely accessed.)

  This however does not necessarily mean that they are still waiting. They
  might have already been notified that they should wake up (by a call to
  lock_wait_release_thread_if_suspended()), but have not yet had a chance to
  act upon it (it is the trx being woken up who is responsible for cleaning
  up the `slot` it used). So, the slot can still be in use and contain a
  transaction which, for example, was already decided to be rolled back.
  However, we can recognize this situation by looking at
  trx->lock.wait_lock, as each call to
  lock_wait_release_thread_if_suspended() is performed only after
  lock_reset_lock_and_trx_wait() resets trx->lock.wait_lock to NULL.
  Checking trx->lock.wait_lock must be done under lock_mutex. */
  lock_mutex_enter();
  if (!lock_wait_trxs_are_still_waiting(cycle_ids, infos)) {
    lock_mutex_exit();
    lock_wait_mutex_exit();
    return false;
  }

  /* We can now release the lock_wait_mutex, because:
  1. we have verified that trx->lock.wait_lock is not NULL for cycle_ids
  2. we hold lock_sys->mutex
  3. lock_sys->mutex is required to change trx->lock.wait_lock to NULL
  4. only after changing trx->lock.wait_lock to NULL can a trx finish
  So as long as we hold lock_sys->mutex we can access the trxs. */
  lock_wait_mutex_exit();

  trx_t *const chosen_victim = lock_wait_choose_victim(cycle_ids, infos);
  ut_a(chosen_victim);
  lock_wait_handle_deadlock(chosen_victim, cycle_ids, infos);
  lock_mutex_exit();
  return true;
}
/** Generates a list of `cycle_ids` by following the `outgoing` edges from
the `start`.
@param[in,out]  cycle_ids  will contain the list of ids on the cycle
@param[in]      start      the first id on the cycle
@param[in]      outgoing   the ids of edge endpoints: id -> outgoing[id] */
static void lock_wait_extract_cycle_ids(ut::vector<uint> &cycle_ids,
                                        const uint start,
                                        const ut::vector<int> &outgoing) {
  cycle_ids.clear();
  uint id = start;
  do {
    cycle_ids.push_back(id);
    id = outgoing[id];
  } while (id != start);
}

/** Assuming that `infos` contains information about all waiting
transactions, and `outgoing[i]` is the endpoint of the wait-for edge going
out of infos[i].trx, or -1 if the transaction is not waiting, identifies and
handles all cycles in the wait-for graph.
@param[in]  infos     information about all waiting transactions
@param[in]  outgoing  the ids of edge endpoints. If outgoing[id] == -1, then
                      there is no edge going out of id, otherwise
                      infos[id].trx waits for infos[outgoing[id]].trx */
static void lock_wait_find_and_handle_deadlocks(
    const ut::vector<waiting_trx_info_t> &infos,
    const ut::vector<int> &outgoing) {
  ut_ad(infos.size() == outgoing.size());
  /* We are going to use int and uint to store positions within infos */
  ut_ad(infos.size() < (1 << 20));
  const auto n = static_cast<uint>(infos.size());
  ut::vector<uint> cycle_ids;
  cycle_ids.clear();
  ut::vector<uint> colors;
  colors.clear();
  colors.resize(n, 0);
  uint current_color = 0;
  for (uint start = 0; start < n; ++start) {
    if (colors[start] != 0) {
      /* This node was already fully processed */
      continue;
    }
    ++current_color;
    for (int id = start; 0 <= id; id = outgoing[id]) {
      /* We don't expect a transaction to deadlock with itself only, and we
      do not handle cycles of length = 1 correctly */
      ut_ad(id != outgoing[id]);

      if (colors[id] == 0) {
        /* This node was never visited yet */
        colors[id] = current_color;
        continue;
      }
      /* This node was already visited:
      - either it has current_color, which means we've visited it during the
        current DFS descent, which means we have found a cycle, which we
        need to verify,
      - or it has a color used in a previous DFS, which means that the
        current DFS path merges into an already processed portion of the
        wait-for graph, so we can stop now */
      if (colors[id] == current_color) {
        /* found a candidate cycle! */
        lock_wait_extract_cycle_ids(cycle_ids, id, outgoing);

        if (lock_wait_check_candidate_cycle(cycle_ids, infos)) {
          MONITOR_INC(MONITOR_DEADLOCK);
        } else {
          MONITOR_INC(MONITOR_DEADLOCK_FALSE_POSITIVES);
        }
      }

      break;
    }
  }
  MONITOR_INC(MONITOR_DEADLOCK_ROUNDS);
  MONITOR_SET(MONITOR_LOCK_THREADS_WAITING, n);
}
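/* Illustrative sketch (not actual InnoDB code): a self-contained toy
walkthrough of the coloring scheme used by
lock_wait_find_and_handle_deadlocks() above. Each outer iteration picks a
fresh color and follows outgoing edges; hitting the current color means we
closed a cycle, hitting an older color means we merged into a path explored
before:

  #include <vector>
  // returns the ids of one detected cycle, or an empty vector
  inline std::vector<int> find_cycle(const std::vector<int> &outgoing) {
    std::vector<unsigned> colors(outgoing.size(), 0);
    unsigned current_color = 0;
    for (size_t start = 0; start < outgoing.size(); ++start) {
      if (colors[start] != 0) continue;
      ++current_color;
      for (int id = static_cast<int>(start); 0 <= id; id = outgoing[id]) {
        if (colors[id] == 0) {
          colors[id] = current_color;
          continue;
        }
        if (colors[id] == current_color) {  // closed a cycle on this path
          std::vector<int> cycle;
          for (int c = id; cycle.empty() || c != id; c = outgoing[c])
            cycle.push_back(c);
          return cycle;
        }
        break;  // merged into an already-colored region
      }
    }
    return {};
  }
  // find_cycle({1, 2, 0, 1}) == {0, 1, 2}: nodes 0, 1, 2 wait in a ring,
  // while node 3 merges into it and is not part of the cycle.
*/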
/** Takes a snapshot of the transactions currently waiting in slots, searches
for deadlocks among them, and resolves them. */
static void lock_wait_check_for_deadlocks() {
  /* Note: I was tempted to declare `infos` as `static`, or at least to
  declare it in lock_wait_timeout_thread() and reuse the same instance over
  and over again, to avoid the allocator calls caused by the push_back()
  calls inside lock_wait_snapshot_waiting_threads() while we hold
  lock_sys->lock_wait_mutex. I was afraid that the allocator might need to
  block on some internal mutex in order to synchronize with other threads
  using the allocator, and this could in turn cause contention on
  lock_wait_mutex.

  I hoped that, since vectors never shrink and only grow, keeping a single
  instance of `infos` alive for the whole lifetime of the thread should
  increase performance, because after some initial period of growing, the
  allocations would never have to occur again. But I've run many various
  experiments, with/without static, with infos declared outside, with
  reserve(n) using various values of n (128, srv_max_n_threads, even a
  simple ML predictor), and nothing, NOTHING, was faster than just using a
  local vector as we do here (at least on tetra01, tetra02, when comparing
  ~70 runs of each algorithm on uniform, pareto, 128 and 1024 usrs). This
  was counter-intuitive to me, so I've checked how malloc is implemented,
  and it turns out that modern malloc is a variant of ptmalloc2 and uses a
  large number of independent arenas (larger than the number of threads),
  and various smart heuristics to map threads to these arenas in a way which
  avoids blocking. So, before changing the way we handle allocation of
  memory in these lock_wait_* routines, please, PLEASE, first check
  empirically if it really helps, by running many tests and checking for
  statistical significance, as the variance seems to be large. (And the
  downside of using `static` is that it is not very obvious how our custom
  deallocator, which tries to update some stats, would behave during
  shutdown.)

  Anyway, in case some tests indicate that taking the snapshot is a
  bottleneck, one can check if declaring these vectors as static solves the
  issue. */
  if (innobase_deadlock_detect) {
    ut::vector<waiting_trx_info_t> infos;
    ut::vector<int> outgoing;

    lock_wait_snapshot_waiting_threads(infos);
    lock_wait_build_wait_for_graph(infos, outgoing);
    lock_wait_find_and_handle_deadlocks(infos, outgoing);
  }
}

/** A thread which wakes up threads whose lock wait may have lasted too long,
analyzes wait-for-graph changes, and checks for deadlocks and resolves them. */
void lock_wait_timeout_thread() {
  int64_t sig_count = 0;
  os_event_t event = lock_sys->timeout_event;

  ut_ad(!srv_read_only_mode);

  /* The last time we've checked for timeouts. */
  auto last_checked_for_timeouts_at = ut_time();
  do {
    auto now = ut_time();
    if (0.5 < ut_difftime(now, last_checked_for_timeouts_at)) {
      last_checked_for_timeouts_at = now;
      lock_wait_check_slots_for_timeouts();
    }

    lock_wait_check_for_deadlocks();

    /* When someone is waiting for a lock, we wake up every second (at worst)
    and check if a timeout has passed for a lock wait */
    os_event_wait_time_low(event, 1000000, sig_count);
    sig_count = os_event_reset(event);
  } while (srv_shutdown_state.load() == SRV_SHUTDOWN_NONE);
}