/************************************************************************ * * Copyright (c) 2016 Alibaba.com, Inc. All Rights Reserved * $Id: paxos.cc,v 1.0 07/31/2016 04:16:32 PM yingqiang.zyq(yingqiang.zyq@alibaba-inc.com) $ * ************************************************************************/ /** * @file paxos.cc * @author yingqiang.zyq(yingqiang.zyq@alibaba-inc.com) * @date 07/31/2016 04:16:32 PM * @version 1.0 * @brief The implement of the PAXOS algorithm * **/ #include #include "paxos.h" namespace alisql { Paxos::Paxos(uint64_t electionTimeout, std::shared_ptr log, uint64_t purgeLogTimeout) :debugMaxSendLogIndex(0) ,config_(new StableConfiguration()) /*TODO:*/ ,log_(log) ,clusterId_(0) ,shutdown_(false) ,maxPacketSize_(1000000) ,maxDelayIndex_(10000) ,minDelayIndex_(100) ,largeBatchRatio_(5) ,pipeliningTimeout_(3) ,electionTimeout_(electionTimeout) ,heartbeatTimeout_(electionTimeout/5) ,purgeLogTimeout_(purgeLogTimeout) ,currentTerm_(1) ,leaderStepDowning_(false) ,commitIndex_(0) ,leaderId_(0) ,leaderAddr_("") ,votedFor_(0) ,forceRequestMode_(false) ,currentEpoch_(0) ,forceSyncEpochDiff_(0) ,state_(FOLLOWER) ,subState_(SubNone) ,weightElecting_(false) ,leaderForceSyncStatus_(true) ,consensusAsync_(false) ,replicateWithCacheLog_(false) ,optimisticHeartbeat_(false) //,changeStateWorkers_(0) ,autoPurge_(false) ,useAppliedIndex_(true) ,minMatchIndex_(0) ,appliedIndex_(0) ,followerMetaNo_(0) ,lastSyncMetaNo_(0) ,syncMetaInterval_(1) ,maxDelayIndex4NewMember_(100) ,maxMergeReportTimeout_(2000) ,nextEpochCheckStatemachine_(0) ,compactOldMode_(true) ,enableLogCache_(false) ,enableDynamicEasyIndex_(false) ,enableLearnerPipelining_(false) ,enableAutoResetMatchIndex_(false) ,enableLearnerAutoResetMatchIndex_(false) ,stateChangeCb_(nullptr) ,checksumCb_(nullptr) ,checksum_mode_(false) ,port_(0) { } Paxos::~Paxos () { if (!shutdown_.load()) shutdown(); } void Paxos::shutdown() { /* We should stop all ThreadTimer before close ThreadTimerService in Service::shutdown 
*/ lock_.lock(); if (state_ == LEADER) { log_->setMetaData(keyLastLeaderTerm, currentTerm_); log_->setMetaData(keyLastLeaderLogIndex, commitIndex_); } shutdown_.store(true); if (ccMgr_.prepared) { ccMgr_.aborted= 1; ccMgr_.cond.notify_all(); } ccMgr_.autoChangeAddr = ""; ccMgr_.condChangeDone.notify_all(); lock_.unlock(); cond_.notify_all(); electionTimer_.reset(); epochTimer_.reset(); purgeLogTimer_.reset(); purgeLogQueue_.stop(); changeStateQueue_.stop(); appendLogQueue_.stop(); commitDepQueue_.stop(); config_->forEach(&Server::stop, NULL); config_->forEachLearners(&Server::stop, NULL); srv_->closeThreadPool(); srv_->shutdown(); /* When Service::shutdown return, there is not backend worker left, so we can release config_ now. */ config_.reset(); } void Paxos::stop() { /* We should stop all ThreadTimer before close ThreadTimerService in Service::shutdown */ electionTimer_->stop(); epochTimer_->stop(); purgeLogTimer_->stop(); config_.reset(); srv_->stop(); } void Paxos::changeState_(enum State newState) { /* We call sendAsyncEvent every time if the term change or state change. 
*/ /* if (state_ == newState) return; */ easy_error_log("Server %d : Paxos state change from %s to %s !!\n", localServer_->serverId, stateString[state_], stateString[newState]); /* only leader run purge log timer */ if (state_ == LEADER) { purgeLogTimer_->stop(); } if (newState != CANDIDATE) { forceRequestMode_= false; } state_.store(newState); leaderForceSyncStatus_.store(true); if (newState == LEADER) { if (autoPurge_ == true) { purgeLogTimer_->restart(); } } else { subState_.store(SubNone); weightElecting_ = false; } if (newState == LEADER) { leaderId_.store(localServer_->serverId); leaderAddr_= localServer_->strAddr; option.extraStore->setRemote(option.extraStore->getLocal()); } log_->resetMetaCache(); if (stateChangeCb_) { if (changeStateQueue_.push(new ChangeStateArgType(state_, currentTerm_, commitIndex_, this))) srv_->sendAsyncEvent(&SingleProcessQueue::process, &changeStateQueue_, Paxos::execStateChangeCb); } cond_.notify_all(); } void Paxos::membershipChangeHistoryUpdate_(const MembershipChangeType &mc) { if (membershipChangeHistory_.size() >= 10) membershipChangeHistory_.erase(membershipChangeHistory_.begin()); membershipChangeHistory_.push_back(mc); } int Paxos::applyConfigureChangeNoLock_(uint64_t logIndex) { LogEntry entry; uint64_t index= 0; if (logIndex == 0) // for defensive return 0; if (log_->getEntry(logIndex, entry, false)) { easy_error_log("Server %d: fail to get log entry when apply configure change, index %llu", localServer_->serverId, logIndex); return 0; } assert(entry.optype() == kConfigureChange); ConfigureChangeValue val; val.ParseFromString(std::move(entry.value())); MembershipChangeType mc; mc.cctype = (CCOpTypeT)val.cctype(); mc.optype = (CCOpTypeT)val.optype(); if (val.addrs().size()) mc.address = *(val.addrs().begin()); if (val.cctype() == CCMemberOp) { //for membership change const std::string& addr= *(val.addrs().begin()); if (val.optype() == CCAddNode) { assert(val.addrs_size() == 1); if (state_ != LEARNER) 
config_->addMember(addr, this); else if (addr == localServer_->strAddr) { /* learner change to follower */ std::vector strConfig; for (auto& it : val.allservers()) { strConfig.push_back(it); if (it == addr) index= strConfig.size(); } assert(index != 0); /* the allservers already include the new add server(local server). */ //strConfig.push_back(addr); /* The learner's localServer may be AliSQLServer, we should pass it to installConfig. */ std::dynamic_pointer_cast(config_)->installConfig(strConfig, index, this, localServer_); config_->forEach(&Server::connect, (void *)NULL); changeState_(FOLLOWER); electionTimer_->start(); /* Init learners */ std::vector strConfigL; for (auto& addr : val.alllearners()) { if (addr != localServer_->strAddr) strConfigL.push_back(std::move(addr)); } config_->delAllLearners(); config_->addLearners(strConfigL, this, true); log_->setMetaData(Paxos::keyLearnerConfigure, config_->learnersToString()); log_->setMetaData(Paxos::keyMemberConfigure, config_->membersToString(localServer_->strAddr)); /* Print Log */ std::string logBuf; for (auto& addr : strConfig) { logBuf += addr; logBuf += " "; } std::string logBufL; for (auto& addr : strConfigL) { logBufL += addr; logBufL += " "; } easy_error_log("Server %d : Init follower from learner, new members(%s) new learners(%s)\n", localServer_->serverId, logBuf.c_str(), logBufL.c_str()); } else { std::vector strConfig; strConfig.push_back(std::move(addr)); config_->delLearners(strConfig, this); } if (ccMgr_.autoChangeAddr == addr) { ccMgr_.autoChangeAddr = ""; ccMgr_.autoChangeRet = 0; ccMgr_.condChangeDone.notify_all(); } } else if (val.optype() == CCDelNode) { if (state_ != LEARNER) { if (addr != localServer_->strAddr) config_->delMember(addr, this); else { /* This node is removed from the cluster, shutdown myself */ easy_error_log("Server %d : This node is removed from the cluster, shutdown myself currentTerm(%llu) lli(%llu) ccIndex(%llu)!!\n", localServer_->serverId, currentTerm_.load(), 
log_->getLastLogIndex(), logIndex); localServer_->serverId += 1000; stop(); } } } else if (val.optype() == CCDowngradeNode) { std::vector strConfig; strConfig.push_back(addr); if (state_ != LEARNER) { if (addr != localServer_->strAddr) { config_->delMember(addr, this); config_->addLearners(strConfig, this); //if (state_ == LEADER) // config_->forEachLearners(&Server::connect, (void *)NULL); } else { auto oldId= localServer_->serverId; std::shared_ptr localServer= std::dynamic_pointer_cast(config_->getServer(oldId)); assert(localServer != nullptr); std::dynamic_pointer_cast(config_)->installConfig(strConfig, 1, this, localServer); config_->addLearners(strConfig, this); log_->setMetaData(Paxos::keyLearnerConfigure, config_->learnersToString()); log_->setMetaData(Paxos::keyMemberConfigure, config_->membersToString()); /* We set the init serverId to 100 for tmp, this serverId will change when we */ localServer_->serverId= 100; this->state_.store(LEARNER); this->electionTimer_->stop(); easy_error_log("Server %d : This server is downgrade from follower(%llu) to learner(%llu)!!", localServer_->serverId, oldId, localServer_->serverId); } } else { config_->addLearners(strConfig, this); } } else if (val.optype() == CCConfigureNode) { mc.forceSync = val.forcesync(); mc.electionWeight = val.electionweight(); if (state_ != LEARNER) { auto server= config_->getServer(val.serverid()); if (server == nullptr || addr != server->strAddr) { easy_error_log("Server %d : Can't find the target server(id:%llu, addr:%s) in the configure!! 
Current member configure:%s\n", localServer_->serverId, val.serverid(), addr.c_str(), config_->membersToString(localServer_->strAddr).c_str()); } else { config_->configureMember(val.serverid(), val.forcesync(), val.electionweight(), this); if (val.serverid() == localServer_->serverId) electionTimer_->setRandWeight(val.electionweight()); } } } } else if (val.cctype() == CCLearnerOp) { if (val.optype() == CCAddNode || val.optype() == CCAddLearnerAutoChange) { std::vector strConfig, strLearners; for (auto& addr : val.addrs()) strConfig.push_back(std::move(addr)); for (auto& addr : val.alllearners()) strLearners.push_back(std::move(addr)); if (StableConfiguration::isServerInVector(localServer_->strAddr, strConfig)) { config_->delAllLearners(); config_->addLearners(strLearners, this, true); } config_->addLearners(strConfig, this); /* The old learner will skip the connect call. */ if (state_ == LEADER) config_->forEachLearners(&Server::connect, (void *)NULL); if (val.optype() == CCAddLearnerAutoChange) { // strConfig.size is 1 in this case ccMgr_.autoChangeAddr = strConfig[0]; } } else if (val.optype() == CCDelNode) { std::vector strConfig; for (auto& addr : val.addrs()) strConfig.push_back(std::move(addr)); config_->delLearners(strConfig, this); /* autoChange case, wakeup addFollower if deleted */ auto findret = std::find(strConfig.begin(), strConfig.end(), ccMgr_.autoChangeAddr); if (ccMgr_.autoChangeAddr != "" && findret != strConfig.end()) { ccMgr_.autoChangeAddr = ""; ccMgr_.autoChangeRet = -2; ccMgr_.condChangeDone.notify_all(); } } else if (val.optype() == CCConfigureNode) { auto server= config_->getServer(val.serverid()); auto source = config_->getServer(val.learnersource()); const std::string& addr= *(val.addrs().begin()); mc.learnerSource = (source ? source->strAddr : ""); mc.sendByAppliedIndex = val.applymode(); if (server == nullptr || addr != server->strAddr) { easy_error_log("Server %d : Can't find the target server(id:%llu, addr:%s) in the configure!! 
Current learner configure:%s\n", localServer_->serverId, val.serverid(), addr.c_str(), config_->learnersToString().c_str()); } else { if (server->learnerSource == localServer_->serverId) { server->stepDown(nullptr); } config_->configureLearner(val.serverid(), val.learnersource(), this); server->sendByAppliedIndex = val.applymode(); /* We should also init learner if we're leader and learner source is 0. */ if (server->learnerSource == localServer_->serverId || (state_ == LEADER && server->learnerSource == 0)) { easy_error_log("Server %d : a new learner %d is sourced from me!!\n", localServer_->serverId, server->serverId); server->beginLeadership(nullptr); server->connect(nullptr); } } } else if (val.optype() == CCSyncLearnerAll) { std::vector strConfig; std::string strServers; for (auto& addr : val.alllearners()) { strServers += addr; strServers += ";"; strConfig.push_back(std::move(addr)); } if (strServers.size() > 0) strServers.resize(strServers.size() - 1); auto strLearners= config_->learnersToString(); if (strServers != strLearners) { easy_error_log("Server %d : Error: local learner meta error local:%s leader:%s!!\n", localServer_->serverId, strLearners.c_str(), strServers.c_str()); easy_error_log("Server %d : SyncLearnerAll: update local learner config from %s to %s\n", localServer_->serverId, strLearners.c_str(), strServers.c_str()); config_->delAllLearners(); config_->addLearners(strConfig, this, true); log_->setMetaData(Paxos::keyLearnerConfigure, config_->learnersToString()); } else { easy_error_log("Server %d : SyncLearnerAll: local learner is match with leader %s\n", localServer_->serverId, strServers.c_str()); } } } else { assert(0); } membershipChangeHistoryUpdate_(mc); uint64_t itmp; log_->getMetaData(std::string(keyScanIndex), &itmp); if (itmp <= logIndex) log_->setMetaData(keyScanIndex, 0); if (ccMgr_.needNotify == 1) { ccMgr_.applied= 1; ccMgr_.aborted= 0; ccMgr_.cond.notify_all(); } easy_error_log("Server %d : applyConfigureChange_ done! 
logIndex(%llu) currentTerm(%ld) val.cctype(%d) val.optype(%d)\n", localServer_->serverId, logIndex, currentTerm_.load(), val.cctype(), val.optype()); return 0; } int Paxos::leaderTransfer_(uint64_t targetId) { if (state_ != LEADER) return PaxosErrorCode::PE_NOTLEADR; auto server = config_->getServer(targetId); if (nullptr == server || targetId == 0) return PaxosErrorCode::PE_NOTFOUND; if (subState_ == SubLeaderTransfer) { easy_error_log("Server %d : leaderTransfer to server(%ld), Now we're in another leader transfer, skip this action!", localServer_->serverId, targetId); return PaxosErrorCode::PE_CONFLICTS; } if (cdrMgr_.inRecovery) { easy_error_log("Server %d : leaderTransfer to server(%ld), Now we're in commit dependency recovery, skip this action!", localServer_->serverId, targetId); return PaxosErrorCode::PE_CONFLICTS; } if (targetId == localServer_->serverId) return PaxosErrorCode::PE_NONE; if (std::dynamic_pointer_cast(server)->isLearner) // server == nullptr already checked { easy_error_log("Server %d : leaderTransfer to server(%ld), it is a learner, skip this action!", localServer_->serverId, targetId); return PaxosErrorCode::PE_NOTFOLLOWER; } ++ (stats_.countLeaderTransfer); easy_error_log("Server %d : leaderTransfer to server(%ld), currentTerm(%ld), lli(%ld)\n", localServer_->serverId, targetId, currentTerm_.load(), log_->getLastLogIndex()); /* Stop new replicateLog */ subState_.store(SubLeaderTransfer); MembershipChangeType mc; mc.cctype = CCMemberOp; mc.optype = CCLeaderTransfer; mc.address = server->strAddr; membershipChangeHistoryUpdate_(mc); auto term= currentTerm_.load(); auto lli= log_->getLastLogIndex(); lock_.unlock(); auto slli= log_->getSafeLastLogIndex(); // sleep for 500ms to let log sync to disk if (lli < slli) msleep(500); int ret= leaderTransferSend_(targetId, term, slli, 5); lock_.lock(); return ret; } int Paxos::leaderTransfer(uint64_t targetId) { std::lock_guard lg(lock_); return leaderTransfer_(targetId); } int 
Paxos::leaderTransfer(const std::string& addr) { std::lock_guard lg(lock_); uint64_t targetId = config_->getServerIdFromAddr(addr); return leaderTransfer_(targetId); } int Paxos::leaderTransferSend_(uint64_t targetId, uint64_t term, uint64_t logIndex, uint64_t leftCnt) { std::lock_guard lg(lock_); -- leftCnt; if (checkLeaderTransfer(targetId, term, logIndex, leftCnt) > 0) { std::shared_ptr server= std::dynamic_pointer_cast(config_->getServer(targetId)); if (server == nullptr) { easy_error_log("Server %d : try transfer leader to id(%d), which is not in the configuration!!", localServer_->serverId, targetId); return PaxosErrorCode::PE_NOTFOUND; } if (commitIndex_ == logIndex && commitIndex_ == server->matchIndex) leaderCommand(LeaderTransfer, server); else { easy_error_log("Server %d : skip send cmd LeaderTransfer because the pos is not catch up. commitIndex(%llu), lli(%llu), target matchIndex(%llu)", localServer_->serverId, commitIndex_, log_->getLastLogIndex(), server->matchIndex.load()); } // TODO we also need to call leaderCommand in tryUpdateCommitIndex_ /* do not conflict with heartbeat timeout */ new ThreadTimer(srv_->getThreadTimerService(), srv_, getLeaderTransferInterval_(), ThreadTimer::Oneshot, &Paxos::leaderTransferSend_, this, targetId, term, logIndex, leftCnt); } return PaxosErrorCode::PE_NONE; } int Paxos::checkLeaderTransfer(uint64_t targetId, uint64_t term, uint64_t& logIndex, uint64_t leftCnt) { uint64_t lastLogIndex= log_->getLastLogIndex(); if ( state_ == LEADER && subState_ == SubLeaderTransfer && term == currentTerm_) { if (lastLogIndex > logIndex) { easy_error_log("Server %d : checkLeaderTransfer: In transfer to server %ld local lli:%llu is bigger than target lli:%llu, we update target lli to current lli.\n", localServer_->serverId, targetId, lastLogIndex, logIndex); logIndex= lastLogIndex; } if (leftCnt > 0) { easy_warn_log("Server %d : checkLeaderTransfer: LeaderTransfer to server %ld not complete, left check time %llu", 
localServer_->serverId, targetId, leftCnt); return 1; } else { subState_.store(SubNone); weightElecting_ = false; easy_error_log("Server %d : checkLeaderTransfer: LeaderTransfer to server %ld fail because of timeout currentTerm(%ld), lli(%ld)\n", localServer_->serverId, targetId, term, logIndex); return -1; } } else if (state_ == FOLLOWER && currentTerm_ > term && lastLogIndex > logIndex && leaderId_ == targetId) { easy_error_log("Server %d : checkLeaderTransfer: LeaderTransfer success target(id:%ld t:%ld lli:%ld) current(t:%ld lli:%ld)\n", localServer_->serverId, targetId, term, logIndex, currentTerm_.load(), lastLogIndex); return 0; } else { subState_.store(SubNone); weightElecting_ = false; easy_error_log("Server %d : checkLeaderTransfer: Nonleader election may happened during the leadertransfer, please check the status! target(id:%ld t:%ld lli:%ld) current(id:%ld t:%ld lli:%ld)\n", localServer_->serverId, targetId, term, logIndex, leaderId_.load(), currentTerm_.load(), lastLogIndex); return -1; } return 0; } int Paxos::checkConfigure_(CCOpTypeT cctype, CCOpTypeT type, std::vector& strConfig, const std::vector& servers) { for (auto it= strConfig.begin(); it != strConfig.end(); ) { if (type == CCAddNode || type == CCAddLearnerAutoChange) { bool dup= false; for (auto& server : servers) { if (server && server->strAddr == *it) { it= strConfig.erase(it); dup= true; break; } } /* In add learner case we should also check servers */ if (!dup && cctype == CCLearnerOp) { auto& members= config_->getServers(); for (auto& server : members) { if (server && server->strAddr == *it) { it= strConfig.erase(it); dup= true; break; } } } if (!dup) ++ it; } else if (type == CCDelNode) { bool found= false; uint64_t i= 0; for (auto& server : servers) { if (server && server->strAddr == *it) { found= true; break; } ++ i; } if (found) ++ it; else it= strConfig.erase(it); } } return 0; } inline void Paxos::prepareConfigureChangeEntry_(const LogEntry& entry, PaxosMsg *msg, bool fromCache) { 
if (ccMgr_.prepared == 0) { ccMgr_.prepared= 1; ccMgr_.preparedIndex= entry.index(); } else { // uint64_t leaderCommitIndex = fromCache? logRecvCache_.getCommitIndex(): msg->commitindex(); // potantial bug here // assert(ccMgr_.preparedIndex <= leaderCommitIndex || ccMgr_.preparedIndex <= commitIndex_); applyConfigureChangeNoLock_(ccMgr_.preparedIndex); ccMgr_.prepared= 1; ccMgr_.preparedIndex= entry.index(); } log_->setMetaData(keyScanIndex, ccMgr_.preparedIndex); } int Paxos::sendConfigureAndWait_(const ConfigureChangeValue& val, std::unique_lock& ul) { int ret= 0; std::string buf; val.SerializeToString(&buf); LogEntry entry; entry.set_optype(kConfigureChange); entry.set_value(buf); // Step 3: send log entry, and wait for applied uint64_t index; if (ccMgr_.prepared == 0) { log_->setMetaData(keyScanIndex, log_->getLastLogIndex()); if ((index= replicateLog_(entry, false)) > 0) { if (entry.index() > commitIndex_) { ccMgr_.prepared= 1; ccMgr_.preparedIndex= entry.index(); ccMgr_.needNotify= 1; if (ccMgr_.waitTimeout.load() == 0) { while (ccMgr_.applied == 0 && ccMgr_.aborted == 0) ccMgr_.cond.wait(ul); } else { bool waitRet = ccMgr_.cond.wait_for(ul, std::chrono::milliseconds(ccMgr_.waitTimeout.load()), [this]() { return (ccMgr_.applied != 0 || ccMgr_.aborted != 0); }); if (waitRet == false) { ccMgr_.needNotify= 0; easy_error_log("Server %d : configureChange wait timeout, preparedIndex(%d).\n", localServer_->serverId, ccMgr_.preparedIndex); return PaxosErrorCode::PE_TIMEOUT; } } if (ccMgr_.aborted == 1) { easy_error_log("Server %d : configureChange aborted, preparedIndex(%d).\n", localServer_->serverId, ccMgr_.preparedIndex); ret= PaxosErrorCode::PE_DEFAULT; } else { if (state_ == FOLLOWER && ccMgr_.preparedIndex != entry.index()) { /* * Refering to fucntion prepareConfigureChangeEntry_, * leader changed during configureChange, * just return timeout as a result to let the client check and retry */ ccMgr_.aborted= ccMgr_.applied= ccMgr_.needNotify= 0; 
easy_error_log("Server %d : configureChange timeout after leader transfer, old preparedIndex(%d), current preparedIndex(%d).\n", localServer_->serverId, entry.index(), ccMgr_.preparedIndex); return PaxosErrorCode::PE_TIMEOUT; } assert(ccMgr_.preparedIndex == entry.index()); // success, ret is 0 } } else { /* one node case */ applyConfigureChangeNoLock_(entry.index()); if (ccMgr_.needNotify != 1) ccMgr_.clear(); } } else { log_->setMetaData(keyScanIndex, 0); ret= PaxosErrorCode::PE_REPLICATEFAIL; } } else ret= PaxosErrorCode::PE_CONFLICTS; return ret; } void Paxos::setConfigureChangeTimeout(uint64_t t) { ccMgr_.waitTimeout.store(t); } int Paxos::configureChange_(CCOpTypeT cctype, CCOpTypeT optype, std::vector& strConfigArg, const std::vector& servers) { if (cctype != CCMemberOp && cctype != CCLearnerOp) return PaxosErrorCode::PE_INVALIDARGUMENT; if (optype != CCAddNode && optype != CCDelNode && optype != CCSyncLearnerAll && optype != CCAddLearnerAutoChange) return PaxosErrorCode::PE_INVALIDARGUMENT; int ret= PaxosErrorCode::PE_NONE; std::vector strConfig= strConfigArg; std::unique_lock ul(lock_); if (optype == CCAddLearnerAutoChange) { if (strConfig.size() > 1) { easy_error_log("Server %d : Learner auto change to follower only support one learner at a time.\n", localServer_->serverId); return PaxosErrorCode::PE_INVALIDARGUMENT; } if (ccMgr_.autoChangeAddr != "") { easy_error_log("Server %d : Previous learner auto change to follower not finish.\n", localServer_->serverId); return PaxosErrorCode::PE_DEFAULT; } } if (optype == CCAddNode || optype == CCAddLearnerAutoChange || optype == CCDelNode) { /* Step 1: remove dup servers */ checkConfigure_(cctype, optype, strConfig, servers); if (strConfig.size() == 0) { easy_error_log("Server %d : New add member already exist or delete member not found!!\n", localServer_->serverId); if (optype == CCDelNode) return PaxosErrorCode::PE_NOTFOUND; else return PaxosErrorCode::PE_EXISTS; } } /* Step 2: build log entry */ std::string 
logBuf; ConfigureChangeValue val; for (auto& addr : strConfig) { val.add_addrs(addr); logBuf += addr; logBuf += " "; } easy_error_log("Server %d : configureChange begin: cctype(%d) optype(%d) term(%llu) lli(%llu) addrs(%s)\n", localServer_->serverId, cctype, optype, currentTerm_.load(), log_->getLastLogIndex(), logBuf.c_str()); val.set_cctype(cctype); val.set_optype(optype); if (cctype == CCMemberOp && optype == CCAddNode) { auto& newServerAddr= *(strConfig.begin()); bool addNew= false; auto newServer= config_->getLearnerByAddr(newServerAddr); if (newServer == nullptr) { easy_error_log("Server %d : Try to add member from learner %s which is not exist!!\n", localServer_->serverId, newServerAddr.c_str()); return PaxosErrorCode::PE_NOTFOUND; } else if(newServer->getMatchIndex() + maxDelayIndex4NewMember_ < log_->getLastLogIndex()) { easy_error_log("Server %d : Try to add member from learner %d, which is delay too much, matchIndex(%llu), lli(%llu)!!\n", localServer_->serverId, newServer->serverId, newServer->getMatchIndex(), log_->getLastLogIndex()); return PaxosErrorCode::PE_DELAY; } for (auto& server : servers) { if (server) val.add_allservers(StableConfiguration::memberToString(server)); else { if (! addNew) { val.add_allservers(newServerAddr); addNew= true; } else val.add_allservers("0"); } } if (! 
addNew) val.add_allservers(newServerAddr); for (auto& learner : config_->getLearners()) { if (learner && newServerAddr != learner->strAddr) val.add_alllearners(StableConfiguration::learnerToString(learner)); else val.add_alllearners("0"); } } else if (cctype == CCLearnerOp && (optype == CCAddNode || optype == CCAddLearnerAutoChange || optype == CCSyncLearnerAll)) { for (auto& learner : config_->getLearners()) { if (learner) val.add_alllearners(StableConfiguration::learnerToString(learner)); else val.add_alllearners("0"); } } ret= sendConfigureAndWait_(val, ul); easy_error_log("Server %d : configureChange return: cctype(%d) optype(%d) addrs(%s) return(%d) success(%d) preparedIndex(%llu) lli(%llu)\n", localServer_->serverId, cctype, optype, logBuf.c_str(), ret, ccMgr_.applied, ccMgr_.preparedIndex, log_->getLastLogIndex()); /* * In some cases we cannot clear ccMgr flags: * 1. an old configureChange exist and sendConfigureAndWait_ return error directly * 2. replicateLog fail because of some reasons (leader change...) * 3. 
configureChange timeout (needNotify is set to 0 in sendConfigureAndWait_ ) */ if (ret != PaxosErrorCode::PE_REPLICATEFAIL && ret != PaxosErrorCode::PE_CONFLICTS && ret != PaxosErrorCode::PE_TIMEOUT) ccMgr_.clear(); return ret; } int Paxos::changeLearners(CCOpTypeT type, std::vector& strConfig) { return configureChange_(CCLearnerOp, type, strConfig, config_->getLearners()); } int Paxos::changeMember(CCOpTypeT type, std::string& strAddr) { std::vector tmpConfig{strAddr}; if (type == CCAddLearnerAutoChange) { /* addFollower procedure: add learner -> wait recv log -> change to follower */ int ret= configureChange_(CCLearnerOp, CCAddLearnerAutoChange, tmpConfig, config_->getLearners()); if (ret != 0) { easy_error_log("Server %d : addFollower configChange stage 1 fail, error code %d.\n", localServer_->serverId, ret); return ret; } /* wait learner become follower */ std::unique_lock ul(lock_); if (ccMgr_.waitTimeout.load() == 0) ccMgr_.condChangeDone.wait(ul, [this](){ return ccMgr_.autoChangeAddr == ""; }); else { ret = ccMgr_.condChangeDone.wait_for(ul, std::chrono::milliseconds(ccMgr_.waitTimeout.load()), [this](){ return ccMgr_.autoChangeAddr == ""; }); if (!ret) { easy_error_log("Server %d : addFollower wait timeout (%d ms).\n", localServer_->serverId, ccMgr_.waitTimeout.load()); return PaxosErrorCode::PE_TIMEOUT; } } return ccMgr_.autoChangeRet; } else { return configureChange_(CCMemberOp, type, tmpConfig, config_->getServers()); } } int Paxos::autoChangeLearnerAction() { std::vector tmpConfig; { std::lock_guard lg(lock_); if (ccMgr_.autoChangeAddr == "") return PaxosErrorCode::PE_DEFAULT; tmpConfig.push_back(ccMgr_.autoChangeAddr); } int ret= configureChange_(CCMemberOp, CCAddNode, tmpConfig, config_->getServers()); if (ccMgr_.autoChangeAddr != "") { ccMgr_.autoChangeAddr = ""; ccMgr_.autoChangeRet = ret; ccMgr_.condChangeDone.notify_all(); } return ret; } int Paxos::configureLearner_(uint64_t serverId, uint64_t source, bool applyMode, std::unique_lock &ul) { auto 
server= config_->getServer(serverId); int ret= PaxosErrorCode::PE_NONE; if (!server) { easy_error_log("Server %d : can't find server %llu in configureLearner\n", localServer_->serverId, serverId); return PaxosErrorCode::PE_NOTFOUND; } if (server->learnerSource == source && server->sendByAppliedIndex == applyMode) { easy_warn_log("Server %d : nothing changed in configureLearner server %llu, learnerSource:%llu\n", localServer_->serverId, serverId, source); return PaxosErrorCode::PE_NONE; } easy_error_log("Server %d : configureLearner: change learnerSource from %llu to %llu\n", localServer_->serverId, server->learnerSource, source); ConfigureChangeValue val; val.set_cctype(CCLearnerOp); val.set_optype(CCConfigureNode); /* For check. */ val.add_addrs(server->strAddr); val.set_serverid(serverId); val.set_learnersource(source); val.set_applymode(applyMode); ret= sendConfigureAndWait_(val, ul); easy_error_log("Server %d : configureLearner return: serverid(%d) return(%d) success(%d) preparedIndex(%llu) lli(%llu)\n", localServer_->serverId, serverId, ret, ccMgr_.applied, ccMgr_.preparedIndex, log_->getLastLogIndex()); if (ret != PaxosErrorCode::PE_REPLICATEFAIL && ret != PaxosErrorCode::PE_CONFLICTS && ret != PaxosErrorCode::PE_TIMEOUT) ccMgr_.clear(); return ret; } int Paxos::configureLearner(uint64_t serverId, uint64_t source, bool applyMode) { std::unique_lock ul(lock_); return configureLearner_(serverId, source, applyMode, ul); } int Paxos::configureLearner(const std::string& addr, const std::string& sourceAddr, bool applyMode) { std::unique_lock ul(lock_); uint64_t serverId = config_->getServerIdFromAddr(addr); uint64_t source = config_->getServerIdFromAddr(sourceAddr); if (serverId < 100 || source == 0) return PaxosErrorCode::PE_NOTFOUND; /* We make a trick here: If you want to clear the current learner source, just source the address to itself */ if (serverId == source) source = 0; return configureLearner_(serverId, source, applyMode, ul); } int 
Paxos::configureMember_(uint64_t serverId, bool forceSync, uint electionWeight, std::unique_lock &ul) { if (electionWeight > 9) { easy_error_log("Server %d : Fail to change electionWeight. Max electionWeight is 9.", localServer_->serverId); return PaxosErrorCode::PE_INVALIDARGUMENT; } auto server= config_->getServer(serverId); int ret= PaxosErrorCode::PE_NONE; if (!server) { easy_error_log("Server %d : can't find server %llu in configureMember\n", localServer_->serverId, serverId); return PaxosErrorCode::PE_NOTFOUND; } if (serverId >= 100) { easy_error_log("Server %d : can't configure learner %llu in configureMember\n", localServer_->serverId, serverId); return PaxosErrorCode::PE_WEIGHTLEARNER; } if (server->forceSync == forceSync && server->electionWeight == electionWeight) { easy_warn_log("Server %d : nothing changed in configureMember server %llu, forceSync:%u electionWeight:%u\n", localServer_->serverId, serverId, forceSync, electionWeight); return PaxosErrorCode::PE_NONE; } ConfigureChangeValue val; val.set_cctype(CCMemberOp); val.set_optype(CCConfigureNode); /* For check. 
*/ val.add_addrs(server->strAddr); val.set_serverid(serverId); val.set_forcesync(forceSync); val.set_electionweight(electionWeight); ret= sendConfigureAndWait_(val, ul); easy_error_log("Server %d : configureMember return: serverid(%d) return(%d) success(%d) preparedIndex(%llu) lli(%llu)\n", localServer_->serverId, serverId, ret, ccMgr_.applied, ccMgr_.preparedIndex, log_->getLastLogIndex()); if (ret != PaxosErrorCode::PE_REPLICATEFAIL && ret != PaxosErrorCode::PE_CONFLICTS && ret != PaxosErrorCode::PE_TIMEOUT) ccMgr_.clear(); return ret; } int Paxos::configureMember(uint64_t serverId, bool forceSync, uint electionWeight) { std::unique_lock ul(lock_); return configureMember_(serverId, forceSync, electionWeight, ul); } int Paxos::configureMember(const std::string& addr, bool forceSync, uint electionWeight) { std::unique_lock ul(lock_); uint64_t serverId = config_->getServerIdFromAddr(addr); return configureMember_(serverId, forceSync, electionWeight, ul); } int Paxos::downgradeMember_(uint64_t serverId, std::unique_lock &ul) { auto server= config_->getServer(serverId); int ret= 0; if (serverId >= 100) { easy_error_log("Server %d : try to downgrade server %d which is already a learner!!\n", localServer_->serverId, serverId); return PaxosErrorCode::PE_DOWNGRADLEARNER; } if (!server) { easy_error_log("Server %d : can't find server %llu in configureMember!!\n", localServer_->serverId, serverId); return PaxosErrorCode::PE_NOTFOUND; } if (localServer_->serverId == serverId && state_ == LEADER) { easy_error_log("Server %d : can't downgrade leader(%llu) to learner!!\n", localServer_->serverId, serverId); return PaxosErrorCode::PE_DOWNGRADELEADER; } ConfigureChangeValue val; val.set_cctype(CCMemberOp); val.set_optype(CCDowngradeNode); /* For check. 
*/ val.add_addrs(server->strAddr); ret= sendConfigureAndWait_(val, ul); easy_error_log("Server %d : downgradeMember return: serverid(%d) return(%d) success(%d) preparedIndex(%llu) lli(%llu)\n", localServer_->serverId, serverId, ret, ccMgr_.applied, ccMgr_.preparedIndex, log_->getLastLogIndex()); if (ret != PaxosErrorCode::PE_REPLICATEFAIL && ret != PaxosErrorCode::PE_CONFLICTS && ret != PaxosErrorCode::PE_TIMEOUT) ccMgr_.clear(); return ret; } int Paxos::downgradeMember(uint64_t serverId) { std::unique_lock ul(lock_); return downgradeMember_(serverId, ul); } int Paxos::downgradeMember(const std::string& addr) { std::unique_lock ul(lock_); uint64_t serverId = config_->getServerIdFromAddr(addr); return downgradeMember_(serverId, ul); } void Paxos::becameLeader_() { if (state_ != LEADER) { /* Deal with commit dependency case before set to LEADER */ LogEntry tmpEntry; uint64_t tmpIndex = log_->getLastLogIndex(); if (log_->getEntry(tmpIndex, tmpEntry, false)) { easy_error_log("Server %d: fail to get log entry when become leader to find out commit dependency, index %llu", localServer_->serverId, tmpIndex); exit(1); } if (tmpEntry.optype() == kCommitDep) { cdrMgr_.inRecovery = true; cdrMgr_.lastLogIndex = tmpIndex; cdrMgr_.lastNonCommitDepIndex = 0; easy_error_log("Server %d : Last log optype is kCommitDep, will reset the log.\n", localServer_->serverId); } nextEpochCheckStatemachine_= getNextEpochCheckStatemachine_(currentEpoch_.load()); /* Deal with the election weight things. */ if (config_->needWeightElection(localServer_->electionWeight)) { easy_error_log("Server %d : Try weight election for this election term(%llu)!!\n", localServer_->serverId, currentTerm_.load()); subState_.store(SubLeaderTransfer); weightElecting_ = true; new ThreadTimer(srv_->getThreadTimerService(), srv_, electionTimeout_, ThreadTimer::Oneshot, &Paxos::electionWeightAction, this, currentTerm_.load(), currentEpoch_.fetch_add(1)); } /* become leader. 
*/ changeState_(LEADER); /* We change timer form election to heartbeat type. */ electionTimer_->stop(); config_->forEach(&Server::beginLeadership, NULL); config_->forEachLearners(&Server::beginLeadership, NULL); /* we start epochTimer_ when we became candidate to detect all node when we became leader. */ /* epochTimer_ will stop when we became follower. */ //epochTimer_->restart(); if (!cdrMgr_.inRecovery) { /* Send an empty log entry to implicitly commit old entries */ LogEntry entry1; log_->getEmptyEntry(entry1); replicateLog_(entry1, false); } else { /* in commit dependency recovery */ if (commitDepQueue_.push(new commitDepArgType(cdrMgr_.lastLogIndex, currentTerm_.load(), this))) srv_->sendAsyncEvent(&SingleProcessQueue::process, &commitDepQueue_, Paxos::commitDepResetLog); } uint64_t lastLogIndex= log_->getLastLogIndex(); LogEntry entry; log_->getEntry(lastLogIndex, entry, false); // ignore error uint64_t lastLogTerm= entry.term(); easy_error_log("Server %d : become Leader (currentTerm %ld, lli:%ld, llt:%ld)!!\n", localServer_->serverId, currentTerm_.load(), lastLogIndex, lastLogTerm); } } bool Paxos::cdrIsValid(commitDepArgType* arg) { std::lock_guard lg(lock_); if (cdrMgr_.inRecovery && currentTerm_ == arg->term) return true; else return false; } void Paxos::cdrClear(commitDepArgType* arg) { std::lock_guard lg(lock_); if (currentTerm_ == arg->term) cdrMgr_.clear(); } void Paxos::commitDepResetLog(commitDepArgType* arg) { if (!arg->paxos->cdrIsValid(arg)) return; std::shared_ptr log = arg->paxos->getLog(); std::shared_ptr localServer = arg->paxos->getLocalServer(); easy_error_log("Server %d : start reset log because of commit dependency.\n", localServer->serverId); LogEntry tmpEntry; uint64_t tmpIndex = arg->lastLogIndex; while(--tmpIndex > 0) { if (log->getEntry(tmpIndex, tmpEntry, false)) { easy_error_log("Server %d: fail to get log entry when reset commit dependency, index %llu", localServer->serverId, tmpIndex); exit(1); } if (tmpEntry.optype() != 
kCommitDep) break; } easy_error_log("Server %d : commitDepResetLog reset from index %ld to %ld.\n", localServer->serverId, tmpIndex + 1, arg->lastLogIndex); arg->paxos->truncateBackward_(tmpIndex + 1); if (arg->paxos->debugResetLogSlow) sleep(1); tmpEntry.Clear(); log->getEmptyEntry(tmpEntry); tmpEntry.set_term(arg->term); while (log->getLastLogIndex() < arg->lastLogIndex) { if (debugResetLogSlow) sleep(1); tmpEntry.set_index(0); tmpEntry.set_checksum(0); /* do not use writeLog, lastSyncedIndex is larger than logindex */ /* no lock protection: PolarDB-X Engine log module ensure that you cannot append a log if you are a follower */ uint64_t reti = localServer->appendLog(tmpEntry); /* avoid dead loop */ if (reti == 0) { /* fail term check */ easy_error_log("Server %d : fail to do log reset for index %llu, which means I am not the real leader.\n", log->getLastLogIndex() + 1); break; } } arg->paxos->cdrClear(arg); easy_error_log("Server %d : finish commitDepResetLog.\n", localServer->serverId); /* still Send an extra empty log entry to implicitly commit old entries */ arg->paxos->replicateLog_(tmpEntry, false); } uint64_t Paxos::replicateLog_(LogEntry &entry, const bool needLock) { uint64_t term= currentTerm_.load(); auto state= state_.load(); auto subState= subState_.load(); if (leaderStepDowning_.load() || state != LEADER || (subState == SubLeaderTransfer && needLock) || term != currentTerm_.load()) { if (state != LEADER) { easy_error_log("Server %d : replicateLog fail because we're not leader!\n", localServer_->serverId); } else if (subState == SubLeaderTransfer) { easy_error_log("Server %d : replicateLog fail because we're in LeaderTransfer!\n", localServer_->serverId); } else { easy_error_log("Server %d : replicateLog fail because we're in LeaderTransfer!\n", localServer_->serverId); } return 0; } if (cdrMgr_.inRecovery) { entry.set_term(0); easy_error_log("Server %d : replicateLog fail because we're in commit dependency recovery!\n", localServer_->serverId); 
return 0;
  }

  /* Stamp the entry with the term sampled at the top of the function. */
  entry.set_term(term);
  /*
  if (needLock)
    lock_.unlock();
    */

  ++ (stats_.countReplicateLog);
  easy_info_log("Server %d : replicateLog write start logTerm(%ld)\n", localServer_->serverId, term);

  /* Traditional Path: write checksum before wirteLog, which is different from PolarDB-X Engine */
  /* if checksum not 0, use the checksum from outside (for ut now) */
  if (checksumCb_ && checksum_mode_ && entry.checksum() == 0)
  {
    const unsigned char* buf = reinterpret_cast(entry.value().c_str());
    entry.set_checksum((uint64_t)checksumCb_(0, buf, entry.value().size()));
  }

  /* A zero logIndex is treated below as "nothing was written". */
  auto logIndex= localServer_->writeLog(entry);
  entry.set_index(logIndex);
  if (entry.optype() != kCommitDep && logIndex > 0)
    cdrMgr_.setLastNonCommitDepIndex(logIndex);
  easy_info_log("Server %d : replicateLog write done logTerm(%ld), logIndex(%ld)\n", localServer_->serverId, term, logIndex);

  /* TODO
   * if we use sendAsyncEvent to append log here,too much workers will be used to
   * appendLog and be blocked by mutex. So we use sendAsyncEvent only after we have
   * the Group appnedLog function !!
   */
  if (logIndex > 0)
    //appendLog(false);
    //srv_->sendAsyncEvent(AEAppendLog, NULL);
    //srv_->sendAsyncEvent(&Paxos::appendLog, this, true);
    if (appendLogQueue_.push(new (Paxos *)(this)))
      srv_->sendAsyncEvent(&SingleProcessQueue::mergeableSameProcess, &appendLogQueue_, Paxos::appendLogCb);

  /* With a single server in the configuration, try to advance the commit index right away. */
  if (!shutdown_.load() && config_->getServerNumLockFree() == 1)
  {
    if (needLock)
      tryUpdateCommitIndex();
    else
      tryUpdateCommitIndex_();
  }

  return logIndex;
}

/**
 * Start a new election round for this node.
 *
 * @param force  copied into the RequestVote message's force field and into
 *               forceRequestMode_.
 * @return -1 when shut down, already LEADER, a LEARNER, or electionWeight is 0;
 *         -2 when debugDisableElection is set;
 *          0 otherwise (including the case where reading the last log entry
 *          failed and no vote request was sent).
 *
 * Holds lock_ for the whole call.
 */
int Paxos::requestVote(bool force)
{
  std::lock_guard lg(lock_);

  if (shutdown_.load())
    return -1;

  if (state_ == LEADER)
  {
    /* TODO: should stepdown ? */
    /* only connect to learner when i am leader */
    if (debugWitnessTest)
    {
      PaxosMsg msg;
      msg.set_term(currentTerm_);
      msg.set_msgtype(RequestVote);
      msg.set_candidateid(localServer_->serverId);
      msg.set_addr(localServer_->strAddr);
      msg.set_force((uint64_t)force);

      uint64_t lastLogIndex;
      msg.set_lastlogindex(lastLogIndex= log_->getLastLogIndex());
      LogEntry entry;
      /* The return value of getEntry is not checked on this debug path. */
      log_->getEntry(lastLogIndex, entry, false);
      msg.set_lastlogterm(entry.term());

      config_->forEachLearners(&Server::sendMsg, (void *)&msg);
    }
    return -1;
  }

  if (state_ == LEARNER)
  {
    easy_error_log("Server %d : Skip requestVote because I am learner.", localServer_->serverId);
    return -1;
  }

  if (localServer_->electionWeight == 0)
  {
    easy_error_log("Server %d : Skip requestVote because electionWeight is 0 currentTerm(%ld)\n", localServer_->serverId, currentTerm_.load());
    return -1;
  }

  /* For debug */
  if (debugDisableElection)
  {
    easy_error_log("Server %d : Skip requestVote because of debugDisableElection currentTerm(%ld)\n", localServer_->serverId, currentTerm_.load());
    return -2;
  }

  uint64_t lastLogIndex = log_->getLastLogIndex();
  LogEntry entry;
  if (log_->getEntry(lastLogIndex, entry, false))
  {
    easy_error_log("Server %d: fail to get log entry when request vote, index %llu", localServer_->serverId, lastLogIndex);
  }
  else
  {
    /* Enter a new term, persist it, and forget the previous leader. */
    ++currentTerm_;
    log_->setTerm(currentTerm_);
    log_->setMetaData(keyCurrentTerm, currentTerm_);
    leaderId_.store(0);
    leaderAddr_= std::string("");
    option.extraStore->setRemote("");
    config_->forEach(&Server::beginRequestVote, NULL);
    forceRequestMode_= force;
    changeState_(CANDIDATE);
    easy_warn_log("Server %d : Epoch task currentEpoch(%llu) during requestVote\n", localServer_->serverId, currentEpoch_.load());
    currentEpoch_.fetch_add(1);
    epochTimer_->restart();

    /* Vote for ourselves and persist the vote. */
    votedFor_= localServer_->serverId;
    log_->setMetaData(keyVoteFor, votedFor_);

    easy_error_log("Server %d : Start new requestVote: new term(%ld)\n", localServer_->serverId, currentTerm_.load());

    PaxosMsg msg;
    msg.set_term(currentTerm_);
    msg.set_msgtype(RequestVote);
    msg.set_candidateid(localServer_->serverId);
    msg.set_addr(localServer_->strAddr);
    msg.set_force((uint64_t)force);

    msg.set_lastlogindex(lastLogIndex);
    msg.set_lastlogterm(entry.term());

    config_->forEach(&Server::sendMsg, (void *)&msg);
  }

  /* Reached on both branches above, i.e. also when the log read failed. */
  electionTimer_->restart(electionTimeout_, true);

  if (config_->getServerNum() == 1)
  {
    /* Only me in the cluster, became leader immediately. */
    becameLeader_();
  }

  return 0;
}

/**
 * Handle an incoming RequestVote message and fill the response.
 *
 * Every rejection on the paths below sets votegranted to 0 and returns 0;
 * -1 is returned only when the instance is shut down.
 */
int Paxos::onRequestVote(PaxosMsg *msg, PaxosMsg *rsp)
{
  ++ (stats_.countOnMsgRequestVote);
  rsp->set_msgid(msg->msgid());
  rsp->set_msgtype(RequestVoteResponce);

  std::lock_guard lg(lock_);
  if (shutdown_.load())
    return -1;
  rsp->set_serverid(localServer_->serverId);

  /* Learners never vote. */
  if (state_ == LEARNER)
  {
    rsp->set_term(msg->term());
    rsp->set_votegranted(0);
    easy_error_log("Server %d : Receive a RequestVote from server %d, term(%llu) when I'm LEARNER!! Just reject!!\n", localServer_->serverId, msg->candidateid(), msg->term());
    return 0;
  }

  /* The candidate must be in the current configuration with a matching address. */
  auto server= std::dynamic_pointer_cast(config_->getServer(msg->candidateid()));
  if (server == nullptr || server->strAddr != msg->addr())
  {
    rsp->set_term(currentTerm_);
    rsp->set_votegranted(0);
    easy_error_log("Server %d : reject RequestVote because this server is not in the current configure, server(id:%llu, addr:%s).\n", localServer_->serverId, msg->candidateid(), msg->addr().c_str());
    return 0;
  }

  /* Requests from an older term are rejected. */
  if (msg->term() < currentTerm_)
  {
    rsp->set_term(currentTerm_);
    rsp->set_votegranted(0);
    easy_error_log("Server %d : Receive an old RequestVote from server %d msg term(%d) current term(%d) reject!!\n", localServer_->serverId, msg->serverid(), msg->term(), currentTerm_.load());
    return 0;
  }

  uint64_t lastLogIndex= log_->getLastLogIndex();
  LogEntry entry;
  if (log_->getEntry(lastLogIndex, entry, false))
  {
    rsp->set_term(currentTerm_);
    rsp->set_votegranted(0);
    easy_error_log("Server %d: fail to get log entry when on request vote, index %llu", localServer_->serverId, lastLogIndex);
    return 0;
  }
  uint64_t lastLogTerm= entry.term();
  bool
logCheck= (msg->lastlogterm() > lastLogTerm || (msg->lastlogterm() == lastLogTerm && msg->lastlogindex() >= lastLogIndex)); easy_error_log("Server %d : leaderStickiness check: msg::force(%d) state_:%d electionTimer_::Stage:%d leaderId_:%llu .\n", localServer_->serverId, msg->force(), state_.load(), electionTimer_->getCurrentStage(), leaderId_.load()); //if (state_ == LEADER || (state_ == FOLLOWER && electionTimer_->getCurrentStage() == 0 && !msg->force())) if (!msg->force() && (state_ == LEADER || (state_ == FOLLOWER && (electionTimer_->getCurrentStage() == 0 && leaderId_ != 0) && !Paxos::debugDisableElection))) { rsp->set_term(currentTerm_); rsp->set_votegranted(0); easy_error_log("Server %d : reject RequestVote because of leaderStickiness, local(lli:%ld, llt:%ld); msg(candidateid: %d, term: %ld lli:%ld, llt:%ld) .\n", localServer_->serverId, lastLogIndex, lastLogTerm, msg->candidateid(), msg->term(), msg->lastlogindex(), msg->lastlogterm()); if (state_ == LEADER) rsp->set_force(1); return 0; } if (msg->term() > currentTerm_) { /* Enter New Term */ easy_error_log("Server %d : New Term in onRequestVote !! server %d 's term(%d) is bigger than me(%d).\n", localServer_->serverId, msg->candidateid(), msg->term(), currentTerm_.load()); newTerm(msg->term()); //TODO handle leader case. need stepDown ? 
if (state_ == LEADER) ; } rsp->set_term(currentTerm_); rsp->set_votegranted(logCheck && votedFor_ == 0); if (rsp->votegranted()) { votedFor_= msg->candidateid(); log_->setMetaData(keyVoteFor, votedFor_); electionTimer_->restart(); } easy_error_log("Server %d : isVote: %d, local(lli:%llu, llt:%d); msg(candidateid: %d, term: %d lli:%llu, llt:%d) .\n", localServer_->serverId, rsp->votegranted(), lastLogIndex, lastLogTerm, msg->candidateid(), msg->term(), msg->lastlogindex(), msg->lastlogterm()); return 0; } int Paxos::onClusterIdNotMatch(PaxosMsg *msg) { assert(msg->msgtype() == ClusterIdNotMatch); std::lock_guard lg(lock_); if (shutdown_.load()) return -1; auto server= std::dynamic_pointer_cast(config_->getServer(msg->serverid())); if (server == nullptr) { easy_error_log("Server %d : onClusterIdNotMatch receive a msg msgId(%llu) from server %llu which has been deleted already!\n", localServer_->serverId, msg->msgid(), msg->serverid()); return -1; } server->disconnect(nullptr); easy_error_log("Server %d : server %llu has different cluster id(%llu), local cluster id(%llu). 
we should remove this server from the current onfiguration!!\n", localServer_->serverId, msg->serverid(), msg->newclusterid(), clusterId_.load()); return 0; } int Paxos::onMsgPreCheck(PaxosMsg *msg, PaxosMsg *rsp) { if (msg->clusterid() != clusterId_.load()) { easy_error_log("Server %d: Recieve a msg from cluster(%llu), current cluster(%llu), msg type(%d), these nodes belong to different clusters.\n", localServer_->serverId, msg->clusterid(), clusterId_.load(), msg->msgtype()); rsp->set_msgtype(Consensus::ClusterIdNotMatch); rsp->set_serverid(msg->serverid()); rsp->set_term(msg->term()); rsp->set_msgid(msg->msgid()); rsp->set_clusterid(msg->clusterid()); rsp->set_newclusterid(clusterId_.load()); return 1; } else if (msg->entries_size() && log_->entriesPreCheck(msg->entries())) { easy_error_log("Server %d: msgId(%llu) onMsgPreCheck, entries pre-check is failed, msg type(%d), skip AppendLog.\n", localServer_->serverId, msg->msgid(), msg->msgtype()); rsp->set_msgtype(Consensus::PreCheckFailedResponce); rsp->set_serverid(msg->serverid()); rsp->set_term(msg->term()); rsp->set_msgid(msg->msgid()); return 1; } return 0; } int Paxos::onMsgPreCheckFailed(PaxosMsg *msg) { assert(msg->msgtype() == PreCheckFailedResponce || msg->msgtype() == ClusterIdNotMatch); // ClusterIdNotMatch if (msg->msgtype() == ClusterIdNotMatch) return onClusterIdNotMatch(msg); // PreCheckFailedResponce std::lock_guard lg(lock_); if (shutdown_.load()) return -1; auto server= std::dynamic_pointer_cast(config_->getServer(msg->serverid())); if (server != nullptr) server->disconnect(nullptr); easy_error_log("Server %d: msgId(%llu) onMsgPreCheckFailed, entries pre-check is failed, msg type(%d).", localServer_->serverId, msg->msgid(), msg->msgtype()); return 0; } int Paxos::onRequestVoteResponce(PaxosMsg *msg) { assert(msg->msgtype() == RequestVoteResponce); std::lock_guard lg(lock_); if (shutdown_.load()) return -1; auto server= std::dynamic_pointer_cast(config_->getServer(msg->serverid())); if (server == 
nullptr) { easy_error_log("Server %d : onRequestVoteResponce receive a msg msgId(%llu) from server %llu which has been deleted already!\n", localServer_->serverId, msg->msgid(), msg->serverid()); return -2; } if (static_cast(server) == false) //for unittest consensus.Paxos_requestVote1 return 0; server->setLastAckEpoch(currentEpoch_); if (msg->term() > currentTerm_) { easy_error_log("Server %d : New Term in onRequestVoteResponce !! server %d 's term(%d) is bigger than me(%d).\n", localServer_->serverId, msg->serverid(), msg->term(), currentTerm_.load()); newTerm(msg->term()); } else if (msg->term() < currentTerm_) { easy_error_log("Server %d : Receive an old RequestVoteResponce from server %d msg term(%d) current term(%d) skip!!\n", localServer_->serverId, msg->serverid(), msg->term(), currentTerm_.load()); if (msg->force()) { /* We reset term, when we're reject because leaderStickiness. */ if (msg->term() >= log_->getLastLogTerm()) { easy_error_log("Server %d : Downgrade term from %llu to %llu when onRequestVoteResponce, because there are leaderStickiness leader(%ld) exist!!\n", localServer_->serverId, currentTerm_.load(), msg->term(), msg->serverid()); newTerm(msg->term()); } } } else if (msg->votegranted()) { assert(msg->term() == currentTerm_); server->hasVote= true; easy_error_log("Server %d : server %d (term:%ld) vote me to became leader.\n", localServer_->serverId, msg->serverid(), msg->term()); if (config_->quorumAll(&Server::haveVote)) { becameLeader_(); } } else easy_error_log("Server %d : server %d refuse to let me became leader.\n", localServer_->serverId, msg->serverid()); return 0; } int Paxos::appendLog(const bool needLock) { if (shutdown_.load()) return -1; if (needLock) lock_.lock(); if (state_ != LEADER) { if (needLock) lock_.unlock(); return -1; } LogEntry entry; PaxosMsg msg; msg.set_term(currentTerm_); msg.set_msgtype(AppendLog); msg.set_leaderid(localServer_->serverId); msg.set_commitindex(commitIndex_); /* * Some fields of msg are filled by 
appendLogFillForEach, * called by RemoteServer::sendMsg. */ config_->forEach(&Server::sendMsg, (void *)&msg); if (needLock) lock_.unlock(); return 0; } int Paxos::appendLogToLearner(std::shared_ptr server, bool needLock) { if (needLock) lock_.lock(); /* XXX Now we support learner source to another learner */ if (state_ != LEADER && state_!= FOLLOWER && state_!= LEARNER) { if (needLock) lock_.unlock(); return -1; } LogEntry entry; PaxosMsg msg; msg.set_term(currentTerm_); msg.set_msgtype(AppendLog); msg.set_leaderid(localServer_->serverId); msg.set_commitindex(commitIndex_); if (server == nullptr) config_->forEachLearners(&Server::sendMsg, (void *)&msg); else { server->sendMsg((void *)&msg); } if (needLock) lock_.unlock(); return 0; } int Paxos::appendLogToServer(std::weak_ptr wserver, bool needLock, bool force) { std::shared_ptr server; if (!(server = wserver.lock())) return -1; return appendLogToServerByPtr(server, needLock, force); } int Paxos::appendLogToServerByPtr(std::shared_ptr server, bool needLock, bool force) { bool lockless4force= false; if (!force) { if (needLock) lock_.lock(); uint64_t lastLogIndex= replicateWithCacheLog_.load() ? log_->getLastCachedLogIndex() : log_->getLastLogIndex(); if ((server->nextIndex > lastLogIndex) || (server->isLearner && server->nextIndex > commitIndex_)) { if (needLock) lock_.unlock(); return -1; } } else { if (state_ != LEADER) return -1; assert(needLock); if (!lock_.try_lock()) { lockless4force= true; } } LogEntry entry; PaxosMsg msg; if (lockless4force) { uint64_t savedTerm= currentTerm_.load(); if (leaderStepDowning_.load() || state_.load() != LEADER || savedTerm != currentTerm_.load()) return -1; msg.set_term(savedTerm); } else { msg.set_term(currentTerm_); } msg.set_msgtype(AppendLog); msg.set_leaderid(localServer_->serverId); msg.set_commitindex(commitIndex_); /* * Some fields of msg are filled by appendLogFillForEach, * called by RemoteServer::sendMsg. */ /* TODO is force necessary ! 
*/ if (force) { if (server->waitForReply) { easy_warn_log("Server %d : server %d do not response in the last heartbeat period, force to send heartbeat msg.\n", localServer_->serverId, server->serverId); server->waitForReply= 0; } } server->sendMsgFunc(lockless4force, force, (void *)&msg); if (needLock && !lockless4force) lock_.unlock(); return 0; } // try to deal with heartbeat optimistically(without mutex), // return true if we successfully processed this heartbeat, // return false otherwise bool Paxos::onHeartbeatOptimistically_(PaxosMsg *msg, PaxosMsg *rsp) { // next 2 load is not safe without mutex, but we just assume they remain unchanged StateType state = state_.load(); uint64_t currentTerm = currentTerm_.load(); // state not right or different term, must process this heartbeat in a traditional way(with mutex) if (state != FOLLOWER || msg->term() != currentTerm) return false; easy_error_log("msgId(%llu) received from leader(%d), term(%d), it is heartbeat and deal it optimistically!\n", msg->msgid(), msg->leaderid(), msg->term()); electionTimer_->restart(); rsp->set_msgtype(AppendLogResponce); rsp->set_msgid(msg->msgid()); // if `msg->serverid()` does not match local server id, leader will fail to process this responce rsp->set_serverid(msg->serverid()); rsp->set_issuccess(false); rsp->set_ignorecheck(true); rsp->set_term(currentTerm); rsp->set_appliedindex(0); return true; } int Paxos::onAppendLog(PaxosMsg *msg, PaxosMsg *rsp) { ++ (stats_.countOnMsgAppendLog); assert(msg->msgtype() == AppendLog); std::unique_lock lg(lock_, std::defer_lock); if (msg->entries_size() == 0 && optimisticHeartbeat_.load() == true) { if (lg.try_lock() == false) { if (onHeartbeatOptimistically_(msg, rsp) == true) { return 0; } lg.lock(); } // lock is already held if we reach here } else { lg.lock(); } if (shutdown_.load()) return -1; uint64_t lastLogIndex= log_->getLastLogIndex(); uint64_t prevLogIndex= msg->prevlogindex(); bool newTermFlag= false; if (1 == 
config_->getServerNumLockFree() && state_.load() != LEARNER) { easy_error_log("Server %d : reject onAppendLog because this server is not in the current configure, server %llu\n", localServer_->serverId, msg->leaderid()); rsp->set_msgid(msg->msgid()); rsp->set_msgtype(AppendLogResponce); rsp->set_serverid(msg->serverid()); rsp->set_issuccess(false); rsp->set_lastlogindex(lastLogIndex); rsp->set_ignorecheck(true); rsp->set_term(currentTerm_); rsp->set_appliedindex(0); return 0; } rsp->set_msgid(msg->msgid()); rsp->set_msgtype(AppendLogResponce); rsp->set_serverid(msg->serverid()); /* when add node and the node does not complete the initialization */ if (NULL == localServer_) { int i = 0; while (NULL == localServer_) { /* avoid loop indefinately */ if (i > 60) break; easy_warn_log("Local server has not be initialized, sleep 1 second!\n"); sleep(1); i++; } } assert(localServer_ != NULL); if (localServer_->serverId != msg->serverid()) { if (state_ != LEARNER) { easy_error_log("Server %d : the server id in the msg(%llu) is not match with local server id for a follower, this may happen during the configure change or hit a bug!!\n", localServer_->serverId, msg->serverid()); if (compactOldMode_ && msg->serverid() == leaderId_) { rsp->set_serverid(localServer_->serverId); easy_warn_log("Server %d : receive a msg from old version leader, in compact mode we use %llu instead of %llu as server id \n", localServer_->serverId, localServer_->serverId, msg->serverid()); } } else { easy_error_log("Server %d : the server id in the msg(%llu) is not match with local server id for a learner, we change the local server id to %llu!!\n", localServer_->serverId, msg->serverid(), msg->serverid()); localServer_->serverId= msg->serverid(); } } easy_warn_log("Server %d : msgId(%llu) onAppendLog start, receive logs from leader(%d), msg.term(%d) lli(%llu)\n", localServer_->serverId, msg->msgid(), msg->leaderid(), msg->term(), lastLogIndex); /* * XXX About msg->lastlogindex * when appendLog is 
success: msg->lastlogindex means the last log index in the msg (prevLogIndex + numEntries) * when appendLog is unsuccess: msg->lastlogindex means the last log index in the follower's local log_ * when appendLog is success but in cached mode: msg->lastlogindex means the last log index in the follower's local log_(not include cache) */ rsp->set_issuccess(false); rsp->set_lastlogindex(lastLogIndex); rsp->set_ignorecheck(false); rsp->set_appliedindex(appliedIndex_.load()); /* in some case we should step down */ if (msg->term() > currentTerm_) { easy_warn_log("Server %d : New Term in onAppendLog !! server %d 's term(%d) is bigger than me(%d).\n", localServer_->serverId, msg->leaderid(), msg->term(), currentTerm_.load()); if (state_.load() != LEADER) { newTerm(msg->term()); newTermFlag= true; } else { rsp->set_term(currentTerm_); return -1; } } else if (msg->term() < currentTerm_) { if (!forceRequestMode_ && leaderId_.load() == 0 && msg->term() >= log_->getLastLogTerm() && state_ == CANDIDATE) { easy_error_log("Server %d : Downgrade term from %llu to %llu when onAppendLog, because there are leaderStickiness leader(%ld) exist!!\n", localServer_->serverId, currentTerm_.load(), msg->term(), msg->serverid()); newTerm(msg->term()); } else if (state_ == LEARNER && (enableLearnerAutoResetMatchIndex_ || msg->term() >= log_->getLastLogTerm())) { easy_error_log("Server %d : Downgrade term from %llu to %llu when onAppendLog, because I am learner!!\n", localServer_->serverId, currentTerm_.load(), msg->term()); newTerm(msg->term()); } else { easy_warn_log("Server %d : msgId(%llu) receive logs from old leader(%ld) current leader(%ld). 
localTerm(%ld),msg.term(%d) \n", localServer_->serverId, msg->msgid(), msg->leaderid(), leaderId_.load(), currentTerm_.load(), msg->term()); rsp->set_term(currentTerm_); return -1; } } else if (state_ != FOLLOWER && state_ != LEARNER) { changeState_(FOLLOWER); } rsp->set_term(currentTerm_); if (leaderId_ == 0) { leaderId_.store(msg->leaderid()); leaderAddr_= ""; option.extraStore->setRemote(""); rsp->set_force(1); } else if (leaderId_ != msg->leaderid()) { /* TODO is this possible? */ easy_warn_log("Server %d : receive logs from different leader. old(%d),new(%d), term(%ld),msg.term(%d) \n", localServer_->serverId, leaderId_.load(), msg->leaderid(), currentTerm_.load(), msg->term()); leaderId_.store(msg->leaderid()); leaderAddr_= ""; option.extraStore->setRemote(""); rsp->set_force(1); } if (msg->has_addr()) { leaderAddr_= msg->addr(); if (msg->has_extra()) option.extraStore->setRemote(msg->extra()); } if (leaderAddr_ == "") rsp->set_force(1); if (state_ != LEARNER) electionTimer_->restart(); if (!msg->has_prevlogterm()) { rsp->set_ignorecheck(true); easy_warn_log("Server %d : msgId(%llu) receive logs without prevlogterm. from server %ld, localTerm(%ld),msg.term(%d) lli:%ld\n", localServer_->serverId, msg->msgid(), msg->leaderid(), currentTerm_.load(), msg->term(), lastLogIndex); return 0; } if (prevLogIndex > lastLogIndex) { uint64_t msgEntrieSize= msg->entries_size(); uint64_t msgLastIndex; uint64_t beginTerm; uint64_t beginIndex; if (msgEntrieSize != 0) { msgLastIndex= prevLogIndex + msg->entries_size(); beginTerm= msg->entries().begin()->term(); beginIndex= msg->entries().begin()->index(); } /* Now we allow hole in the log. 
we put the uncontinue log in the cache */ if (enableLogCache_ && !msg->nocache() && msgEntrieSize != 0 && beginTerm == (msg->entries().end()-1)->term() && beginTerm == currentTerm_) { logRecvCache_.put(beginIndex, msgLastIndex, *(msg->mutable_entries())); logRecvCache_.setCommitIndex(msg->commitindex()); rsp->set_issuccess(true); rsp->set_lastlogindex(log_->getLastLogIndex()); easy_warn_log("Server %d : receive uncontinue log local lastLogIndex(%ld, term:%ld); msgId(%llu) msg prevlogindex(%ld, term:%ld) has %llu entries firstIndex(%llu) lastIndex(%llu); put it in cache.\n", localServer_->serverId, lastLogIndex, currentTerm_.load(), msg->msgid(), msg->prevlogindex(), msg->prevlogterm(), msgEntrieSize, beginIndex, msgLastIndex); } else { /* * This is possible. It happened when the new leader send the first appendlog msg. * We return a hint to let leader know our last log index. */ easy_warn_log("Server %d : msgId(%llu) receive log's prevlogindex(%ld, term:%ld) is bigger than lastLogIndex(%ld, term:%ld) reject.\n", localServer_->serverId, msg->msgid(), msg->prevlogindex(), msg->prevlogterm(), lastLogIndex, currentTerm_.load()); rsp->set_lastlogindex(log_->getLastLogIndex()); /* We clear cache here. */ //logRecvCache_.clear(); return -1; } } else { LogEntry prevLogEntry; int error; if ((error= log_->getEntry(prevLogIndex, prevLogEntry, false)) || (prevLogEntry.term() != msg->prevlogterm() && prevLogEntry.optype() != kMock)) //TODO should check start index here. { if (error) { easy_warn_log("Server %d: fail to get log entry on append log, index %llu", localServer_->serverId, prevLogIndex); } else { /* log is not match, reject it. the leader will send the correct log again. 
*/ easy_warn_log("Server %d : msgId(%llu) msg's prevlogterm(%llu) is not match with local log's prevlogterm(%llu) with index(%llu) reject!", localServer_->serverId, msg->msgid(), msg->prevlogterm(), prevLogEntry.term(), prevLogIndex); } if (state_ == FOLLOWER) { /* * In some rare case, leader thinks it is a learner but this node is still in follower state, * because it has not received the downgrade configure change logEntry. * Just set the role field in response msg and let leader be aware of this situation. */ rsp->set_role(state_); } return 0; } /* else easy_warn_log("Server %d : msgId(%llu) msg's prevlogterm(%llu) is match with local log's prevlogterm(%llu) pass.", localServer_->serverId, msg->msgid(), msg->prevlogterm(), prevLogEntry.term()); */ // checksum test if (checksum_mode_) { for (auto& entry : msg->entries()) { if (log_checksum_test(entry)) { easy_error_log("Server %d: msgId(%llu) log index %llu checksum fail.", localServer_->serverId, msg->msgid(), entry.index()); return -1; } } } rsp->set_issuccess(true); enableLogCache_ = true; //assert(msg->entries_size() <= 1); if (msg->entries_size() > 0) { easy_warn_log("Server %d : msgId(%llu) receive log has %ld entries, plt:%ld, pli:%ld, commitIndex:%ld\n", localServer_->serverId, msg->msgid(), msg->entries_size(), msg->prevlogterm(), msg->prevlogindex(), msg->commitindex()); bool appendDone= false; if (msg->entries_size() != 0 && log_->getLastLogIndex() == prevLogIndex) { /* no need truncate */ uint64_t lli= log_->append(msg->entries()); if (lli == 0) { rsp->set_lastlogindex(log_->getLastLogIndex()); rsp->set_issuccess(false); rsp->set_msgerror(PaxosMsg::APPEND); easy_warn_log("Server %d : msgId(%llu) write log entries from index %llu, size %lu failed\n", localServer_->serverId, msg->msgid(), prevLogIndex + 1, msg->entries_size()); return 0; } appendDone= true; rsp->set_lastlogindex(log_->getLastLogIndex()); assert(lli == (prevLogIndex + msg->entries_size())); assert(lli == log_->getLastLogIndex()); if 
(true)//(state_ != LEARNER) { for (auto& entry : msg->entries()) if (entry.optype() == kConfigureChange) { prepareConfigureChangeEntry_(entry, msg); } } } else { uint64_t msgLastIndex= prevLogIndex + msg->entries_size(); uint64_t lastLogIndex= log_->getLastLogIndex(); if (!newTermFlag && lastLogIndex >= msgLastIndex) { /* In some continues log entries, if the first entry and the last entry have the same term, all these entries have the same term. */ uint64_t beginTerm= msg->entries().begin()->term(); if (beginTerm == (msg->entries().end()-1)->term() && beginTerm == currentTerm_ && prevLogEntry.term() == currentTerm_) { easy_warn_log("Server %d : ignore %ld entries, plt:%ld, pli:%ld, commitIndex:%ld lliInMsg:%llu lli:%llu\n", localServer_->serverId, msg->entries_size(), msg->prevlogterm(), msg->prevlogindex(), msg->commitindex(), msgLastIndex, lastLogIndex); appendDone= true; rsp->set_lastlogindex(msgLastIndex); } } } if (!appendDone) { uint64_t index= prevLogIndex; rsp->set_lastlogindex(prevLogIndex + msg->entries_size()); int dupcnt = 0; for (auto it= msg->entries().begin(); it != msg->entries().end(); ++it) { ++index; const LogEntry &entry= *it; easy_warn_log("Server %d : parse entries index:%ld, entry.term:%ld, entry.index:%ld\n", localServer_->serverId, index, entry.term(), entry.index()); assert(entry.index() == index); if (log_->getLastLogIndex() >= index) { /* need truncate */ LogEntry en; if (!log_->getEntry(index, en, false) && (en.term() == entry.term() || en.optype() == kMock)) { /* The duplicate log entry, that has already received. */ dupcnt++; easy_warn_log("Server %d : duplicate log entry, ignore, entry.term:%ld, entry.index:%ld\n", localServer_->serverId, entry.term(), entry.index()); continue; } // commit index might be bigger than last log index if we set learner auto reset match index on if (enableLearnerAutoResetMatchIndex_ && commitIndex_ >= index && index > 1) commitIndex_ = index - 1; /* Truncate the log start from the index. 
*/ truncateBackward_(index); ++ (stats_.countTruncateBackward); easy_error_log("Server %d : truncate paxos log from(include) %ld in appendLog msg, lli:%ld\n", localServer_->serverId, index, log_->getLastLogIndex()); break; } else break; } int msgEntrieSize= msg->entries_size(); msg->mutable_entries()->DeleteSubrange(0, dupcnt); assert(msg->entries_size() == (msgEntrieSize - dupcnt)); easy_warn_log("Server %d : Duplicate entrys count %d, remaining entries count %d", localServer_->serverId, dupcnt, msg->entries_size()); if (msg->entries_size() > 0) { assert((log_->getLastLogIndex()+1) == msg->entries(0).index()); if (log_->append(msg->entries()) == 0) { rsp->set_lastlogindex(log_->getLastLogIndex()); rsp->set_issuccess(false); rsp->set_msgerror(PaxosMsg::APPEND); easy_warn_log("Server %d : msgId(%llu) write log entries from index %llu, size %lu failed\n", localServer_->serverId, msg->msgid(), msg->entries(0).index(), msg->entries_size()); return 0; } assert(log_->getLastLogIndex() == msg->entries(msg->entries_size() - 1).index()); } // deal with ConfigureChange for (auto& entry : msg->entries()) if (entry.optype() == kConfigureChange) { prepareConfigureChangeEntry_(entry, msg); } } PaxosLogCacheNode *node= logRecvCache_.get(log_->getLastLogIndex() + 1); if (node != NULL) { if (log_->append(node->entries) == 0) { rsp->set_lastlogindex(log_->getLastLogIndex()); rsp->set_issuccess(false); rsp->set_msgerror(PaxosMsg::APPEND); easy_warn_log("Server %d : msgId(%llu) write cached log entries from index %llu, size %lu failed\n", localServer_->serverId, msg->msgid(), log_->getLastLogIndex() + 1, node->entries.size()); delete node; return 0; } rsp->set_lastlogindex(log_->getLastLogIndex()); for (auto& entry : node->entries) if (entry.optype() == kConfigureChange) { prepareConfigureChangeEntry_(entry, msg, true); } easy_warn_log("Server %d : Get log from cache, beginIndex(%llu) endIndex(%llu) term(%llu)\n", localServer_->serverId, node->beginIndex, node->endIndex, 
node->entries.begin()->term()); } delete node; } else ++ (stats_.countOnHeartbeat); //rsp->set_lastlogindex(log_->getLastLogIndex()); /* Update commitIndex. */ if (msg->commitindex() > commitIndex_ && !debugSkipUpdateCommitIndex) { if (ccMgr_.prepared && ccMgr_.preparedIndex <= msg->commitindex() && ccMgr_.preparedIndex > commitIndex_) { //srv_->sendAsyncEvent(&Paxos::applyConfigureChange_, this, ccMgr_.preparedIndex); applyConfigureChangeNoLock_(ccMgr_.preparedIndex); if (ccMgr_.needNotify != 1) ccMgr_.clear(); } easy_warn_log("Server %d : Follower commitIndex change from %ld to %ld\n", localServer_->serverId, commitIndex_, msg->commitindex()); commitIndex_= msg->commitindex(); assert(commitIndex_ <= log_->getLastLogIndex()); /* already hold the lock_ by the caller. */ cond_.notify_all(); /* X-Paxos support learner get log from follower. */ appendLogToLearner(); /* if (srv_->cs) srv_->cs->set(entry.ikey(), entry.value()); */ } } if (tryFillFollowerMeta_(rsp->mutable_cientries())) easy_warn_log("Server %d : msgId(%llu) tryFillFollowerMeta\n", localServer_->serverId); easy_warn_log("Server %d : msgId(%llu) onAppendLog end, is_success %d\n", localServer_->serverId, msg->msgid(), rsp->issuccess()); return 0; } int Paxos::onAppendLogResponce(PaxosMsg *msg) { assert(msg->msgtype() == AppendLogResponce); /* XXX Now we support learner source to another learner */ if (state_ != LEADER && state_ != FOLLOWER && state_ != LEARNER) return -1; /* update extra storage for Followers */ if (msg->has_extra()) option.extraStore->setRemote(msg->extra()); std::lock_guard lg(lock_); if (shutdown_.load()) return -1; auto server= std::dynamic_pointer_cast(config_->getServer(msg->serverid())); auto wserver= std::weak_ptr(server); //easy_warn_log("Server %d : onAppendLogResponce receive a msg msgId(%llu) from server %llu\n", localServer_->serverId, msg->msgid(), msg->serverid()); if (server == nullptr) { easy_warn_log("Server %d : onAppendLogResponce receive a msg msgId(%llu) from server 
%llu which has been deleted already!\n", localServer_->serverId, msg->msgid(), msg->serverid()); return -2; } if(state_ == FOLLOWER && (server->learnerSource != localServer_->serverId || !server->isLearner)) { easy_warn_log("Server %d : onAppendLogResponce receive a msg msgId(%llu) from server %llu learnerSource:%llu who's learnerSource not match or already not a leaner!\n", localServer_->serverId, msg->msgid(), msg->serverid(), server->learnerSource); return -3; } //assert(server->waitForReply == 1); if (msg->msgid() >= server->guardId) server->waitForReply= 0; else easy_warn_log("Server %d : onAppendLogResponce skip reset waitForReply, msgid %llu guardid %llu", localServer_->serverId, msg->msgid(), server->guardId.load()); if (msg->has_force() && msg->force() == 1) server->needAddr= true; if (msg->term() > currentTerm_) { easy_warn_log("Server %d : New Term in onAppendLogResponce msgId(%llu) !! server %d 's term(%d) is bigger than me(%d).\n", localServer_->serverId, msg->msgid(), msg->serverid(), msg->term(), currentTerm_.load()); if (state_.load() != LEADER) { newTerm(msg->term()); } else { if (server->matchIndex.load() != 0) { /* The follower(server) now is a naughty server, do not use pipelinng */ easy_warn_log("Server %d : msgId(%llu) server %d became a naughty server, reset match index from %lu to 0\n", localServer_->serverId, msg->msgid(), msg->serverid(), server->matchIndex.load()); server->resetMatchIndex(0); } } } else if (msg->term() < currentTerm_) { easy_warn_log("Server %d : Receive prev term's AppendLogResponce msgId(%llu) (term:%ld) from server(%ld), currentTerm(%ld) just ignore!!\n", localServer_->serverId, msg->msgid(), msg->term(), msg->serverid(), currentTerm_.load()); } else { assert(msg->term() == currentTerm_); /* Inc epoch for RemoteServer's. we reset RemoteServer's heartbeat in sendMsgFunc. 
*/ server->setLastAckEpoch(currentEpoch_); if (server->appliedIndex < msg->appliedindex()) server->appliedIndex= msg->appliedindex(); /* * XXX About msg->lastlogindex * when appendLog is success: msg->lastlogindex means the last log index in the msg (prevLogIndex + numEntries) * when appendLog is unsuccess: msg->lastlogindex means the last log index in the follower's local log_ * when appendLog is success but in cached mode: msg->lastlogindex means the last log index in the follower's local log_(not include cache) */ if (msg->issuccess()) { if (server->nextIndex != msg->lastlogindex() + 1 || server->matchIndex != 0) { uint64_t oldMatchIndex= server->matchIndex; uint64_t oldNextIndex= server->nextIndex; if (msg->lastlogindex() < server->nextIndex && server->matchIndex == 0) { easy_warn_log("Server %d : onAppendLogResponce this response of AppendLog to server %d may be a resend msg that we have already received, msg index(%ld) is smaller than nextIndex(%ld)\n", localServer_->serverId, msg->serverid(), msg->lastlogindex(), server->nextIndex.load()); } else { server->hasMatched= true; if (msg->lastlogindex() > server->matchIndex) { server->matchIndex= msg->lastlogindex(); /* trigger auto change to follower, use maxDelayIndex4NewMember_/2 to have more chance to success */ if (ccMgr_.autoChangeAddr == server->strAddr && state_ == LEADER && server->isLearner && (log_->getLastLogIndex() <= (maxDelayIndex4NewMember_ / 2 + server->matchIndex.load()))) { srv_->sendAsyncEvent(&Paxos::autoChangeLearnerAction, this); } } if (server->nextIndex < server->matchIndex + 1) server->nextIndex= server->matchIndex + 1; /* * try to update commitIndex here, * only if matchIndex is greater than commitIndex */ if (server->matchIndex > commitIndex_) tryUpdateCommitIndex_(); } easy_warn_log("Server %d : msgId(%llu) AppendLog to server %d success, matchIndex(old:%llu,new:%llu) and nextIndex(old:%llu,new:%llu) have changed\n", localServer_->serverId, msg->msgid(), msg->serverid(), oldMatchIndex, 
server->matchIndex.load(), oldNextIndex, server->nextIndex.load()); } else if (server->matchIndex == 0) { // previously this server might be considered a naughty server, and its match index is set to 0, // now we set it right, so when no log is replicated, match index will still be right server->matchIndex= msg->lastlogindex(); easy_warn_log("Server %d : msgId(%llu) AppendLog to server %d success, this is a heartbeat responce, set match index from 0 to %llu. nextIndex(%llu) msg(lli:%llu term:%llu)\n", localServer_->serverId, msg->msgid(), msg->serverid(), msg->lastlogindex(), server->nextIndex.load(), msg->lastlogindex(), msg->term()); } else { /* * We receive a heartbeat responce, before we commit any logEntry in this term. * There must be some bug, or misorder msg. */ easy_warn_log("Server %d : msgId(%llu) AppendLog to server %d success, skip because this is a heartbeat responce. nextIndex(%llu) msg(lli:%llu term:%llu)\n", localServer_->serverId, msg->msgid(), msg->serverid(), server->nextIndex.load(), msg->lastlogindex(), msg->term()); } /* Update meta for learner source */ if (msg->cientries_size() > 0) { config_->mergeFollowerMeta(msg->cientries()); easy_warn_log("Server %d : msgId(%llu) mergeFollowerMeta from server %d\n", localServer_->serverId, msg->msgid(), msg->serverid()); } // we should deal with matchIndex == 0 case. /* * Case when the follower is not uptodate, * There is no need to wait for next heart beat to send the next log entry. 
*/ //if (server->nextIndex <= log_->getLastLogIndex()) // appendLogToServer(server.get(), false); //srv_->sendAsyncEvent(&Paxos::appendLogToServer, this, (RemoteServer *)server.get(), true, false); appendLogToServer(std::move(wserver), false, false); if (getState() == Paxos::FOLLOWER || getState() == Paxos::LEARNER) { updateFollowerMetaNo(); easy_warn_log("Server %d : updateFollowerMetaNo\n", localServer_->serverId); } } else if (msg->has_ignorecheck() && msg->ignorecheck()) { easy_warn_log("Server %d : msgId(%llu) AppendLog to server %d without check\n", localServer_->serverId, msg->msgid(), msg->serverid()); if (server->isLearner) appendLogToServer(std::move(wserver), false); } else { uint64_t oldNextIndex= server->nextIndex; /* We also need to reset matchindex if leader thinks it is a learner but the node is still in follower state */ bool learnerStateNotMatch = server->isLearner && msg->has_role() && msg->role() == FOLLOWER; if (learnerStateNotMatch) server->resetMatchIndex(0); // to trigger decrement nextIndex case if (!server->isLearner || learnerStateNotMatch) { uint64_t term= 0, optype= 0, info= 0; if (consensusAsync_.load() || enableAutoResetMatchIndex_ || (!log_->getLogMeta(server->matchIndex.load() + 1, &term, &optype, &info) && (info & (1 << 5 | 1 << 6)))) { if (!server->hasMatched && msg->lastlogindex() < server->matchIndex) { /* log is lost, it might happen due to follower does not sync the log after they receive it * and there has been a crash recovery * 1. sync log is not set * 2. FLAG_BLOB | FLAG_BLOB_END, haven't got the chance to flush **/ easy_error_log("Server %d : follower(%d) might lost some logs. matchIndex(%llu) is greater than follower's lli(%llu), we reset matchIndex to 0!", localServer_->serverId, msg->serverid(), server->matchIndex.load(), msg->lastlogindex()); server->resetMatchIndex(0); } } if (server->matchIndex > 0) { server->nextIndex= server->matchIndex + 1; } else { /* send correct log for this RemoteServer. 
*/ if (server->nextIndex > 1) { -- (server->nextIndex); } /* XXX if the follower lost many logs, decrement nextIndex once. */ //if (server->nextIndex > msg->prevlogindex() + 1) if (server->nextIndex > msg->lastlogindex() + 1) { server->nextIndex= msg->lastlogindex() + 1; } } if (!msg->has_msgerror() || msg->msgerror() != PaxosMsg::APPEND) appendLogToServer(std::move(wserver), false); easy_warn_log("Server %d : msgId(%llu) AppendLog to server %d failed, msg error %d, " "lastlogindex(%ld) is not match with the local nextIndex(%ld), " "set local nextIndex to %ld, matchIndex(%llu).\n", localServer_->serverId, msg->msgid(), msg->serverid(), msg->has_msgerror()? msg->msgerror(): 0, msg->lastlogindex(), oldNextIndex, server->nextIndex.load(), server->matchIndex.load()); } else { if (enableLearnerAutoResetMatchIndex_) { server->resetMatchIndex(0); // decrease next index until log match, treat learner like follower if (server->nextIndex > 1) { --server->nextIndex; } if (server->nextIndex > msg->lastlogindex() + 1) { server->nextIndex= msg->lastlogindex() + 1; } easy_warn_log("Server %d : Learner(%d) match index reset to 0, nextIndex set to %llu.", localServer_->serverId, msg->serverid(), server->nextIndex.load()); appendLogToLearner(wserver.lock()); } else { uint64_t oldNextIndex= server->nextIndex; uint64_t oldMatchIndex= server->matchIndex; server->resetMatchIndex(msg->lastlogindex()); server->nextIndex= msg->lastlogindex() + 1; if (oldNextIndex != server->nextIndex || oldMatchIndex != server->matchIndex) { easy_warn_log("Server %d : Learner(%d) change its local log position! We reset server(learner)'s matchIndex(old:%llu,new:%llu) and nextIndex(old:%llu,new:%llu).", localServer_->serverId, msg->serverid(), oldMatchIndex, server->matchIndex.load(), oldNextIndex, server->nextIndex.load()); appendLogToLearner(wserver.lock()); } else { easy_warn_log("Server %d : Learner(%d) change its local log position or term, which is not correct !! 
current matchIndex(%llu) nextIndex(%llu).", localServer_->serverId, msg->serverid(), server->matchIndex.load(), server->nextIndex.load()); } } } /* Resend the correct log entry. */ /* TODO Need async */ } } /* send correct log if needed */ return 0; } int Paxos::onAppendLogSendFail(PaxosMsg *msg, uint64_t *newId) { /* No need to resend the msg if this server is not leader or the prev term msg. */ if (state_ != LEADER || msg->term() != currentTerm_) return -1; if (msg->msgtype() != AppendLog) return -2; /* No need to resend the heartbeat msg. */ if (msg->entries_size() == 0 && msg->has_compressedentries() == false) return -3; lock_.lock(); auto server= std::dynamic_pointer_cast(config_->getServer(msg->serverid())); auto wserver= std::weak_ptr(server); if (server == nullptr) { easy_warn_log("Server %d : onAppendLogSendFail try resend msgId(%llu) to server %llu which has been deleted already!\n", localServer_->serverId, msg->msgid(), msg->serverid()); lock_.unlock(); return -4; } if (server->lostConnect.load() || server->disablePipelining.load()) { lock_.unlock(); return -5; } lock_.unlock(); if (newId) *newId= server->msgId.fetch_add(1); /* TODO reset timer for the correspond RemoteServer. */ return 0; } int Paxos::onLeaderCommand(PaxosMsg *msg, PaxosMsg *rsp) { ++ (stats_.countOnLeaderCommand); lock_.lock(); if (shutdown_.load()) return -1; rsp->set_msgid(msg->msgid()); rsp->set_msgtype(LeaderCommandResponce); rsp->set_serverid(localServer_->serverId); rsp->set_term(currentTerm_); if (msg->lctype() == LeaderTransfer) { /* Update commitIndex. 
*/ if (msg->commitindex() > commitIndex_) { if (ccMgr_.prepared && ccMgr_.preparedIndex <= msg->commitindex() && ccMgr_.preparedIndex > commitIndex_) { applyConfigureChangeNoLock_(ccMgr_.preparedIndex); if (ccMgr_.needNotify != 1) ccMgr_.clear(); } easy_warn_log("Server %d : Follower commitIndex change from %ld to %ld during onLeaderCommand\n", localServer_->serverId, commitIndex_, msg->commitindex()); commitIndex_= msg->commitindex(); assert(commitIndex_ <= log_->getLastLogIndex()); /* notify waitCommitIndexUpdate */ cond_.notify_all(); } if (msg->lastlogindex() == log_->getLastLogIndex() && msg->lastlogindex() == commitIndex_) { rsp->set_issuccess(true); lock_.unlock(); requestVote(); } else { rsp->set_issuccess(false); lock_.unlock(); } } else if (msg->lctype() == PurgeLog) { /* check for purge log */ easy_warn_log("Server %d : prepare to purge log, minMatchIndex %ld \n", localServer_->serverId, msg->minmatchindex()); purgeLogQueue_.push(new purgeLogArgType(msg->minmatchindex(), this)); srv_->sendAsyncEvent(&SingleProcessQueue::process, &purgeLogQueue_, Paxos::doPurgeLog); rsp->set_issuccess(true); lock_.unlock(); } easy_warn_log("Server %d : msgId(%llu) receive leaderCommand from server(%ld), currentTerm(%ld), lli(%ld), issuccess(%d)\n", localServer_->serverId, msg->msgid(), msg->serverid(), currentTerm_.load(), log_->getLastLogIndex(), rsp->issuccess()); return 0; } int Paxos::leaderCommand(LcTypeT type, std::shared_ptr server) { /* We call from internal now. 
*/ //std::lock_guard lg(lock_); /* just ensure only leader do leaderCommand */ if (state_ != LEADER) { return -1; } PaxosMsg msg; msg.set_term(currentTerm_); msg.set_msgtype(LeaderCommand); msg.set_serverid(localServer_->serverId); msg.set_lctype(type); ++ (stats_.countLeaderCommand); if (type == LeaderTransfer) { easy_error_log("Server %d : leaderCommand(LeaderTransfer) to server(%ld), currentTerm(%ld), lli(%llu)\n", localServer_->serverId, server->serverId, currentTerm_.load(), log_->getLastLogIndex()); assert(commitIndex_ == log_->getLastLogIndex() && commitIndex_ == server->matchIndex); msg.set_lastlogindex(log_->getLastLogIndex()); msg.set_commitindex(commitIndex_); } else if (type == PurgeLog) { easy_warn_log("Server %d : leaderCommand(PurgeLog) to all followers\n", localServer_->serverId); /* broadcast minMatchIndex for purging log */ msg.set_minmatchindex(minMatchIndex_); } if (server != nullptr) server->sendMsg((void *)&msg); else config_->forEach(&Server::sendMsg, (void *)&msg); if (debugWitnessTest) config_->forEachLearners(&Server::sendMsg, (void *)&msg); return 0; } int Paxos::onLeaderCommandResponce(PaxosMsg *msg) { easy_warn_log("Server %d : msgId(%llu) receive leaderCommandResponce from server(%ld), currentTerm(%ld), lli(%llu)\n", localServer_->serverId, msg->msgid(), msg->serverid(), currentTerm_.load(), log_->getLastLogIndex()); return 0; } int Paxos::forceSingleLeader() { std::lock_guard lg(lock_); if (state_.load() == LEARNER) { easy_error_log("Server %d : Execute forceSingleLeader for this learner!!", localServer_->serverId); localServer_->serverId= 1; changeState_(FOLLOWER); config_->delAllLearners(); } else { easy_error_log("Server %d : Execute forceSingleLeader for this server!!", localServer_->serverId); config_->delAllRemoteServer(localServer_->strAddr, this); } log_->setMetaData(Paxos::keyLearnerConfigure, config_->learnersToString()); log_->setMetaData(Paxos::keyMemberConfigure, config_->membersToString(localServer_->strAddr)); 
srv_->sendAsyncEvent(&Paxos::requestVote, this, true); return 0; } int Paxos::forceSingleLearner() { std::lock_guard lg(lock_); if (state_.load() == LEARNER) { easy_error_log("Server %d : Execute forceSingleLearner failed because this is already a learner!!", localServer_->serverId); return 1; } else { easy_error_log("Server %d : Execute forceSingleLearner for this server!!", localServer_->serverId); config_->delAllLearners(); config_->delAllRemoteServer(localServer_->strAddr, this); } log_->setMetaData(Paxos::keyLearnerConfigure, config_->membersToString()); log_->setMetaData(Paxos::keyMemberConfigure, ""); changeState_(LEARNER); electionTimer_->stop(); localServer_->serverId += 100; return 0; } int Paxos::forcePromote() { /* send requestVote request immediately to try to become a leader */ std::lock_guard lg(lock_); srv_->sendAsyncEvent(&Paxos::requestVote, this, true); return 0; } uint64_t Paxos::waitCommitIndexUpdate(uint64_t baseIndex, uint64_t term) { std::unique_lock ul(lock_); if (term != 0 && currentTerm_ != term) return 0; /* TODO maybe we can signal cond_ only when it is need by a min heap of baseIndex. */ while (commitIndex_ <= baseIndex && (term == 0 || currentTerm_ == term) && !shutdown_.load() && (state_ != LEADER || !consensusAsync_.load() || localServer_->lastSyncedIndex.load() <= baseIndex)) cond_.wait(ul); if (term != 0 && currentTerm_ != term) return 0; return (state_ == LEADER && consensusAsync_.load()) ? 
localServer_->lastSyncedIndex.load() : commitIndex_; } uint64_t Paxos::checkCommitIndex(uint64_t baseIndex, uint64_t term) { uint64_t ret = 0; /* should call the blocking interface (waitCommitIndexUpdate) if term is 0 */ if (term == 0) return 0; /* double check term to make sure we have a valid commitIndex */ if (currentTerm_ != term) return 0; ret = getCommitIndex(); /* double check term & check shutdown */ if (currentTerm_ != term || shutdown_.load()) return 0; return ret; } void Paxos::newTerm(uint64_t newTerm) { if (state_ == LEADER) { leaderStepDowning_.store(true); easy_error_log("Server %d : new term(old:%ld,new:%ld), This is a Leader Step Down!!\n", localServer_->serverId, currentTerm_.load(), newTerm); log_->setMetaData(keyLastLeaderTerm, currentTerm_); log_->setMetaData(keyLastLeaderLogIndex, commitIndex_); if (ccMgr_.autoChangeAddr != "") { ccMgr_.autoChangeAddr = ""; ccMgr_.autoChangeRet = -1; ccMgr_.condChangeDone.notify_all(); } } else easy_error_log("Server %d : new term(old:%ld,new:%ld) !!\n", localServer_->serverId, currentTerm_.load(), newTerm); currentTerm_.store(newTerm); log_->setTerm(currentTerm_); log_->setMetaData(keyCurrentTerm, currentTerm_); leaderId_.store(0); leaderAddr_= std::string(""); option.extraStore->setRemote(""); votedFor_= 0; log_->setMetaData(keyVoteFor, votedFor_); if (state_ != LEARNER) { changeState_(FOLLOWER); //electionTimer_->restart(electionTimeout_, true); } else { changeState_(LEARNER); } leaderStepDowning_.store(false); logRecvCache_.clear(); /* TODO only step down when we are Leader */ config_->forEach(&Server::stepDown, NULL); config_->forEachLearners(&Server::stepDown, NULL); config_->forEachLearners(&Server::disconnect, NULL); epochTimer_->stop(); } uint64_t Paxos::appendLogFillForEachAsync(PaxosMsg *msg, RemoteServer *server, LogFillModeT mode) { std::lock_guard lg(lock_); if(currentTerm_ != msg->term()) { easy_warn_log("Server %d : skip sendMsg async, because term has already changed target(%llu), 
now(%llu)\n", localServer_->serverId, msg->term(), currentTerm_.load());
    return 0;
  }
  return appendLogFillForEach(msg, server, mode);
}

/*
 * Fill `msg` with the log entries that should be sent to `server` next.
 *
 * Sets prevlogindex/prevlogterm from server->nextIndex - 1, clears entries and
 * compressed entries left over from a previous use of the same msg, then adds
 * entries starting at server->nextIndex until a size limit, a read failure, a
 * mock entry or the last sendable index stops the loop.
 *
 * @param msg     msg to fill (reused between servers).
 * @param server  target server; its needAddr flag is consumed here.
 * @param mode    LargeBatchMode multiplies the packet limit by largeBatchRatio_,
 *                EmptyMode adds at most one entry.
 * @return the accumulated ByteSize() of the entries added; 0 if nothing was added.
 */
uint64_t Paxos::appendLogFillForEach(PaxosMsg *msg, RemoteServer *server, LogFillModeT mode)
{
  /* There is no need to lock in this function. */
  uint64_t prevLogTerm;
  uint64_t nextIndex= server->nextIndex;
  uint64_t prevLogIndex= nextIndex - 1;
  /* When replicating with cached log, the cached last index is the upper bound. */
  uint64_t lastLogIndex= replicateWithCacheLog_.load() ? log_->getLastCachedLogIndex() : log_->getLastLogIndex();
  uint64_t size= 0;

  if (cdrMgr_.inRecovery)
  {
    easy_warn_log("Server %d : fill nothing to msg during commit dependency recovery.\n", localServer_->serverId);
    return size; /* size is 0 */
  }

  if(prevLogIndex > lastLogIndex)
  {
    easy_warn_log("Server %d : server %d 's prevLogIndex %ld larger than lastLogIndex %ld. Just ignore.\n", localServer_->serverId, server->serverId, prevLogIndex, lastLogIndex);
    return size; /* size is 0 */
  }
  assert(prevLogIndex <= lastLogIndex);

  /* Look up the term of the entry preceding the batch; index 0 has term 0. */
  if (prevLogIndex > 0)
  {
    LogEntry entry;
    if (0 != log_->getEntry(prevLogIndex, entry, true, server->serverId))
    {
      easy_warn_log("Server %d :getEntry fail for prevLogIndex(%ld) in Fill AppendLog to server %d\n", localServer_->serverId, prevLogIndex, msg->serverid());
      return size;
    }
    prevLogTerm= entry.term();
  }
  else
    prevLogTerm= 0;

  /* Attach our address and local extra info once, when the server asked for it. */
  if (server->needAddr)
  {
    msg->set_addr(localServer_->strAddr);
    msg->set_extra(option.extraStore->getLocal());
    server->needAddr= false;
  }

  msg->set_prevlogindex(prevLogIndex);
  msg->set_prevlogterm(prevLogTerm);
  /* nocache stays true unless nextIndex is advanced optimistically later in this function. */
  msg->set_nocache(true);

  /* We reuse msg here, so there may some entries exist. (need to send to other follower) */
  if (msg->entries_size() != 0)
    msg->mutable_entries()->Clear();
  /* We reuse msg here, since there may exists a compression.
(need to send to other follower) */
  if (msg->has_compressedentries())
    msg->clear_compressedentries();

  /* try to use appliedIndex instead of commitIndex for learner */
  uint64_t lastSendLogIndex= lastLogIndex;
  if (server->isLearner)
  {
    /* A learner is sent entries only up to commitIndex_ or appliedIndex_. */
    if (!server->sendByAppliedIndex)
      lastSendLogIndex = commitIndex_;
    else
      lastSendLogIndex = appliedIndex_.load();
  }
  /* For debug */
  if (debugMaxSendLogIndex != 0)
  {
    lastSendLogIndex= (lastSendLogIndex > debugMaxSendLogIndex) ? debugMaxSendLogIndex : lastSendLogIndex;
  }

  uint64_t maxPacketSize= maxPacketSize_;
  if (mode == LargeBatchMode)
    maxPacketSize *= largeBatchRatio_;

  if (lastSendLogIndex >= nextIndex)
  {
    LogEntry entry;
    ::google::protobuf::RepeatedPtrField* entries;
    entries= msg->mutable_entries();
    uint64_t lastIndex= 0;  /* index of the last entry added; 0 means none */
    uint64_t lastInfo= 0;   /* info value of the previous entry, 0 if it had none */
    for (uint64_t i= nextIndex; i <= lastSendLogIndex; ++i)
    {
      if (0 != log_->getEntry(i, entry, true, server->serverId))
      {
        easy_warn_log("Server %d :getEntry fail for entries(i:%ld) in Fill AppendLog to server %d\n", localServer_->serverId, i, msg->serverid());
        break;
      }
      log_->putLogMeta(entry.index(), entry.term(), entry.optype(), entry.info());
      assert(entry.index() == i);
      /* A mock entry is never sent; stop the batch here. */
      if (entry.optype() == kMock)
      {
        easy_error_log("Server %d : read mock log(index:%llu) when send to server %d, the configure of mock index may error or may hit bug!!", localServer_->serverId, i, server->serverId);
        break;
      }
      /*
       * Restriction from PolarDB-X Engine, possible info values:
       * 1. FLAG_GU1 = 0x01, needGroup
       * 2. FLAG_GU2 = 0x02, needGroup
       * 3. FLAG_LARGE_TRX = 0x04, do not care
       * 4.
FLAG_LARGE_TRX_END = 0x08, do not care
       */
      /* Consecutive entries with the same info value 1 or 2 are kept in one packet. */
      bool needGroup= false;
      if ((lastInfo == 1 || lastInfo == 2) && lastInfo == entry.info())
        needGroup= true;
      if (entry.has_info())
        lastInfo= entry.info();
      else
        lastInfo= 0;
      auto entrySize= entry.ByteSize();
      /* Soft limit: never applies to the first entry or to grouped entries. */
      if (size + entrySize >= maxPacketSize && size != 0 && !needGroup)
        break;
      /* Hard limit: only a single oversized entry may go beyond it. */
      if (size + entrySize >= maxSystemPacketSize_)
      {
        if (size != 0)
        {
          easy_warn_log("Server %d : truncate the sending msg, because it may exceed system max packet size (current size:%llu, add size:%llu)", localServer_->serverId, size, entrySize);
          break;
        }
        else
        {
          easy_warn_log("Server %d : force send a msg, it may exceed system max packet size (current size:%llu, add size:%llu)", localServer_->serverId, size, entrySize);
        }
      }
      *(entries->Add())= entry;
      lastIndex= i;
      size += entrySize;
      /* packet size may exceed maxPacketSize a little bit. */
      if (mode == EmptyMode)
      {
        /* XXX in EmptyMode we send 1 entry */
        break;
      }
    }
    /*
     * We enable pipelining in two cases:
     * 1. in a new term, a server has not matched once.
     * 2. the server is learner (because the learner may change its local log pos in one term.)
*/
    /* Advance nextIndex right away (and allow caching on the receiver) only in NormalMode, for an already matched server, when something was added. */
    if ((!server->isLearner || enableLearnerPipelining_) && server->matchIndex != 0 && mode == NormalMode && lastIndex != 0)
    {
      server->nextIndex= lastIndex + 1;
      msg->set_nocache(false);
      easy_warn_log("Server %d : update server %d 's nextIndex(old:%llu,new:%llu)\n", localServer_->serverId, server->serverId, nextIndex, server->nextIndex.load());
    }
  }

  /* Never advertise a commit index beyond the last entry carried by this msg. */
  msg->set_commitindex(std::min(msg->commitindex(), prevLogIndex + msg->entries_size()));

  return size;
}

/* Locking wrapper around tryUpdateCommitIndex_(); returns its result. */
int Paxos::tryUpdateCommitIndex()
{
  std::lock_guard lg(lock_);
  int ret= tryUpdateCommitIndex_();
  //if (ret == 0)
    //appendLog(false);
  return ret;
}

/*
 * Try to advance commitIndex_ on the leader. The caller must hold lock_.
 *
 * The candidate index is the quorum minimum of the servers' match indexes,
 * lowered to the force minimum while leaderForceSyncStatus_ is set.
 *
 * @return 0 if commitIndex_ was advanced, -1 if nothing changed (not leader,
 *         shutting down, no progress, entry not readable, entry of another
 *         term, or blocked by the commit dependency check).
 */
int Paxos::tryUpdateCommitIndex_()
{
  if (state_ != LEADER)
    return -1;
  if (shutdown_.load())
    return -1;

  uint64_t newCommitIndex= config_->quorumMin(&Server::getMatchIndex);
  uint64_t forceCommitIndex= config_->forceMin(&Server::getMatchIndex);

  if (forceCommitIndex < newCommitIndex && leaderForceSyncStatus_.load())
    newCommitIndex= forceCommitIndex;

  if (commitIndex_ >= newCommitIndex)
    return -1;

  // in case leader does not write log to disk, unlikely to happen
  if (newCommitIndex > log_->getLastLogIndex())
    return -1;

  // if async mode, skip log check
  if (!consensusAsync_)
  {
    uint64_t term = 0, optype = 0, info = 0;
    /* A non-zero result from getLogMeta is treated as failure. */
    if (log_->getLogMeta(newCommitIndex, &term, &optype, &info))
      return -1;
    /* XXX leader don't commit for other term.
*/
    if (term != currentTerm_)
      return -1;
    /* commit dependency case */
    if (optype == kCommitDep)
    {
      easy_warn_log("Server %d : index %ld is kCommitDep, check lastNonCommitDepIndex %llu.\n", localServer_->serverId, newCommitIndex, cdrMgr_.lastNonCommitDepIndex.load());
      /* Fall back to lastNonCommitDepIndex unless it lies beyond the candidate. */
      if (cdrMgr_.lastNonCommitDepIndex > newCommitIndex)
        return -1;
      else
        newCommitIndex = cdrMgr_.lastNonCommitDepIndex;
    }
  }

  /* Re-check: the candidate may have been lowered above. */
  if (commitIndex_ >= newCommitIndex)
    return -1;

  /* Apply a prepared configure change as soon as its entry becomes committed. */
  if (ccMgr_.prepared && ccMgr_.preparedIndex <= newCommitIndex && ccMgr_.preparedIndex > commitIndex_)
  {
    applyConfigureChangeNoLock_(ccMgr_.preparedIndex);
    /*
     * Case: we prepare a change when we're follower, and we apply when we're leader.
     * In this case, we should clear ccMgr info.
     */
    if (ccMgr_.needNotify != 1)
      ccMgr_.clear();
  }

  easy_warn_log("Server %d : Leader commitIndex change from %ld to %ld\n", localServer_->serverId, commitIndex_, newCommitIndex);
  commitIndex_= newCommitIndex;

  /* already hold the lock_ by the caller. */
  cond_.notify_all();

  appendLogToLearner();

  return 0;
}

/* TODO should read from config file or cmd line */
int Paxos::init(const std::vector& strConfig/*start 0*/, uint64_t current/*start 1*/, ClientService *cs, uint64_t ioThreadCnt, uint64_t workThreadCnt, std::shared_ptr localServer, bool memory_usage_count, uint64_t heartbeatThreadCnt)
{
  // set new seed for auto leader transfer
  srand(time(0));
  /* Set when the configuration comes from the arguments and must be persisted at the end. */
  bool needSetMeta= false;
  /* Init persistent variables */
  /* getMetaData results are used only when the call returns 0. */
  uint64_t itmp;
  if (! log_->getMetaData(std::string(keyClusterId), &itmp))
  {
    clusterId_.store(itmp);
  }
  if (! log_->getMetaData(std::string(keyCurrentTerm), &itmp))
  {
    currentTerm_= itmp;
    log_->setTerm(currentTerm_);
  }
  if (! 
log_->getMetaData(std::string(keyVoteFor), &itmp)) votedFor_= itmp; /* Init members and learners */ std::string config; log_->getMetaData(std::string(keyMemberConfigure), config); uint64_t metaCurrent= 0; std::vector strMembers= StableConfiguration::stringToVector(config, metaCurrent); //TODO check strConfig and strMembers is equal const std::vector *pConfig= NULL; uint64_t index; if (strConfig.size() == 0) { if (metaCurrent == 0) { easy_error_log("Paxos::init: Can't find metaCurrent in MemberConfigure when init a follower node, there may have some error in meta, or this may be a learner!!"); assert(0); return -1; } pConfig= &strMembers; index= metaCurrent; } else { pConfig= &strConfig; index= current; /* We init from the arg(not the meta), so we should set the meta after init the configure! */ needSetMeta= true; } config.clear(); log_->getMetaData(std::string(keyLearnerConfigure), config); std::vector strLearners= StableConfiguration::stringToVector(config, metaCurrent); /* Search logs and init ccMgr if any configurechange has not been applied. */ uint64_t startScanIndex= 0; uint64_t lastLogIndex= log_->getLastLogIndex(); log_->getMetaData(keyScanIndex, &startScanIndex); if (startScanIndex != 0) { if (startScanIndex > lastLogIndex) { /* We have not write the configure change logentry into the log, skip the scan. 
*/ log_->setMetaData(keyScanIndex, 0); } else { easy_error_log("Server %d : Start scan log on startup from %llu to %llu for uncommit configure change log entries.\n", index, startScanIndex, lastLogIndex); for (uint64_t i= startScanIndex; i <= lastLogIndex; ++i) { LogEntry entry; if (log_->getEntry(i, entry, false)) { easy_error_log("Fail to get log on startup, index %llu", i); exit(1); } if (entry.optype() == kConfigureChange) { if (ccMgr_.prepared == 0) { ccMgr_.prepared= 1; ccMgr_.preparedIndex= entry.index(); } else { easy_error_log("Server %d : Scan log on startup find more than 1 uncommit configure change entries!!\n", index); } } } } /* Check if the start scan index have already been clean up or hit some bug. */ log_->getMetaData(keyScanIndex, &startScanIndex); if (startScanIndex != 0) { easy_error_log("Server %d : startScanIndex(%llu) does not been clean up after scan log, may hit a bug!! We clean it now!!\n", index); log_->setMetaData(keyScanIndex, 0); } } log_->initMetaCache(); /* Init Service */ srv_= std::shared_ptr(new Service(this)); if (cs) srv_->cs= cs; srv_->init(ioThreadCnt, workThreadCnt, heartbeatTimeout_, memory_usage_count, heartbeatThreadCnt); std::string curConfig= (*pConfig)[index - 1]; auto pos = curConfig.find(":"); host_ = curConfig.substr(0, pos); port_ = std::stoull(curConfig.substr(pos + 1)); int error= 0; if ((error= srv_->start(port_))) { easy_error_log("Fail to start libeasy service, error(%d).", error); abort(); } electionTimer_= std::make_shared(srv_->getThreadTimerService(), srv_, electionTimeout_, ThreadTimer::Stage, &Paxos::startElectionCallback, this); electionTimer_->start(); epochTimer_= std::make_shared(srv_->getThreadTimerService(), srv_, electionTimeout_, ThreadTimer::Repeatable, &Paxos::epochTimerCallback, this); purgeLogTimer_ = std::make_shared(srv_->getThreadTimerService(), srv_, purgeLogTimeout_, ThreadTimer::Repeatable, &Paxos::purgeLogCallback, this); /* * TODO: we should 1. start libeasy 2. installConfig 3. 
requestVote */ /* Init Configuration */ std::dynamic_pointer_cast(config_)->installConfig((*pConfig), index, this, localServer); config_->forEach(&Server::connect, (void *)NULL); config_->addLearners(strLearners, this, true); if (needSetMeta) { log_->setMetaData(Paxos::keyLearnerConfigure, config_->learnersToString()); log_->setMetaData(Paxos::keyMemberConfigure, config_->membersToString(localServer_->strAddr)); } return 0; } int Paxos::initAsLearner(std::string& strConfig, ClientService *cs, uint64_t ioThreadCnt, uint64_t workThreadCnt, std::shared_ptr localServer, bool memory_usage_count, uint64_t heartbeatThreadCnt) { // set new seed for auto leader transfer srand(time(0)); bool needSetMeta= false; state_= LEARNER; easy_warn_log("Start init node as a learner."); /* Init persistent variables */ uint64_t itmp; if (! log_->getMetaData(std::string(keyClusterId), &itmp)) { clusterId_.store(itmp); } if (! log_->getMetaData(std::string(keyCurrentTerm), &itmp)) { currentTerm_= itmp; log_->setTerm(currentTerm_); } if (! 
log_->getMetaData(std::string(keyVoteFor), &itmp))
    votedFor_= itmp;

  std::vector strLearners;
  /* Address of this learner; parsed as "host:port" below. */
  std::string strMember;
  std::string config;
  if (strConfig.size() == 0)
  {
    /* No address passed in: restore everything from the persisted meta. */
    /* Init members and learners */
    /**
     * Old learner node format:
     *    keyMemberConfigure:  ""
     *    keyLearnerConfigure: ""
     * New learner node format:
     *    keyMemberConfigure:  ""
     *    keyLearnerConfigure: "all learners in the cluster"
     **/
    log_->getMetaData(std::string(keyMemberConfigure), config);
    if (config.size() > 0)
    {
      /* new format */
      uint64_t metaCurrent;  /* passed to stringToVector; not read afterwards */
      strMember= config;
      config.clear();
      log_->getMetaData(std::string(keyLearnerConfigure), config);
      strLearners= StableConfiguration::stringToVector(config, metaCurrent);
    }
    else
    {
      /* old format */
      log_->getMetaData(std::string(keyLearnerConfigure), strMember);
      /* Rewrite the meta at the end of initAsLearner. */
      needSetMeta= true;
    }
  }
  else
  {
    strMember= strConfig;
    needSetMeta= true;
  }
  log_->initMetaCache();

  /* Init Service */
  srv_= std::shared_ptr(new Service(this));
  if (cs)
    srv_->cs= cs;
  srv_->init(ioThreadCnt, workThreadCnt, heartbeatTimeout_, memory_usage_count, heartbeatThreadCnt);

  /* The timers are only created here; none of them is started in this part. */
  electionTimer_= std::make_shared(srv_->getThreadTimerService(), srv_, electionTimeout_, ThreadTimer::Stage, &Paxos::startElectionCallback, this);
  epochTimer_= std::make_shared(srv_->getThreadTimerService(), srv_, electionTimeout_, ThreadTimer::Repeatable, &Paxos::epochTimerCallback, this);
  purgeLogTimer_ = std::make_shared(srv_->getThreadTimerService(), srv_, purgeLogTimeout_, ThreadTimer::Repeatable, &Paxos::purgeLogCallback, this);

  /* Split "host:port"; the port is the one the service listens on. */
  const std::string& curConfig= strMember;
  auto pos = curConfig.find(":");
  host_ = curConfig.substr(0, pos);
  port_ = std::stoull(curConfig.substr(pos + 1));
  int error= 0;
  if ((error= srv_->start(port_)))
  {
    easy_error_log("Fail to start libeasy service, error(%d).", error);
    abort();
  }

  /*
   * TODO: we should 1. start libeasy 2. installConfig 3.
requestVote */ /* Init Configuration */ std::vector tmpConfig; tmpConfig.push_back(strMember); std::dynamic_pointer_cast(config_)->installConfig(tmpConfig, 1, this, localServer); localServer_->serverId += 100; /* Learner has all other learner's info now. */ //config_->forEachLearners(&Server::connect, (void *)NULL); config_->addLearners(strLearners, this, true); if (needSetMeta) { log_->setMetaData(Paxos::keyLearnerConfigure, config_->learnersToString()); log_->setMetaData(Paxos::keyMemberConfigure, config_->membersToString()); } return 0; } void Paxos::msleep(uint64_t t) { struct timeval sleeptime; if (t == 0) return; sleeptime.tv_sec= t / 1000; sleeptime.tv_usec= (t - (sleeptime.tv_sec * 1000)) * 1000; select(0, 0, 0, 0, &sleeptime); } void Paxos::startElectionCallback() { easy_warn_log("Server %d : Enter startElectionCallback\n", localServer_->serverId); requestVote(false); } void Paxos::heartbeatCallback(std::weak_ptr wserver) { std::shared_ptr server; if (!(server = wserver.lock())) return; Paxos *paxos= server->paxos; easy_warn_log("Server %d : send heartbeat msg to server %ld\n", paxos->getLocalServer()->serverId, server->serverId); paxos->appendLogToServer(wserver, true, true); } uint64_t Paxos::getLeaderTransferInterval_() { return (electionTimeout_ / 5) + 100; } uint64_t Paxos::getNextEpochCheckStatemachine_(uint64_t epoch) { if (option.enableAutoLeaderTransfer_) return epoch + std::max((uint64_t)5, (option.autoLeaderTransferCheckSeconds_ * 1000 / electionTimeout_)); else return UINT64_MAX; } // paxos mutex protected uint64_t Paxos::leaderTransferIfNecessary_(uint64_t epoch) { bool run= false; std::string reason; uint64_t target; if (!option.enableAutoLeaderTransfer_.load() || state_ != LEADER || subState_ == SubLeaderTransfer) { return 0; } if (localServer_->logType) { run= true; reason= "instance is log node"; } else if (nextEpochCheckStatemachine_ != UINT64_MAX) { if (log_->isStateMachineHealthy()) { nextEpochCheckStatemachine_= UINT64_MAX; } else if 
(epoch >= nextEpochCheckStatemachine_) { run= true; reason= "state machine not healthy"; nextEpochCheckStatemachine_ = getNextEpochCheckStatemachine_(epoch); } } if (!run) { return 0; } run= false; auto servers = config_->getServers(); std::vector choices; for (auto& e : servers) { if (e == nullptr || e->serverId == localServer_->serverId) continue; std::shared_ptr server = std::dynamic_pointer_cast(e); if (server->electionWeight >= localServer_->electionWeight && server->getLastAckEpoch() >= epoch) { run= true; choices.push_back(server->serverId); } } if (!run) { return 0; } target = choices[rand() % choices.size()]; easy_error_log("Server %d: try to do an auto leader transfer, reason: %s, target: %llu", localServer_->serverId, reason.c_str(), target); return target; } void Paxos::epochTimerCallback() { std::unique_lock ul(lock_); if (state_ != LEADER && state_ != CANDIDATE) { epochTimer_->stop(); return; } if (state_ == CANDIDATE) { /* When we're candidate we only calculate the epoch. 
*/ easy_warn_log("Server %d : Epoch task currentEpoch(%llu)\n", localServer_->serverId, currentEpoch_.load()); currentEpoch_.fetch_add(1); return; } uint64_t forceMinEpoch= config_->forceMin(&Server::getLastAckEpoch); uint64_t quorumEpoch= config_->quorumMin(&Server::getLastAckEpoch); easy_warn_log("Server %d : Epoch task currentEpoch(%llu) quorumEpoch(%llu) forceMinEpoch(%llu)\n", localServer_->serverId, currentEpoch_.load(), quorumEpoch, forceMinEpoch); if (currentEpoch_.load() > (forceMinEpoch + forceSyncEpochDiff_)) { if (leaderForceSyncStatus_ == true) { leaderForceSyncStatus_.store(false); easy_warn_log("Server %d : lost connect with force sync server, disable force sync now!\n", localServer_->serverId); } } else { if (leaderForceSyncStatus_ == false) { leaderForceSyncStatus_.store(true); easy_warn_log("Server %d : reconnect with all force sync server, enable force sync now!\n", localServer_->serverId); } } if (currentEpoch_.load() > quorumEpoch) { /* Lost connect with major followers, we should step down. 
*/ easy_error_log("Server %d : lost connect with major followers, stepdown myself\n", localServer_->serverId); if (debugDisableStepDown) { easy_warn_log("Server %d : Skip step down because of debugDisableStepDown currentTerm(%ld)\n", localServer_->serverId, currentTerm_.load()); return; } newTerm(currentTerm_ + 1); electionTimer_->start(); } else { assert(currentEpoch_.load() == quorumEpoch); uint64_t prevEpoch = currentEpoch_.fetch_add(1); uint64_t target = leaderTransferIfNecessary_(prevEpoch); if (target) { subState_.store(SubLeaderTransfer); weightElecting_ = true; ul.unlock(); /* try time should not exceed one epoch */ uint64_t times = std::max((electionTimeout_ / getLeaderTransferInterval_() + 1), (uint64_t)3); leaderTransferSend_(target, currentTerm_.load(), log_->getLastLogIndex(), times); } } } int Paxos::initAutoPurgeLog(bool autoPurge, bool useAppliedIndex, std::function handler) { autoPurge_ = autoPurge; if (!autoPurge_) purgeLogTimer_->stop(); useAppliedIndex_ = useAppliedIndex; if (autoPurge && !useAppliedIndex) { easy_warn_log("Server %d : use commitIndex instead of appliedIndex when auto purging log.", localServer_->serverId); } log_->setPurgeLogFilter(handler); return 0; } void Paxos::purgeLogCallback() { /* purge log without a forceIndex */ forcePurgeLog(false /* local */); } void Paxos::doPurgeLog(purgeLogArgType *arg) { uint64_t purgeIndex; if (arg->paxos->useAppliedIndex_) purgeIndex = arg->index < arg->paxos->getAppliedIndex()? arg->index: arg->paxos->getAppliedIndex(); else purgeIndex = arg->index < arg->paxos->getCommitIndex()? 
arg->index: arg->paxos->getCommitIndex(); easy_warn_log("Server %d : doPurgeLog purge index %ld\n", arg->paxos->localServer_->serverId, purgeIndex); arg->paxos->getLog()->truncateForward(purgeIndex); } void Paxos::updateAppliedIndex(uint64_t index) { appliedIndex_.store(index); } uint64_t Paxos::collectMinMatchIndex(std::vector &cis, bool local, uint64_t forceIndex) { uint64_t ret = forceIndex; /* * minMatchIndex Protection * 1. non local (only leader): all nodes matchIndex * 2. local & Leader: all nodes matchIndex * 3. local & not leader: all learner source from me */ for (auto ci : cis) { if (ci.serverId == localServer_->serverId) continue; if (local == false || state_ == LEADER || (ci.role == LEARNER && ci.learnerSource == localServer_->serverId)) ret = ci.matchIndex < ret ? ci.matchIndex : ret; } uint64_t lastLogIndex = log_->getLastLogIndex(); if (ret > lastLogIndex) ret = lastLogIndex; return ret; } int Paxos::forcePurgeLog(bool local, uint64_t forceIndex) { if (local == false && state_ != LEADER) { return -1; } /* update minMatchIndex_ */ /* appendlog should take purge log information if minMatchIndex_ is not 0 */ std::vector cis; getClusterInfo(cis); if (cis.size() == 0) { return 0; } minMatchIndex_ = collectMinMatchIndex(cis, local, forceIndex); easy_warn_log("Server %d : Prepare to purge log to %s, update minMatchIndex %ld\n", localServer_->serverId, local ? 
"local" : "cluster", minMatchIndex_); /* leader */ purgeLogQueue_.push(new purgeLogArgType(minMatchIndex_, this)); srv_->sendAsyncEvent(&SingleProcessQueue::process, &purgeLogQueue_, Paxos::doPurgeLog); if (local == false) { /* follower */ std::lock_guard lg(lock_); return leaderCommand(PurgeLog, NULL); } else { return 0; } } void Paxos::electionWeightAction(uint64_t term, uint64_t baseEpoch) { easy_error_log("Server %d : electionWeightAction start, term:%llu epoch:%llu", localServer_->serverId, term, baseEpoch); std::lock_guard lg(lock_); if (term != currentTerm_.load() || state_.load() != LEADER) { subState_.store(SubNone); weightElecting_ = false; easy_error_log("Server %d : electionWeightAction fail, action term(%llu), currentTerm(%llu), current state(%s)\n", localServer_->serverId, term, currentTerm_.load(), stateString[state_]); return; } uint64_t targetId= config_->getMaxWeightServerId(baseEpoch, localServer_); if (targetId != localServer_->serverId && targetId != 0) { auto term= currentTerm_.load(); auto lli= log_->getLastLogIndex(); easy_error_log("Server %d : electionWeightAction try to transfer leader to server %llu, term(%llu)\n", localServer_->serverId, targetId, term); uint64_t retryTimes = 5; lock_.unlock(); leaderTransferSend_(targetId, term, lli, retryTimes); lock_.lock(); } else { subState_.store(SubNone); weightElecting_ = false; easy_error_log("Server %d : electionWeightAction skip transfer leader because %s.\n", localServer_->serverId, targetId == 0 ? "no available server" : "I am the max weight available server"); } } void Paxos::resetNextIndexForServer(std::shared_ptr server) { std::lock_guard lg(lock_); auto lastLogIndex= getLastLogIndex(); /* make sure the first appendLog msg when reconnect have payload to truncateForward. 
*/ if (lastLogIndex > 1) lastLogIndex -= 1; if (server->matchIndex.load() != 0) server->nextIndex.store(server->matchIndex.load() + 1); else if (server->isLearner && server->sendByAppliedIndex) server->nextIndex.store(appliedIndex_.load() + 1); else server->nextIndex.store(lastLogIndex); } bool Paxos::tryFillFollowerMeta_(::google::protobuf::RepeatedPtrField< ::alisql::ClusterInfoEntry > *ciEntries) { uint64_t localFollowerMetaNo= followerMetaNo_.fetch_add(0); if (localFollowerMetaNo > lastSyncMetaNo_ + syncMetaInterval_) { lastSyncMetaNo_= localFollowerMetaNo; config_->forEachLearners(&Server::fillFollowerMeta, (void *)ciEntries); } return ciEntries->size() != 0; } int Paxos::getClusterInfo(std::vector &cis) { cis.clear(); config_->forEach(&Server::fillInfo, (void *)&cis); config_->forEachLearners(&Server::fillInfo, (void *)&cis); return 0; } int Paxos::getClusterHealthInfo(std::vector &healthInfo) { std::lock_guard lg(lock_); if (state_ != LEADER) return 1; uint64_t lastLogIndex = getLastLogIndex(); uint64_t appliedIndex = appliedIndex_; std::vector cis; getClusterInfo(cis); for (auto &e : cis) { HealthInfoType hi; hi.serverId = e.serverId; hi.addr = e.ipPort; hi.role = e.role; if (e.serverId != localServer_->serverId) { std::shared_ptr server = std::dynamic_pointer_cast(config_->getServer(e.serverId)); if (server) { hi.connected = !(server->lostConnect || server->netError); } else { hi.connected = false; } } else { hi.connected = true; } hi.logDelayNum = lastLogIndex > e.matchIndex ? lastLogIndex - e.matchIndex : 0; hi.applyDelayNum = appliedIndex > e.appliedIndex ? 
appliedIndex - e.appliedIndex : 0; healthInfo.push_back(hi); } return 0; } void Paxos::printClusterInfo(const std::vector &cis) { for (auto& ci : cis) { std::cout<< "serverId:"<< ci.serverId<< " ipPort:"<< ci.ipPort<< " matchIndex:"<< ci.matchIndex<< " nextIndex:"<< ci.nextIndex<< " role:"<< ci.role<< " hasVoted:"<< ci.hasVoted << " forceSync:" << ci.forceSync << " electionWeight:" << ci.electionWeight << " learnerSource:" << ci.learnerSource << " appliedIndex:" << ci.appliedIndex << " pipelining:" << ci.pipelining << std::endl<< std::flush; } } void Paxos::getMemberInfo(MemberInfoType *mi) { mi->serverId= localServer_->serverId; mi->currentTerm= currentTerm_; mi->currentLeader= leaderId_; mi->commitIndex= commitIndex_; uint64_t lastLogIndex= log_->getLastLogIndex(); LogEntry entry; uint64_t lastLogTerm= 0; if (log_->getEntry(lastLogIndex, entry, false) == 0) lastLogTerm= entry.term(); mi->lastLogTerm= lastLogTerm; mi->lastLogIndex= lastLogIndex; if (weightElecting_.load() || leaderStepDowning_.load()) mi->role= NOROLE; else mi->role= state_; mi->votedFor= votedFor_; mi->lastAppliedIndex= appliedIndex_.load(); mi->currentLeaderAddr= leaderAddr_; } uint64_t Paxos::getServerIdFromAddr(const std::string& strAddr) { std::unique_lock ul(lock_); return config_->getServerIdFromAddr(strAddr); } // override default compress option for address, if address is empty, override for all // return 0 on success, 1 on failure int Paxos::setMsgCompressOption(int type, size_t threshold, bool checksum, const std::string &strAddr) { std::unique_lock ul(lock_); MsgCompressOption option((MsgCompressionType)type, threshold, checksum); if (shutdown_.load() || config_ == nullptr) { easy_error_log("set MsgCompressOption fail, Paxos is stopped.\n"); return 1; } if (strAddr == "") { config_->forEach(&Server::setMsgCompressOption, &option); config_->forEachLearners(&Server::setMsgCompressOption, &option); } else { uint64_t id = config_->getServerIdFromAddr(strAddr); Configuration::ServerRef 
server; if (id == 0 || ((server = config_->getServer(id)) == nullptr)) { easy_error_log("Server %d : can't find server %s in setMsgCompressOption\n", localServer_->serverId, strAddr.c_str()); return 1; } server->setMsgCompressOption(&option); } easy_warn_log("set MsgCompressOption type(%d) threshold(%u) checksum(%d) to server(%s) succeed.\n", type, threshold, checksum, strAddr == "" ? "all" : strAddr.c_str()); return 0; } int Paxos::resetMsgCompressOption() { return setMsgCompressOption(0 /* type */, 0 /* threshold */, 0 /* checksum */, ""); } int Paxos::setClusterId(uint64_t ci) { int ret = log_->setMetaData(std::string(keyClusterId), ci); if (ret == 0) clusterId_.store(ci); return ret; } void Paxos::setLearnerConnTimeout(uint64_t t) { if (t < (heartbeatTimeout_/4)) t = heartbeatTimeout_/4; easy_warn_log("Server %d : Learner connection timeout set to %llu.", localServer_->serverId, t); localServer_->learnerConnTimeout = t; } void Paxos::setSendPacketTimeout(uint64_t t) { if (t < heartbeatTimeout_) t = heartbeatTimeout_; easy_warn_log("Server %d : Send packet timeout set to %llu.", localServer_->serverId, t); srv_->setSendPacketTimeout(t); } void Paxos::forceFixMatchIndex(uint64_t targetId, uint64_t newIndex) { std::unique_lock ul(lock_); if (state_ != LEADER || targetId == 0 || targetId == localServer_->serverId) return; std::shared_ptr server= std::dynamic_pointer_cast(config_->getServer(targetId)); if (!server) { easy_warn_log("Server %d : can't find server %llu in forceFixMatchIndex\n", localServer_->serverId, targetId); return; } easy_error_log("Server %d : force fix server %d's matchIndex(old: %llu, new: %llu). 
Dangerous Operation!", localServer_->serverId, targetId, server->matchIndex.load(), newIndex); server->resetMatchIndex(newIndex); } void Paxos::forceFixMatchIndex(const std::string& addr, uint64_t newIndex) { std::unique_lock ul(lock_); uint64_t targetId = config_->getServerIdFromAddr(addr); if (state_ != LEADER || targetId == 0 || targetId == localServer_->serverId) return; std::shared_ptr server= std::dynamic_pointer_cast(config_->getServer(targetId)); if (!server) { easy_warn_log("Server %d : can't find server %llu in forceFixMatchIndex\n", localServer_->serverId, targetId); return; } easy_error_log("Server %d : force fix server %d's matchIndex(old: %llu, new: %llu). Dangerous Operation!", localServer_->serverId, targetId, server->matchIndex.load(), newIndex); server->resetMatchIndex(newIndex); } int Paxos::log_checksum_test(const LogEntry &le) { if (checksumCb_ && checksum_mode_ && le.checksum() != 0) { const unsigned char* buf = reinterpret_cast(le.value().c_str()); uint64_t cs = checksumCb_(0, buf, le.value().size()); if (cs == le.checksum()) return 0; else return -1; } return 0; } void Paxos::reset_flow_control() { std::unique_lock ul(lock_); config_->reset_flow_control(); } void Paxos::set_flow_control(uint64_t serverId, int64_t fc) { /* * flow control mode: * >0 TODO * 0 no flow control * -1 slow send log (send by heartbeat) * <-1 never send log */ std::unique_lock ul(lock_); config_->set_flow_control(serverId, fc); } void Paxos::truncateBackward_(uint64_t firstIndex) { if(ccMgr_.prepared && ccMgr_.preparedIndex >= firstIndex) { /* * 1. set aborted to 1 to notify client the configureChange is failed. * 2. reset preparedIndex to prevent apply configureChange before ccMgr_ is cleared. * 3. prepared flag is kept to 1 to prevent concurrent configureChange until * client is notified and ccMgr_ is cleared. 
*/ ccMgr_.aborted = 1; ccMgr_.preparedIndex = 0; ccMgr_.cond.notify_all(); // if scan index < firstIndex, we do not reset, it will be reset when applied; // if scan index >= firstIndex, then the prepared configure change will not be committed, // we need to reset to 0. log_->setMetaData(keyScanIndex, 0); } log_->truncateBackward(firstIndex); } const std::string Paxos::keyCurrentTerm= "@keyCurrentTerm_@"; const std::string Paxos::keyVoteFor= "@keyVoteFor_@"; const std::string Paxos::keyLastLeaderTerm= "@keyLastLeaderTerm_@"; const std::string Paxos::keyLastLeaderLogIndex= "@keyLastLeaderLogIndex_@"; const std::string Paxos::keyMemberConfigure= "@keyMemberConfigure_@"; const std::string Paxos::keyLearnerConfigure= "@keyLearnerConfigure_@"; const std::string Paxos::keyScanIndex= "@keyScanIndex_@"; const std::string Paxos::keyClusterId= "@keyClusterId_@"; const uint64_t Paxos::maxSystemPacketSize_= 50000000; bool Paxos::debugDisableElection= false; bool Paxos::debugDisableStepDown= false; bool Paxos::debugWitnessTest= false; bool Paxos::debugResetLogSlow= false; bool Paxos::debugSkipUpdateCommitIndex= false; } //namespace alisql