425 lines
12 KiB
C++
425 lines
12 KiB
C++
/************************************************************************
 *
 * Copyright (c) 2019 Alibaba.com, Inc. All Rights Reserved
 * $Id: consensus-bug-t.cc,v 1.0 Mar 06, 2019 5:22:02 PM jarry.zj(jarry.zj@alibaba-inc.com) $
 *
 ************************************************************************/

/**
 * @file consensus-bug-t.cc
 * @author jarry.zj(jarry.zj@alibaba-inc.com)
 * @date Mar 06, 2019 5:22:02 PM
 * @version 1.0
 * @brief unit test for consensus complicated bugs (maybe still unsolved)
 *
 **/
|
|
|
|
#include <atomic>
#include <chrono>
#include <string>
#include <thread>

#include <gtest/gtest.h>

#include "easyNet.h"
#include "service.h"
#include "paxos.h"
#include "paxos_log.h"
#include "paxos_server.h"
#include "paxos_configuration.h"
#include "paxos.pb.h"
#include "files.h"

#include "rd_paxos_log.h"
#include "file_paxos_log.h"
|
|
|
|
using namespace alisql;
|
|
|
|
/**
 * Block the calling thread for at least @p t milliseconds.
 *
 * A value of 0 returns immediately. The previous implementation slept via
 * select() with a timeval, which can return early when a signal interrupts
 * the call; std::this_thread::sleep_for guarantees the full duration.
 *
 * @param t sleep duration in milliseconds
 */
static void msleep(uint64_t t)
{
  if (t == 0)
    return;
  std::this_thread::sleep_for(std::chrono::milliseconds(t));
}
|
|
|
|
/*
 * corner case1: change follower to learner
 * Ref: https://yuque.antfin-inc.com/db_core_team/internal_docs/kfg8ch#84659740
 * It is a real case from Alibaba production environment.
 * Has been fixed in X-Paxos
 */
|
|
TEST(bug, cc_corner_case1)
{
  /* Three voting members, plus one address that will join as a learner. */
  std::vector<std::string> strConfig{"127.0.0.1:11001", "127.0.0.1:11002", "127.0.0.1:11003"};
  std::string learnerAddr("127.0.0.1:11004");

  const uint64_t paxosTimeout = 2000;

  /* Every server gets its own RDPaxosLog directory. */
  auto openLog = [](const char *dir) -> std::shared_ptr<PaxosLog> {
    return std::make_shared<RDPaxosLog>(dir, true, 4 * 1024 * 1024);
  };

  /* rlog1 stays referenced by the test so paxos1 can be re-created on the same log later. */
  std::shared_ptr<PaxosLog> rlog1 = openLog("paxosLogTestDir11");
  Paxos *paxos1 = new Paxos(paxosTimeout, rlog1);
  paxos1->init(strConfig, 1);

  Paxos *paxos2 = new Paxos(paxosTimeout, openLog("paxosLogTestDir12"));
  paxos2->init(strConfig, 2);

  Paxos *paxos3 = new Paxos(paxosTimeout, openLog("paxosLogTestDir13"));
  paxos3->init(strConfig, 3);

  /* The learner's log handle is held until the end of the test. */
  std::shared_ptr<PaxosLog> learnerLog = openLog("paxosLogTestDir14");
  Paxos *paxos4 = new Paxos(paxosTimeout, learnerLog);
  paxos4->initAsLearner(learnerAddr);

  /* Make paxos1 the leader. */
  sleep(3);
  paxos1->requestVote();
  sleep(1);

  EXPECT_EQ(paxos1->getState(), Paxos::LEADER);

  /* Register the learner address with the leader. */
  strConfig.assign(1, learnerAddr);

  EXPECT_EQ(paxos1->changeLearners(Paxos::CCAddNode, strConfig), 0);

  /* Replicate ten entries and check that every node has caught up. */
  LogEntry entry;
  entry.set_index(0);
  entry.set_optype(0);
  entry.set_value("aaa");
  for (int round = 0; round < 10; ++round)
  {
    entry.clear_term();
    paxos1->replicateLog(entry);
    msleep(200);
  }
  sleep(1);
  EXPECT_EQ(paxos1->getLastLogIndex(), paxos2->getLastLogIndex());
  EXPECT_EQ(paxos1->getLastLogIndex(), paxos3->getLastLogIndex());
  EXPECT_EQ(paxos1->getLastLogIndex(), paxos4->getLastLogIndex());
  /* All nodes are in sync at this point. */

  /* paxos1 appends one more entry that none of the other nodes receives. */
  paxos1->set_flow_control(2, -2);
  paxos1->set_flow_control(3, -2);
  paxos1->set_flow_control(100, -2);
  entry.clear_term();
  paxos1->replicateLog(entry);
  sleep(1);
  EXPECT_EQ(paxos1->getLastLogIndex(), paxos2->getLastLogIndex() + 1);
  EXPECT_EQ(paxos1->getLastLogIndex(), paxos3->getLastLogIndex() + 1);
  EXPECT_EQ(paxos1->getLastLogIndex(), paxos4->getLastLogIndex() + 1);

  /* Simulate a crash of paxos1 and let paxos2 take over as leader. */
  easy_warn_log("Now stop paxos1...");
  delete paxos1;
  paxos2->requestVote();
  sleep(1);
  EXPECT_EQ(paxos2->getState(), Paxos::LEADER);
  sleep(1);
  /* Same last index as the stopped server's log, although the last entry itself differs. */
  EXPECT_EQ(paxos2->getLastLogIndex(), rlog1->getLastLogIndex());

  /* Downgrade server 1 to learner, add the 127.0.0.1:11004 node as a member, then write one entry. */
  paxos2->downgradeMember(1);
  paxos2->changeMember(Paxos::CCAddNode, learnerAddr);
  entry.clear_term();
  paxos2->replicateLog(entry);
  sleep(1);

  /* Bring paxos1 back on its old log, with an empty configuration and index 0. */
  easy_warn_log("Now restart paxos1...");
  paxos1 = new Paxos(paxosTimeout, rlog1);
  strConfig.clear();
  paxos1->init(strConfig, 0);
  sleep(5);

  /* paxos3 and paxos4 must follow the new leader. */
  EXPECT_EQ(paxos2->getLastLogIndex(), paxos3->getLastLogIndex());
  EXPECT_EQ(paxos2->getLastLogIndex(), paxos4->getLastLogIndex());

  /* Expectation while the bug still exists: */
  // EXPECT_EQ(paxos2->getLastLogIndex(), paxos1->getLastLogIndex() + 3);
  /* Expectation once the bug is fixed: */
  EXPECT_EQ(paxos2->getLastLogIndex(), paxos1->getLastLogIndex());

  /* Tear down the servers, then remove their log directories. */
  delete paxos1;
  delete paxos2;
  delete paxos3;
  delete paxos4;
  deleteDir("paxosLogTestDir11");
  deleteDir("paxosLogTestDir12");
  deleteDir("paxosLogTestDir13");
  deleteDir("paxosLogTestDir14");
}
|
|
|
|
/*
 * corner case2: change learner to follower
 * Ref: https://yuque.antfin-inc.com/db_core_team/internal_docs/kfg8ch#60d98465
 * We are not going to solve this problem now
 * because it hardly ever happens in the real world.
 */
|
|
TEST(bug, cc_corner_case2)
{
  /* Three voting members, plus one address that will join as a learner. */
  std::vector<std::string> strConfig{"127.0.0.1:11001", "127.0.0.1:11002", "127.0.0.1:11003"};
  std::string learnerAddr("127.0.0.1:11004");

  const uint64_t paxosTimeout = 2000;

  /* Every server gets its own RDPaxosLog directory. */
  auto openLog = [](const char *dir) -> std::shared_ptr<PaxosLog> {
    return std::make_shared<RDPaxosLog>(dir, true, 4 * 1024 * 1024);
  };

  /* The handles for server 1 and for the learner are held until the end of the test. */
  std::shared_ptr<PaxosLog> log1 = openLog("paxosLogTestDir11");
  Paxos *paxos1 = new Paxos(paxosTimeout, log1);
  paxos1->init(strConfig, 1);

  Paxos *paxos2 = new Paxos(paxosTimeout, openLog("paxosLogTestDir12"));
  paxos2->init(strConfig, 2);

  Paxos *paxos3 = new Paxos(paxosTimeout, openLog("paxosLogTestDir13"));
  paxos3->init(strConfig, 3);

  std::shared_ptr<PaxosLog> learnerLog = openLog("paxosLogTestDir14");
  Paxos *paxos4 = new Paxos(paxosTimeout, learnerLog);
  paxos4->initAsLearner(learnerAddr);

  /* Make paxos1 the leader. */
  sleep(3);
  paxos1->requestVote();
  sleep(1);

  EXPECT_EQ(paxos1->getState(), Paxos::LEADER);

  /* Register the learner address with the leader. */
  strConfig.assign(1, learnerAddr);

  EXPECT_EQ(paxos1->changeLearners(Paxos::CCAddNode, strConfig), 0);

  /* Replicate ten entries and check that every node has caught up. */
  LogEntry entry;
  entry.set_index(0);
  entry.set_optype(0);
  entry.set_value("aaa");
  for (int round = 0; round < 10; ++round)
  {
    entry.clear_term();
    paxos1->replicateLog(entry);
    msleep(200);
  }
  sleep(1);
  EXPECT_EQ(paxos1->getLastLogIndex(), paxos2->getLastLogIndex());
  EXPECT_EQ(paxos1->getLastLogIndex(), paxos3->getLastLogIndex());
  EXPECT_EQ(paxos1->getLastLogIndex(), paxos4->getLastLogIndex());
  /* All nodes are in sync at this point. */

  /* paxos1 changes paxos4 to follower, but paxos4 does not receive this log entry. */
  paxos1->set_flow_control(100, -2);
  paxos1->changeMember(Paxos::CCAddNode, learnerAddr);

  /*
   * Wait until paxos2's commit index reaches the leader's last log index.
   * The wait is capped so that a stuck cluster fails the test instead of
   * hanging it forever; the commit-index EXPECT_EQ below reports the timeout.
   */
  const int maxWaitSeconds = 60;
  for (int waited = 0;
       waited < maxWaitSeconds && paxos1->getLastLogIndex() != paxos2->getCommitIndex();
       ++waited)
  {
    sleep(1);
  }
  EXPECT_EQ(paxos1->getLastLogIndex(), paxos2->getLastLogIndex());
  EXPECT_EQ(paxos1->getLastLogIndex(), paxos3->getLastLogIndex());
  EXPECT_EQ(paxos1->getLastLogIndex(), paxos2->getCommitIndex());
  EXPECT_EQ(paxos1->getLastLogIndex(), paxos4->getLastLogIndex() + 1);

  /* paxos1 dies; give the remaining nodes time to try to elect a leader. */
  delete paxos1;
  paxos2->requestVote();
  sleep(10); /* try to elect a leader */

  /* While the bug still exists, no leader gets elected. */
  EXPECT_FALSE(paxos2->getState() == Paxos::LEADER);
  EXPECT_FALSE(paxos3->getState() == Paxos::LEADER);
  EXPECT_FALSE(paxos4->getState() == Paxos::LEADER);
  /* Once the bug is fixed, a leader must exist: */
  // EXPECT_TRUE(paxos2->getState() == Paxos::LEADER || paxos3->getState() == Paxos::LEADER || paxos4->getState() == Paxos::LEADER);

  /* Tear down the remaining servers, then remove the log directories. */
  delete paxos2;
  delete paxos3;
  delete paxos4;
  deleteDir("paxosLogTestDir11");
  deleteDir("paxosLogTestDir12");
  deleteDir("paxosLogTestDir13");
  deleteDir("paxosLogTestDir14");
}
|
|
|
|
/* check server-id */
|
|
class rdsRDPaxosLog: public RDPaxosLog
|
|
{
|
|
public:
|
|
uint32_t rds_server_id;
|
|
|
|
rdsRDPaxosLog(const std::string &dataDir): rds_server_id(0), RDPaxosLog(dataDir, true, 4 * 1024 * 1024) {}
|
|
virtual int getEntry(uint64_t logIndex, LogEntry &entry, bool fastfail = false)
|
|
{
|
|
int ret = RDPaxosLog::getEntry(logIndex, entry, fastfail);
|
|
// add rds fields to entry
|
|
RDSFields opaque;
|
|
opaque.set_rdsserverid(rds_server_id);
|
|
entry.set_opaque(opaque.SerializeAsString());
|
|
return ret;
|
|
}
|
|
virtual bool entriesPreCheck(const ::google::protobuf::RepeatedPtrField<LogEntry> &entries)
|
|
{
|
|
if (entries.size() > 0)
|
|
{
|
|
RDSFields opaque;
|
|
opaque.ParseFromString(entries.Get(0).opaque());
|
|
// check rds_server_id
|
|
if (opaque.has_rdsserverid() && opaque.rdsserverid() != rds_server_id)
|
|
return 1;
|
|
}
|
|
return 0;
|
|
}
|
|
};
|
|
|
|
/* incorrect opaque causes pre-check fail */
|
|
TEST(bug, pre_check)
{
  /* One voting member plus one learner address. */
  std::vector<std::string> strConfig{"127.0.0.1:11001"};
  std::string learnerAddr("127.0.0.1:11004");

  const uint64_t paxosTimeout = 2000;

  /* Both nodes use the server-id aware log; the learner's id is changed during the test. */
  std::shared_ptr<rdsRDPaxosLog> leaderLog = std::make_shared<rdsRDPaxosLog>("paxosLogTestDir11");
  Paxos *paxos1 = new Paxos(paxosTimeout, leaderLog);
  paxos1->init(strConfig, 1);

  std::shared_ptr<rdsRDPaxosLog> learnerLog = std::make_shared<rdsRDPaxosLog>("paxosLogTestDir14");
  Paxos *paxos4 = new Paxos(paxosTimeout, learnerLog);
  paxos4->initAsLearner(learnerAddr);

  /* Make paxos1 the leader. */
  sleep(3);
  paxos1->requestVote();
  sleep(1);

  EXPECT_EQ(paxos1->getState(), Paxos::LEADER);

  /* Register the learner address with the leader. */
  strConfig.assign(1, learnerAddr);

  EXPECT_EQ(paxos1->changeLearners(Paxos::CCAddNode, strConfig), 0);

  LogEntry entry;
  entry.set_index(0);
  entry.set_optype(0);
  entry.set_value("aaa");

  /* Hand two entries to the leader, clearing the term before each one. */
  auto replicateTwice = [&]() {
    for (int n = 0; n < 2; ++n)
    {
      entry.clear_term();
      paxos1->replicateLog(entry);
    }
  };

  /* ---- main testcase: rds_server_id pre-check ---- */
  /* Same server id on both sides: the learner keeps up. */
  replicateTwice();
  sleep(1);
  EXPECT_EQ(paxos1->getLastLogIndex(), paxos4->getLastLogIndex());

  /* Different server id on the learner: it falls behind by every new entry. */
  learnerLog->rds_server_id = 1;
  easy_warn_log("Set rds_server_id to 1");
  replicateTwice();
  sleep(1);
  EXPECT_EQ(paxos1->getLastLogIndex(), paxos4->getLastLogIndex() + 2);
  replicateTwice();
  sleep(1);
  EXPECT_EQ(paxos1->getLastLogIndex(), paxos4->getLastLogIndex() + 4);

  /* Ids match again: the learner catches up completely. */
  learnerLog->rds_server_id = 0;
  easy_warn_log("Set rds_server_id to 0");
  replicateTwice();
  sleep(1);
  EXPECT_EQ(paxos1->getLastLogIndex(), paxos4->getLastLogIndex());
  /* ---- end of testcase ---- */

  /* Tear down the servers, then remove their log directories. */
  delete paxos1;
  delete paxos4;
  deleteDir("paxosLogTestDir11");
  deleteDir("paxosLogTestDir14");
}
|
|
|
|
TEST(bug, follower_does_not_proceed_match_index)
{
  /* Plain three-member cluster, no learners. */
  std::vector<std::string> strConfig{"127.0.0.1:11001", "127.0.0.1:11002", "127.0.0.1:11003"};

  const uint64_t paxosTimeout = 5000;

  /* Every server gets its own RDPaxosLog directory. */
  auto openLog = [](const char *dir) -> std::shared_ptr<PaxosLog> {
    return std::make_shared<RDPaxosLog>(dir, true, 4 * 1024 * 1024);
  };

  /* The handles for servers 1 and 3 are held until the end of the test. */
  std::shared_ptr<PaxosLog> log1 = openLog("paxosLogTestDir11");
  Paxos *paxos1 = new Paxos(paxosTimeout, log1);
  paxos1->init(strConfig, 1);

  Paxos *paxos2 = new Paxos(paxosTimeout, openLog("paxosLogTestDir12"));
  paxos2->init(strConfig, 2);

  std::shared_ptr<PaxosLog> log3 = openLog("paxosLogTestDir13");
  Paxos *paxos3 = new Paxos(paxosTimeout, log3);
  paxos3->init(strConfig, 3);

  /* Make paxos1 the leader. */
  sleep(1);
  paxos1->requestVote();
  sleep(1);

  EXPECT_EQ(paxos1->getState(), Paxos::LEADER);

  /* One replicated entry: the test expects commit index and follower log index to be 2. */
  LogEntry entry;
  entry.set_index(0);
  entry.set_optype(0);
  entry.set_value("aaa");
  paxos1->replicateLog(entry);
  sleep(1);
  EXPECT_EQ(paxos1->getCommitIndex(), 2);
  EXPECT_EQ(paxos2->getLastLogIndex(), 2);

  /* The leader's view of follower 2 (second cluster-info row) must show match index 2. */
  {
    std::vector<Paxos::ClusterInfoType> cis;
    paxos1->getClusterInfo(cis);
    EXPECT_EQ(cis[1].matchIndex, 2);
  }

  /* Follower 2 bumps its term, which makes the leader reset its match index to 0. */
  easy_warn_log("now fake election");
  paxos2->fakeRequestVote();

  /* After a short wait the leader should report match index 0 for follower 2. */
  sleep(2);
  {
    std::vector<Paxos::ClusterInfoType> cis;
    paxos1->getClusterInfo(cis);
    EXPECT_EQ(cis[1].matchIndex, 0);
  }

  paxos2->unfakeRequestVote();

  /* With the fake election switched off, the match index must recover to 2. */
  sleep(2);
  {
    std::vector<Paxos::ClusterInfoType> cis;
    paxos1->getClusterInfo(cis);
    EXPECT_EQ(cis[1].matchIndex, 2);
  }

  /* Replication must keep working afterwards. */
  paxos1->replicateLog(entry);
  sleep(1);
  EXPECT_EQ(paxos1->getCommitIndex(), 3);
  EXPECT_EQ(paxos2->getLastLogIndex(), 3);

  /* Tear down the servers, then remove their log directories. */
  delete paxos1;
  delete paxos2;
  delete paxos3;
  deleteDir("paxosLogTestDir11");
  deleteDir("paxosLogTestDir12");
  deleteDir("paxosLogTestDir13");
}
|
|
|