/************************************************************************ * * Copyright (c) 2016 Alibaba.com, Inc. All Rights Reserved * $Id: service.cc,v 1.0 08/01/2016 10:59:50 AM yingqiang.zyq(yingqiang.zyq@alibaba-inc.com) $ * ************************************************************************/ /** * @file state_machine_service.cc * @author hangfeng.fj(hangfeng.fj@alibaba-inc.com) * @date 08/06/2016 11:59:50 AM * @version 1.0 * @brief implement of StateMachine * **/ #include #include "state_machine_service.h" #include #include #include #include #include #include #include #include #include "rocksdb/filter_policy.h" #include "rocksdb/table.h" #include "rocksdb/cache.h" DECLARE_string(data_dir); DECLARE_string(raftlog_dir); DECLARE_bool(data_compress); DECLARE_bool(raftlog_sync); DECLARE_uint64(data_write_buffer_size); DECLARE_int32(server_id); DECLARE_bool(data_compress); DECLARE_uint64(data_write_buffer_size); DECLARE_bool(raftlog_sync); DECLARE_uint64(max_write_buffer_number); DECLARE_uint64(min_write_buffer_number_to_merge); DECLARE_uint64(max_background_compactions); DECLARE_uint64(max_bytes_for_level_base); DECLARE_uint64(target_file_size_base); DECLARE_uint64(level0_slowdown_writes_trigger); DECLARE_uint64(level0_stop_writes_trigger); DECLARE_bool(enable_statistics); DECLARE_uint64(stats_dump_period_sec); DECLARE_uint64(block_cache_size); DECLARE_uint64(block_size); DECLARE_uint64(bloom_filter_bits_per_key); DECLARE_bool(block_based_bloom_filter); namespace alisql { StateMachine::StateMachine(std::string &serverId, const std::vector &serverMembers, const std::vector &rpcMembers) : stop_(false), selfId_(serverId), lastAppliedIndex_(0), leaderTerm_(0) { std::vector::const_iterator it= serverMembers.begin(); bool selfInCluster= false; for (; it != serverMembers.end(); it++) { serverMembers_.push_back(*it); if (selfId_ == *it) { LOG_INFO("server member[Self]:%s\n", it->c_str()); selfInCluster= true; } else { LOG_INFO("server member:%s\n", it->c_str()); } } it= rpcMembers.begin(); for (; it != rpcMembers.end(); it++) { rpcMembers_.push_back(*it); if (selfId_ == *it) { LOG_INFO("rpc member[Self]:%s\n", it->c_str()); } else { LOG_INFO("rpc member:%s\n", it->c_str()); } } state_.store(Raft::FOLLOWER); } StateMachine::~StateMachine() { } int StateMachine::shutdown() { stop_= true; if (applyLogThread_.joinable()) applyLogThread_.join(); return 0; } void StateMachine::initApplyThread() { applyLogThread_= std::thread(&StateMachine::applyLogThread, this); } void StateMachine::applyLogThread() { while (!stop_) { while (!stop_ && raft_->getCommitIndex() <= lastAppliedIndex_) { raft_->waitCommitIndexUpdate(lastAppliedIndex_); } if (stop_) { return; } uint64_t lastAppliedIdx= lastAppliedIndex_; uint64_t commitIndex= raft_->getCommitIndex(); for (uint64_t i= lastAppliedIdx + 1; i <= commitIndex; i++) { LogEntry logEntry; int ret= raftLog_->getEntry(i, logEntry); if (ret != 0) { LOG_INFO("Fail to get entry idx: %ld from raft log db!\n", i); abort(); } const char *result; std::string entryKey= logEntry.key(); std::string entryVal= logEntry.value(); switch (logEntry.optype()) { case kPut: case kTairSet: set(entryKey, entryVal, i); result= TEXT_STORED; break; case kDel: ret= dataStore_->del("", entryKey, i); if (ret != 0) { LOG_INFO("Fail to delete entry at log idx: %ld from data storage!\n", i); abort(); } result= TEXT_DELETED; break; case kCas: { std::string value; // cas unique number sent within cas command uint64_t inSeqNum= 0; // current cas unique number for this record uint64_t curSeqNum= 0; MemcachedObject::deserializeSeqNum(entryVal, inSeqNum); ret= dataStore_->get("", entryKey, &value); if (ret == 0) //find record { MemcachedObject::deserializeSeqNum(value, curSeqNum); if (inSeqNum != curSeqNum) { LOG_INFO("Server: %s, cas miss for key %s, cur cas: %ld, sent cas: %ld\n", selfId_.c_str(), entryKey.c_str(), curSeqNum, inSeqNum); result= TEXT_EXISTS; break; } } // return "NOT_FOUND\r\n" to indicate that the item user // is trying to store with a "cas" command did not exist. else { result= TEXT_NOT_FOUND; break; } set(entryKey, entryVal, i); result= TEXT_STORED; break; } case kNop: { // Before transfering to leader, raft layer will always send // out a empty log, the statemachine is saft to transfer as // leader if this empty log is appied, this make sure all // logs before the empty one has been applied if (raft_->getState() == Raft::State::LEADER && leaderTerm_ == raft_->getTerm()) { LOG_INFO("Transfer to leader"); state_.store(Raft::State::LEADER); } break; } default: LOG_INFO("Unkonwn op: %ld\n", logEntry.optype()); } { std::lock_guard lg(lock_); if (clientRsps_.find(i) != clientRsps_.end()) { ClientRsp& ack= clientRsps_[i]; if (ack.response != NULL) { ack.response->setResult(result); srv_->sendResponse(ack.easyReq, ack.response, true); } clientRsps_.erase(i); } } lastAppliedIndex_++; } } } void StateMachine::set(std::string &key, std::string &value, uint64_t index) { MemcachedObject::serializeSeqNum(value, index); int ret= dataStore_->set("", key, value, index); if (ret != 0) { LOG_INFO("Fail to put entry at log idx: %ld to data storage!\n", index); abort(); } } int StateMachine::init(StateMachineService *srv) { int ret= 0; srv_= srv; std::string subDir= selfId_; boost::replace_all(subDir, ":", "_"); ret= initDataStorage(subDir); if (ret == -1) { LOG_INFO("Init data storage failed!\n"); return ret; } ret= initRaftLog(subDir); if (ret == -1) { LOG_INFO("Init raft log failed!\n"); return ret; } initLastAppliedIndex(); initRaft(); if (ret != 0) { LOG_INFO("Init raft failed!\n"); return ret; } initApplyThread(); return ret; } int StateMachine::initLastAppliedIndex() { std::string tag; int ret= dataStore_->get("", DataStorage::lastAppliedIndexTag, &tag); if (ret == 0) { lastAppliedIndex_= RDRaftLog::stringToInt(tag); } LOG_INFO("Init last applied index as: %ld\n", lastAppliedIndex_); } void StateMachine::stateChangeCb(enum Raft::State raftState) { // Step down if we are no longer leader if (state_.load() == Raft::LEADER) { if (raftState != Raft::LEADER) { LOG_INFO("Change from state %d to %d", state_.load(), raftState); state_.store(raftState); } } else { if (raftState == Raft::LEADER) { // we will transfer to leader after applying all logs, this is done // in applythread when we apply the empty log generated by raft layer leaderTerm_= raft_->getTerm(); LOG_INFO("Set leader term as: %lld\n", raft_->getTerm()); } else { LOG_INFO("Change from state %d to %d", state_.load(), raftState); state_.store(raftState); } } } int StateMachine::initRaft() { raft_= std::shared_ptr(new Raft(5000, raftLog_)); raft_->setStateChangeCb(std::bind(&StateMachine::stateChangeCb, this, std::placeholders::_1)); return raft_->init(rpcMembers_, FLAGS_server_id, NULL); } int StateMachine::initDataStorage(std::string &subDir) { std::string dataStorePath= FLAGS_data_dir + "/" + subDir + "/storage" ; rocksdb::Options options; getRocksDBOptions(options); dataStore_= std::shared_ptr(new DataStorage(dataStorePath, options)); return 0; } int StateMachine::initRaftLog(std::string &subDir) { std::string raftLogPath= FLAGS_raftlog_dir + "/" + subDir; raftLog_= std::shared_ptr(new RDRaftLog(raftLogPath, FLAGS_data_compress, FLAGS_data_write_buffer_size * 1024 * 1024)); return 0; } int StateMachine::getRocksDBOptions(rocksdb::Options &options) { options.create_if_missing= true; if (FLAGS_data_compress) { options.compression= rocksdb::kSnappyCompression; LOG_INFO("enable snappy compress for data storage\n"); } // Amount of data to build up in memory (backed by an unsorted log // on disk) before converting to a sorted on-disk file. options.write_buffer_size= FLAGS_data_write_buffer_size * 1024 * 1024; // The maximum number of write buffers that are built up in memory. options.max_write_buffer_number= FLAGS_max_write_buffer_number; // The minimum number of write buffers that will be merged together // before writing to storage. options.min_write_buffer_number_to_merge= FLAGS_min_write_buffer_number_to_merge; // Maximum number of concurrent background compaction jobs, submitted to // the default LOW priority thread pool. options.max_background_compactions= FLAGS_max_background_compactions; // Control maximum total data size for base level (level 1). options.max_bytes_for_level_base= FLAGS_max_bytes_for_level_base * 1024 * 1024; // Target file size for compaction. options.target_file_size_base= FLAGS_target_file_size_base * 1024 * 1024; // Soft limit on number of level-0 files. We start slowing down writes at this point. options.level0_slowdown_writes_trigger= FLAGS_level0_slowdown_writes_trigger; // Maximum number of level-0 files. We stop writes at this point. options.level0_stop_writes_trigger= FLAGS_level0_stop_writes_trigger; // Rocksdb Statistics provides cumulative stats over time. // Set it ture only when debugging performance, because it will introduce overhead. if (FLAGS_enable_statistics) { //options.statistics= rocksdb::CreateDBStatistics(); } // Dump statistics periodically in information logs. // Same as rocksdb's default value (10 min). options.stats_dump_period_sec= FLAGS_stats_dump_period_sec; rocksdb::BlockBasedTableOptions table_options; table_options.block_cache= rocksdb::NewLRUCache(FLAGS_block_cache_size * 1024 * 1024 * 1024); table_options.block_size= FLAGS_block_size * 1024; table_options.filter_policy.reset(rocksdb::NewBloomFilterPolicy(FLAGS_bloom_filter_bits_per_key, FLAGS_block_based_bloom_filter)); options.table_factory.reset(rocksdb::NewBlockBasedTableFactory(table_options)); return 0; } uint64_t StateMachine::getLastAppliedIndex() { return lastAppliedIndex_; } int StateMachine::get(easy_request_t *r, TextRequest *cmd) { TextResponse *response = new TextResponse(); if (response == NULL) { LOG_INFO("Fail to allocate TextResponse!\n"); abort(); } ++stats_.cmd_get; if (state_.load() != Raft::LEADER) { response->setResult(TEXT_ERROR); return srv_->sendResponse(r, response); } const char* key; std::size_t keyLen; std::tie(key, keyLen)= cmd->getKey(); std::string keyStr(key, keyLen); std::string value; bool cas= false; uint64_t termBeforeGets; if (cmd->getCommand() == TextCommand::GETS && raft_->getState() != Raft::State::LEADER) { response->setResult(TEXT_ERROR); return srv_->sendResponse(r, response); } termBeforeGets= raft_->getTerm(); int ret= dataStore_->get("", keyStr, &value); if (ret == -1) { LOG_INFO("Get value for key: %s failed!\n", key); response->setResult(TEXT_END); return srv_->sendResponse(r, response); } if (cmd->getCommand() == TextCommand::GETS) { cas= true; // we must make sure we are leader to get correct cas unique if (raft_->getState() != Raft::State::LEADER || raft_->getTerm() != termBeforeGets) { response->setResult(TEXT_ERROR); return srv_->sendResponse(r, response); } } response->setValueResult(keyStr, value, cas); return srv_->sendResponse(r, response); } int StateMachine::showStats(easy_request_t *r, TextRequest *cmd) { TextResponse *response = new TextResponse(); if (response == NULL) { LOG_INFO("Fail to allocate TextResponse!\n"); abort(); } if (cmd->getStats() == stats_t::CLUSTER) { if (state_.load() == Raft::LEADER) { uint64_t clusterSize= rpcMembers_.size(); Raft::ClusterInfoType cis[clusterSize]; raft_->getClusterInfo(cis, clusterSize); response->setClusterStatsResult(cis, clusterSize); } } else { Raft::MemberInfoType mi; raft_->getMemberInfo(&mi); response->setLocalStatsResult(mi, getLastAppliedIndex(), stats_.cmd_get); } return srv_->sendResponse(r, response); } int StateMachine::showVersion(easy_request_t *r, TextRequest *cmd) { TextResponse *response= new TextResponse(); response->setVersionResult(); return srv_->sendResponse(r, response); } int StateMachine::storage(easy_request_t *r, TextRequest *cmd, LogOperation op) { TextResponse *response= new TextResponse(); if (response == NULL) { LOG_INFO("Fail to allocate TextResponse!\n"); abort(); } if (state_.load() != Raft::LEADER) { response->setResult(TEXT_ERROR); return srv_->sendResponse(r, response); } const char* key; std::size_t keyLen; const char* value; std::size_t valueLen; std::tie(key, keyLen)= cmd->getKey(); std::tie(value, valueLen)= cmd->getData(); std::string keyStr(key, keyLen); std::string valueStr(value, valueLen); uint32_t flags= cmd->getFlags(); time_t exptime= cmd->getExptime(); MemcachedObject object(static_cast(op), flags, exptime, valueStr); std::string buffer; object.dumpObject(&buffer); if (op == kCas) { std::string curValue; int ret= dataStore_->get("", keyStr, &curValue); uint64_t curSeqNum= 0; if (ret == 0) //find record { MemcachedObject::deserializeSeqNum(curValue, curSeqNum); // If the cas unique sent from client is smaller than current cas unique, // return EXIST since replicate the cas command is unnecessary and expensive if (cmd->getCasUnique() < curSeqNum) { LOG_INFO("Server: %s, avoid to replicate cas for key %s, cur cas: %ld, sent cas: %ld\n", selfId_.c_str(), keyStr.c_str(), curSeqNum, cmd->getCasUnique()); response->setResult(TEXT_EXISTS); return srv_->sendResponse(r, response); } } MemcachedObject::serializeSeqNum(buffer, cmd->getCasUnique()); } LogEntry logEntry; logEntry.set_key(keyStr); logEntry.set_value(buffer); logEntry.set_optype(op); return replicateLog(r, logEntry, response); } int StateMachine::del(easy_request_t *r, TextRequest *cmd, LogOperation op) { TextResponse *response= new TextResponse(); if (response == NULL) { LOG_INFO("Fail to allocate TextResponse!\n"); abort(); } if (state_.load() != Raft::LEADER) { response->setResult(TEXT_ERROR); return srv_->sendResponse(r, response); } const char* key; std::size_t keyLen; std::tie(key, keyLen)= cmd->getKey(); std::string keyStr(key, keyLen); LogEntry logEntry; logEntry.set_key(keyStr); logEntry.set_optype(op); return replicateLog(r, logEntry, response); } int StateMachine::replicateLog(easy_request_t *r, LogEntry &logEntry, TextResponse *response) { uint64_t curIndex= raft_->replicateLog(logEntry); if (curIndex == -1) { response->setResult(TEXT_ERROR); return srv_->sendResponse(r, response); } { std::lock_guard lg(lock_); ClientRsp& ack= clientRsps_[curIndex]; ack.response= response; ack.easyReq= r; } return EASY_ABORT; } };/* end of namespace alisql */