polardbxengine/storage/xengine/core/compact/compaction.h


/*
* Copyright (c) 2020, Alibaba Group Holding Limited
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
* http://www.apache.org/licenses/LICENSE-2.0
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef XENGINE_STORAGE_COMPACTION_H_
#define XENGINE_STORAGE_COMPACTION_H_
#include "compaction_stats.h"
#include "task_type.h"
#include "memory/page_arena.h"
#include "reuse_block_merge_iterator.h"
#include "storage/storage_manager.h"
#include "table/block.h"
#include "table/two_level_iterator.h"
#include "util/aligned_buffer.h"
#include "xengine/cache.h"
#include "xengine/env.h"
#include "xengine/options.h"
#include "util/threadpool_imp.h"
#ifdef FPGA_COMPACTION
#include "db-compaction-fpga/comp_aclr.h"
#include "db-compaction-fpga/comp_stats.h"
#else
// ensure <inttypes.h> is always included; only the macro definition is guarded
#ifndef __STDC_FORMAT_MACROS
#define __STDC_FORMAT_MACROS
#endif
#include <inttypes.h>
#include "fpga/comp_stats.h"
#include "fpga/comp_io.h"
#endif
namespace xengine {
namespace db {
class InternalKeyComparator;
//enum CompactionTaskType : int;
} // namespace db
namespace util {
class EnvOptions;
class Env;
class Comparator;
}
namespace common {
class ImmutableCFOptions;
class MutableCFOptions;
}
namespace table {
class BlockIter;
}
namespace storage {
class ExtentSpaceManager;
class StorageLogger;
class CompactionIterator;
class NewCompactionIterator;
// This function, which is used only by compaction, was moved from the legacy
// db/compaction.h.
common::CompressionType GetCompressionType(
const common::ImmutableCFOptions &ioptions,
const common::MutableCFOptions &mutable_cf_options, int level,
const bool enable_compression = true);
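// A minimal call sketch (hypothetical `ioptions` and `mopts` objects; picks
// the compression for output extents on level 1):
//   common::CompressionType type = GetCompressionType(ioptions, mopts, 1);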
enum CompactionMode {
kTypeCPU = 0,
kTypeFPGA = 1,
kTypeCheck = 2,
kTypeMax = 3
};
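// Descriptor for one asynchronous (FPGA) compaction task. comp_param_ is the
// input handed to the FPGA driver via
// CompactionScheduler::put_task_to_device(); judging by their names, the
// output_blocks_* fields receive the merged output blocks.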
struct AsyncCompactionTaskInfo {
public:
AsyncCompactionTaskInfo() {
comp_mode_ = kTypeMax;
output_blocks_ = nullptr;
output_blocks_size_ = nullptr;
output_blocks_num_ = 0;
}
~AsyncCompactionTaskInfo() {}
public:
CompactionMode comp_mode_;
char *output_blocks_;
size_t *output_blocks_size_;
size_t output_blocks_num_;
CompIO comp_param_;
CompStats comp_stats_;
};
class MinorCompaction;
class MinorTask;
// Compaction task control block (TCB)
class CompactionTCB {
public:
uint64_t task_id_;
uint64_t cpu_failed_;
uint64_t fpga_failed_;
MinorCompaction *minor_compaction_;
MinorTask *minor_task_;
// the following fields are for statistics
uint64_t queue_ts_; // timestamp when the task is posted to a queue
uint64_t start_process_ts_; // timestamp when processing starts
uint64_t process_done_ts_;
uint64_t start_done_ts_;
uint64_t task_done_ts_;
};
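// Timestamp lifecycle of a CompactionTCB, as set by the scheduler below:
// queue_ts_ when the TCB is pushed onto a CPU/FPGA queue,
// start_process_ts_ when a worker pops it for processing, and
// start_done_ts_ when it is popped from the done queue.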
//
// Compaction scheduler: schedules compaction tasks on both the CPU and the
// FPGA. It can run in three modes:
// (1) kTypeCPU   use the CPU only
// (2) kTypeFPGA  use the FPGA if available; fall back to the CPU only for
//                tasks the FPGA cannot handle
// (3) kTypeCheck run on both the CPU and the FPGA to check correctness
//
class CompactionScheduler {
public:
CompactionScheduler(const CompactionMode mode_p,
const int fpga_device_id,
const uint32_t fpga_driver_thread_num,
const uint32_t cpu_compaction_thread_num,
monitor::Statistics* stats_p) : stats_(stats_p) {
this->config_mode_ = mode_p;
this->comp_task_id_.store(0);
this->done_task_id_.store(0);
this->trival_task_num_.store(0);
this->succeed_fpga_task_num_.store(0);
this->succeed_cpu_task_num_.store(0);
this->failed_cpu_task_num_.store(0);
this->failed_fpga_task_num_.store(0);
this->fpga_device_id_ = fpga_device_id;
this->fpga_driver_thread_num_ = fpga_driver_thread_num;
this->cpu_compaction_thread_num_ = cpu_compaction_thread_num;
this->inited_ = false;
this->fpga_driver_ = nullptr;
this->stop_.store(false);
this->thread_num_.store(0);
this->thread_pool_impl_ = nullptr;
}
~CompactionScheduler() {
if (!inited_) return;
// TODO: wait for all work to finish before destroying fpga_driver_
#ifdef FPGA_COMPACTION
if (fpga_driver_ != nullptr) {
//google::ShutdownGoogleLogging();
delete fpga_driver_;
}
#endif
this->stop();
assert(get_cpu_task_num() == 0);
assert(get_fpga_task_num() == 0);
assert(get_done_task_num() == 0);
//this->thread_pool_impl_->JoinAllThreads();
if (nullptr != thread_pool_impl_)
delete thread_pool_impl_;
}
int init();
void switch_mode_if_needed();
bool stopped() {return this->stop_.load();}
void stop() {
if (stopped()) {
return;
}
this->stop_.store(true);
// wait for all threads to exit
this->thread_pool_impl_->JoinAllThreads();
}
uint64_t get_thread_num() {
return this->thread_num_.load(std::memory_order_relaxed);
}
void maybe_add_worker();
void increase_worker_thread() {
if (this->thread_num_.load() >= cpu_compaction_thread_num_)
return;
this->thread_num_.fetch_add(1);
this->thread_pool_impl_->Schedule(&bg_work_minor_compaction,
this, nullptr, nullptr);
}
void decrease_worker_thread() {
this->thread_num_.fetch_sub(1);
}
void check_and_maybe_dump_status();
void record_tcb_stats(CompactionTCB* tcb);
uint64_t get_trival_task_num() {
return this->trival_task_num_.load(std::memory_order_relaxed);
}
uint64_t get_succeed_fpga_task_num() {
return this->succeed_fpga_task_num_.load(std::memory_order_relaxed);
}
uint64_t get_succeed_cpu_task_num() {
return this->succeed_cpu_task_num_.load(std::memory_order_relaxed);
}
uint64_t get_failed_fpga_task_num() {
return this->failed_fpga_task_num_.load(std::memory_order_relaxed);
}
uint64_t get_failed_cpu_task_num() {
return this->failed_cpu_task_num_.load(std::memory_order_relaxed);
}
int post_compaction_task(MinorCompaction* minor_compaction,
std::vector<MinorTask*> &task_list);
int do_compaction_job(uint64_t &job_done);
int do_post_done_job(CompactionTCB* tcb);
static void bg_work_minor_compaction(void *arg);
uint64_t get_current_ts() { return util::Env::Default()->NowMicros();}
uint64_t next_task_id() { return this->comp_task_id_.fetch_add(1); }
uint64_t get_current_task_id() {
return this->comp_task_id_.load(std::memory_order_relaxed);
}
bool put_task_to_device(AsyncCompactionTaskInfo *task) {
assert(nullptr != task);
#ifdef FPGA_COMPACTION
return this->fpga_driver_->MergeBlocks(&(task->comp_param_));
#else
return false;
#endif
}
uint64_t get_cpu_task_num() {
std::unique_lock<std::mutex> lck(cpu_queue_mutex_);
return this->cpu_queue_.size();
}
void add_cpu_task(CompactionTCB * tcb) {
assert(tcb != nullptr);
std::unique_lock<std::mutex> lck(cpu_queue_mutex_);
this->cpu_queue_.push_back(tcb);
tcb->queue_ts_ = this->get_current_ts();
}
CompactionTCB* get_cpu_task() {
CompactionTCB* tcb = nullptr;
std::unique_lock<std::mutex> lck(cpu_queue_mutex_);
if (this->cpu_queue_.size() == 0)
return tcb;
tcb = this->cpu_queue_.front();
this->cpu_queue_.pop_front();
tcb->start_process_ts_ = this->get_current_ts();
return tcb;
}
uint64_t get_fpga_task_num() {
// fpga_queue_ is guarded by fpga_queue_mutex_
std::unique_lock<std::mutex> lck(fpga_queue_mutex_);
return this->fpga_queue_.size();
}
void add_fpga_task_hight_priority(CompactionTCB * tcb) {
assert(tcb != nullptr);
std::unique_lock<std::mutex> lck(fpga_queue_mutex_);
this->fpga_queue_.push_front(tcb);
tcb->queue_ts_ = this->get_current_ts();
}
void add_fpga_task(CompactionTCB * tcb) {
assert(tcb != nullptr);
std::unique_lock<std::mutex> lck(fpga_queue_mutex_);
this->fpga_queue_.push_back(tcb);
tcb->queue_ts_ = this->get_current_ts();
}
CompactionTCB* get_fpga_task() {
CompactionTCB* tcb = nullptr;
std::unique_lock<std::mutex> lck(fpga_queue_mutex_);
if (this->fpga_queue_.size() == 0)
return tcb;
tcb = this->fpga_queue_.front();
this->fpga_queue_.pop_front();
tcb->start_process_ts_ = this->get_current_ts();
return tcb;
}
uint64_t get_done_task_num() {
// done_queue_ is guarded by done_queue_mutex_
std::unique_lock<std::mutex> lck(done_queue_mutex_);
return this->done_queue_.size();
}
void add_done_task(CompactionTCB * tcb);
CompactionTCB* get_done_task() {
CompactionTCB* tcb = nullptr;
std::unique_lock<std::mutex> lck(done_queue_mutex_);
if (this->done_queue_.size() == 0)
return tcb;
tcb = this->done_queue_.front();
this->done_queue_.pop_front();
tcb->start_done_ts_ = this->get_current_ts();
return tcb;
}
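// Number of tasks issued but not yet finished, derived from the two
// monotonically increasing id counters; e.g. comp_task_id_ == 10 with
// done_task_id_ == 6 reports 3 ongoing tasks.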
uint64_t get_ongoing_task_num() {
uint64_t done_id = this->done_task_id_.load(std::memory_order_relaxed);
uint64_t current_id = this->comp_task_id_.load(std::memory_order_relaxed);
return current_id > (done_id + 1) ? (current_id - done_id - 1) : 0;
}
void set_mode(int run_mode) {
assert(run_mode >= 0);
assert(run_mode < kTypeMax);
current_mode_.store(run_mode);
}
CompactionMode get_mode() {
return static_cast<CompactionMode>(current_mode_.load());
}
std::string get_mode_str() {
std::string ret = "unknown";
if (get_mode() == kTypeCPU)
ret = "cpu";
else if (get_mode() == kTypeFPGA)
ret = "fpga";
else {
assert(get_mode() == kTypeCheck);
ret = "check";
}
return ret;
}
public:
bool inited_;
std::atomic<int> current_mode_;
CompactionMode config_mode_;
std::atomic<uint64_t> comp_task_id_;
std::atomic<uint64_t> done_task_id_;
// statistics
std::atomic<uint64_t> trival_task_num_;
std::atomic<uint64_t> succeed_fpga_task_num_;
std::atomic<uint64_t> succeed_cpu_task_num_;
std::atomic<uint64_t> failed_fpga_task_num_;
std::atomic<uint64_t> failed_cpu_task_num_;
std::atomic<bool> stop_;
monitor::Statistics* stats_;
std::atomic<uint64_t> thread_num_;
util::ThreadPoolImpl* thread_pool_impl_;
//FPGA
int fpga_device_id_;
uint32_t fpga_driver_thread_num_;
std::atomic<int> fpga_status_; // 0: ok, otherwise: error
#ifdef FPGA_COMPACTION
CompAclr *fpga_driver_;
#else
void* fpga_driver_;
#endif
//CPU
uint32_t cpu_compaction_thread_num_;
std::mutex cpu_queue_mutex_;
std::deque<CompactionTCB*> cpu_queue_;
std::mutex fpga_queue_mutex_;
std::deque<CompactionTCB*> fpga_queue_;
std::mutex done_queue_mutex_;
std::deque<CompactionTCB*> done_queue_;
};
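// A minimal usage sketch for the CompactionScheduler above (assumptions:
// `stats` is a monitor::Statistics* supplied by the caller, the thread counts
// are illustrative, and init() returning 0 means success, per the int-return
// convention in this file):
//
//   CompactionScheduler scheduler(kTypeCPU,
//                                 0 /*fpga_device_id*/,
//                                 4 /*fpga_driver_thread_num*/,
//                                 8 /*cpu_compaction_thread_num*/,
//                                 stats);
//   if (0 == scheduler.init()) {
//     // post work via post_compaction_task(), then shut down:
//     scheduler.stop();
//   }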
void CompactionMainLoop(void *arg);
struct CompactionContext {
const std::atomic<bool> *shutting_down_;
const std::atomic<bool> *bg_stopped_;
const std::atomic<int64_t> *cancel_type_;
const common::ImmutableCFOptions *cf_options_;
const common::MutableCFOptions *mutable_cf_options_;
const util::EnvOptions *env_options_;
// user comparator at the data block level
const util::Comparator *data_comparator_;
const db::InternalKeyComparator *internal_comparator_;
ExtentSpaceManager *space_manager_;
int64_t table_space_id_;
common::SequenceNumber earliest_write_conflict_snapshot_;
std::vector<common::SequenceNumber> existing_snapshots_;
// may be 0 (StreamCompaction), 1 (MinorCompaction for FPGA),
// or 2 (NewStreamCompaction)
int32_t minor_compaction_type_;
CompactionScheduler *compaction_scheduler_;
// may be 0, 1, or 2; the default is 1
int32_t output_level_;
db::TaskType task_type_;
// Intra Level-0 compaction needs to force all reused and newly created
// extents to share the same layer sequence. The selected layer sequence
// must differ from the old layer sequences; we use the smallest
// (Extent.type_.sequence_ - 1) as the selected layer sequence.
// Default: -1, meaning it has not been set.
int64_t force_layer_sequence_;
storage::StorageLogger *storage_logger_;
bool enable_thread_tracking_;
bool need_check_snapshot_;
CompactionContext()
: shutting_down_(nullptr),
bg_stopped_(nullptr),
cancel_type_(nullptr),
cf_options_(nullptr),
mutable_cf_options_(nullptr),
env_options_(nullptr),
data_comparator_(nullptr),
internal_comparator_(nullptr),
space_manager_(nullptr),
table_space_id_(-1),
earliest_write_conflict_snapshot_(0),
minor_compaction_type_(0),
compaction_scheduler_(nullptr),
output_level_(1),
task_type_(db::TaskType::INVALID_TYPE_TASK),
force_layer_sequence_(-1),
storage_logger_(nullptr),
enable_thread_tracking_(false),
need_check_snapshot_(true)
{
existing_snapshots_.clear();
}
bool valid() const {
return nullptr != shutting_down_ && nullptr != bg_stopped_ &&
nullptr != cf_options_ &&
nullptr != mutable_cf_options_ && nullptr != env_options_ &&
nullptr != data_comparator_ && nullptr != internal_comparator_ &&
nullptr != space_manager_ && table_space_id_ >= 0 &&
nullptr != storage_logger_;
}
};
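// A minimal setup sketch for the CompactionContext above (hypothetical
// surrounding objects; valid() requires every pointer below plus a
// non-negative table_space_id_):
//
//   CompactionContext ctx;
//   ctx.shutting_down_ = &shutting_down;
//   ctx.bg_stopped_ = &bg_stopped;
//   ctx.cf_options_ = &immutable_cf_options;
//   ctx.mutable_cf_options_ = &mutable_cf_options;
//   ctx.env_options_ = &env_options;
//   ctx.data_comparator_ = user_comparator;
//   ctx.internal_comparator_ = &internal_key_comparator;
//   ctx.space_manager_ = space_manager;
//   ctx.table_space_id_ = 0;
//   ctx.storage_logger_ = storage_logger;
//   assert(ctx.valid());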
struct ColumnFamilyDesc {
int32_t column_family_id_;
std::string column_family_name_;
ColumnFamilyDesc() : column_family_id_(0), column_family_name_("default") {}
ColumnFamilyDesc(int32_t id, const std::string &name)
: column_family_id_(id), column_family_name_(name) {}
};
class Compaction {
public:
// Compaction output
struct Statstics {
CompactRecordStats record_stats_;
CompactPerfStats perf_stats_;
MinorCompactStats minor_stats_;
};
enum Level {
L0 = 0,
L1 = 1,
L2 = 2,
};
public:
virtual ~Compaction() {}
// input extent metadata iterator;
virtual int add_merge_batch(
const MetaDescriptorList &extents,
const size_t start, const size_t end) = 0;
virtual int run() = 0;
virtual int cleanup() = 0;
virtual const Statstics &get_stats() const = 0;
virtual storage::ChangeInfo &get_change_info() = 0;
virtual const int64_t *get_input_extent() const = 0;
};
// compaction history information schema
struct CompactionJobStatsInfo {
uint32_t subtable_id_;
int64_t sequence_;
int type_;
Compaction::Statstics stats_;
};
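// A minimal driving sketch for the GeneralCompaction class below
// (hypothetical `ctx`, `cf`, `arena`, and `extents`; error handling elided,
// and 0 assumed to mean success per the int-return convention in this file):
//
//   GeneralCompaction compaction(ctx, cf, arena);
//   compaction.add_merge_batch(extents, 0, extents.size());
//   if (0 == compaction.run()) {
//     storage::ChangeInfo &change = compaction.get_change_info();
//     // apply `change` to the storage manager ...
//   }
//   compaction.cleanup();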
class GeneralCompaction : public Compaction {
static const int64_t DEFAULT_ROW_LENGTH = 8 * 1024; // 8 KB
static const int64_t RESERVE_READER_NUM = 64;
static const int64_t RESERVE_MERGE_WAY_SIZE = 16;
public:
GeneralCompaction(const CompactionContext &context,
const ColumnFamilyDesc &cf,
memory::ArenaAllocator &arena);
virtual ~GeneralCompaction();
virtual int run();
virtual int cleanup();
// input extent metadata iterator;
virtual int add_merge_batch(
const MetaDescriptorList &extents,
const size_t start, const size_t end);
virtual const Statstics &get_stats() const { return stats_; }
virtual storage::ChangeInfo &get_change_info() {
return change_info_;
}
virtual const int64_t *get_input_extent() const {
return input_extents_;
}
size_t get_extent_size() const { return merge_extents_.size(); }
// set level2's largest key
void set_level2_largest_key(const common::Slice *l2_largest_key) {
l2_largest_key_ = l2_largest_key;
}
// set delete percent
void set_delete_percent(const int64_t delete_percent) {
delete_percent_ = delete_percent;
}
int down_level_extent(const MetaDescriptor &extent);
int copy_data_block(const MetaDescriptor &data_block
/*const common::XengineSchema *schema*/);
int create_extent_index_iterator(const MetaDescriptor &extent,
size_t &iterator_index,
DataBlockIterator *&iterator,
ExtSEIterator::ReaderRep &rep);
int destroy_extent_index_iterator(const int64_t iterator_index);
int delete_extent_meta(const MetaDescriptor &extent);
bool check_do_reuse(const MetaDescriptor &meta) const;
protected:
friend class ExtSEIterator;
int open_extent();
int close_extent(db::MiniTables *flush_tables = nullptr);
void start_record_compaction_stats();
void stop_record_compaction_stats();
int build_multiple_seiterators(const int64_t batch_index,
const storage::BlockPosition &batch,
MultipleSEIterator *&merge_iterator);
int build_compactor(NewCompactionIterator *&compactor,
MultipleSEIterator *merge_iterator);
int merge_extents(MultipleSEIterator *&merge_iterator,
db::MiniTables *flush_tables = nullptr);
int prefetch_extent(int64_t extent_id);
int get_table_reader(const MetaDescriptor &extent,
table::TableReader *&reader);
int get_extent_index_iterator(const MetaDescriptor &extent,
table::TableReader *&reader,
table::BlockIter *&index_iterator);
int get_extent_index_iterator(table::TableReader *reader,
table::BlockIter *&index_iterator);
int create_data_block_iterator(const storage::BlockPosition &data_block,
table::TableReader *reader,
table::BlockIter *&block_iterator);
int destroy_data_block_iterator(table::BlockIter *block_iterator);
AsyncRandomAccessExtent *get_async_extent_reader(int64_t extent_id) const;
void destroy_async_extent_reader(int64_t extent_id, bool is_reuse = false);
int switch_schema_for_block(const MetaDescriptor &data_block,
/*const common::XengineSchema *src_schema,*/
table::TableReader *table_reader);
virtual void clear_current_readers();
virtual void clear_current_writers();
void record_compaction_iterator_stats(
const NewCompactionIterator &compactor,
CompactRecordStats &stats);
protected:
using PrefetchExtentMap = std::unordered_map<int64_t, AsyncRandomAccessExtent *,
std::hash<int64_t>, std::equal_to<int64_t>,
memory::stl_adapt_allocator<std::pair<int64_t,
AsyncRandomAccessExtent *>, memory::ModId::kCompaction>>;
// TODO: default options for creating the table builder; remove in the future
std::vector<std::unique_ptr<db::IntTblPropCollectorFactory>> props_;
std::string compression_dict_;
// options for creating builders and readers
CompactionContext context_;
ColumnFamilyDesc cf_desc_;
// all extents to be merged in one compaction task.
MetaDescriptorList merge_extents_;
// [start, end) subtask ranges within merge_extents_
BlockPositionList merge_batch_indexes_;
PrefetchExtentMap prefetch_extents_;
// compaction writer
bool write_extent_opened_;
std::unique_ptr<table::TableBuilder> extent_builder_;
util::WritableBuffer block_buffer_;
db::MiniTables mini_tables_;
storage::ChangeInfo change_info_;
// Compaction result, written meta, statistics;
Statstics stats_;
// memory::ArenaAllocator row_arena_; // for schema switch
// information schema compaction input extents
int64_t input_extents_[3]; // TODO: check max_level
// for minor
const common::Slice *l2_largest_key_;
// for major
int64_t delete_percent_;
ExtSEIterator *se_iterators_;
memory::ArenaAllocator arena_;
memory::WrapAllocator wrap_alloc_;
memory::stl_adapt_allocator<ExtSEIterator::ReaderRep> stl_alloc_;
using ReadRepList = std::vector<ExtSEIterator::ReaderRep, memory::stl_adapt_allocator<ExtSEIterator::ReaderRep>>;
ReadRepList reader_reps_;
};
} // namespace storage
} // namespace xengine
#endif // XENGINE_STORAGE_COMPACTION_H_