187 lines
5.3 KiB
C++
187 lines
5.3 KiB
C++
#include <boost/crc.hpp>
|
|
#include <cstdlib>
|
|
#include <string>
|
|
#include <string.h>
|
|
|
|
#include "plugin/galaxy/udf/udf.h"
|
|
|
|
class HashCheckCaculator {
|
|
public:
|
|
/**
|
|
* use ECMA-182 CRC Polynomial
|
|
*/
|
|
using Crc = boost::crc_optimal<64, 0x42F0E1EBA9EA3693, 0xffffffffffffffff,
|
|
0xffffffffffffffff, false, false>;
|
|
|
|
HashCheckCaculator() : crc64(), hashVal(0), isFirstCacl(true) {}
|
|
|
|
/**
|
|
* this function process every row element include NULL element.
|
|
* if element is NULL, it will feed a null_tag(0xfe) to crc64.
|
|
* this function also attach a separator_tag(0xff) to each element.
|
|
* */
|
|
void rowElementUpdate(const char *buffer, size_t size) {
|
|
if (buffer == nullptr) {
|
|
crc64.process_byte(null_tag);
|
|
} else {
|
|
crc64.process_bytes(buffer, size);
|
|
}
|
|
crc64.process_byte(separator_tag);
|
|
}
|
|
|
|
/**
|
|
* get the row digest and feed to myhash.
|
|
* */
|
|
void rowUpdate() {
|
|
Crc::value_type crcResult = crc64.checksum();
|
|
if (!isFirstCacl) {
|
|
hashVal = myhash(static_cast<uint64_t>(crcResult));
|
|
} else {
|
|
isFirstCacl = false;
|
|
hashVal = crcResult;
|
|
}
|
|
crc64.reset();
|
|
}
|
|
|
|
uint64_t getDigest() const { return hashVal; }
|
|
|
|
void reset() {
|
|
crc64.reset();
|
|
isFirstCacl = true;
|
|
hashVal = 0;
|
|
}
|
|
|
|
bool hasData() const { return !isFirstCacl; }
|
|
|
|
private:
|
|
inline uint64_t myhash(uint64_t data) const {
|
|
return fact_p + fact_q * (hashVal + data) + fact_r * hashVal * data;
|
|
}
|
|
|
|
Crc crc64;
|
|
|
|
uint64_t hashVal;
|
|
bool isFirstCacl;
|
|
|
|
const static uint64_t fact_p;
|
|
const static uint64_t fact_q;
|
|
const static uint64_t fact_r;
|
|
const static char separator_tag;
|
|
const static char null_tag;
|
|
};
|
|
|
|
// Coefficients of the row-folding polynomial p + q*(h + d) + r*h*d.
const uint64_t HashCheckCaculator::fact_p = 3860031;
const uint64_t HashCheckCaculator::fact_q = 2779;
const uint64_t HashCheckCaculator::fact_r = 2;

// Byte appended to the row CRC after every element.
const char HashCheckCaculator::separator_tag = static_cast<char>(0xff);
// Byte fed to the row CRC in place of a NULL element.
const char HashCheckCaculator::null_tag = static_cast<char>(0xfe);
|
|
|
|
|
|
bool hashcheck_init(UDF_INIT *initid, UDF_ARGS *args, char *message);
|
|
void hashcheck_deinit(UDF_INIT *initid);
|
|
void hashcheck_reset(UDF_INIT *initid, UDF_ARGS *args, unsigned char *is_null,
|
|
unsigned char *error);
|
|
void hashcheck_clear(UDF_INIT *initid, unsigned char *is_null, unsigned char *error);
|
|
void hashcheck_add(UDF_INIT *initid, UDF_ARGS *args, unsigned char *is_null,
|
|
unsigned char *error);
|
|
longlong hashcheck(UDF_INIT *initid, UDF_ARGS *args, unsigned char *is_null,
|
|
unsigned char *error);
|
|
void hashcheck_udf(gs::udf::Udf_definition *def);
|
|
|
|
|
|
/**
 * Validate the call and allocate the HashCheckCaculator that the other
 * entry points retrieve from initid->ptr.
 *
 * @param initid   receives the calculator in ptr and has maybe_null set
 * @param args     only arg_count is inspected; it must be non-zero
 * @param message  receives a NUL-terminated error text on failure
 * @return true on failure (no arguments, or allocation failed),
 *         false on success
 */
bool hashcheck_init(UDF_INIT *initid, UDF_ARGS *args, char *message) {
  HashCheckCaculator *state = nullptr;
  const char *failure = nullptr;

  if (args->arg_count == 0) {
    failure =
        "wrong number of arguments: hashcheck() requires at least one "
        "argument";
  } else {
    // Non-throwing new: allocation failure is reported through `message`.
    state = new (std::nothrow) HashCheckCaculator;
    if (state == nullptr) {
      failure = "memory allocate error";
    }
  }

  if (failure != nullptr) {
    strcpy(message, failure);
    return true;
  }

  initid->maybe_null = 1;
  initid->ptr = reinterpret_cast<char *>(state);
  return false;
}
|
|
|
|
/**
 * Legacy aggregate entry point: begin a new group whose first row is `args`.
 *
 * Under the MySQL UDF aggregate protocol, xxx_reset() must reset the summary
 * state and then use the supplied row as the first value of the group. The
 * previous implementation only reset the state, so the first row of a group
 * would have been left out of the digest. It is now expressed as
 * clear-then-add, mirroring the MySQL sample UDFs.
 *
 * @param initid   carries the HashCheckCaculator in ptr
 * @param args     first row of the new group
 * @param is_null  forwarded unchanged to hashcheck_clear / hashcheck_add
 * @param error    forwarded unchanged to hashcheck_clear / hashcheck_add
 */
void hashcheck_reset(UDF_INIT *initid, UDF_ARGS *args, unsigned char *is_null,
                     unsigned char *error) {
  hashcheck_clear(initid, is_null, error);
  hashcheck_add(initid, args, is_null, error);
}
|
|
|
|
void hashcheck_clear(UDF_INIT *initid, unsigned char *is_null, unsigned char *error) {
|
|
HashCheckCaculator *caculator =
|
|
reinterpret_cast<HashCheckCaculator *>(initid->ptr);
|
|
caculator->reset();
|
|
}
|
|
|
|
/**
 * Fold one row into the digest held by the calculator in initid->ptr.
 *
 * Each argument is fed as one row element, in argument order:
 *  - a null args->args[i] is fed as a NULL element;
 *  - STRING_RESULT / DECIMAL_RESULT feed args->lengths[i] bytes;
 *  - REAL_RESULT feeds sizeof(double) bytes;
 *  - INT_RESULT feeds sizeof(long long) bytes;
 *  - any other result type is skipped entirely (no bytes, no separator).
 * The row is then closed with rowUpdate().
 *
 * @param initid   carries the HashCheckCaculator in ptr
 * @param args     values of the current row
 * @param is_null  unused
 * @param error    unused
 */
void hashcheck_add(UDF_INIT *initid, UDF_ARGS *args, unsigned char *is_null,
                   unsigned char *error) {
  HashCheckCaculator *caculator =
      reinterpret_cast<HashCheckCaculator *>(initid->ptr);

  // UDF_ARGS::arg_count is unsigned in the MySQL UDF headers; use an unsigned
  // index so the loop condition does not mix signed and unsigned operands.
  for (unsigned int i = 0; i < args->arg_count; ++i) {
    const char *value = args->args[i];
    if (value == nullptr) {
      caculator->rowElementUpdate(nullptr, 0);
      continue;
    }

    switch (args->arg_type[i]) {
      case STRING_RESULT:
      case DECIMAL_RESULT:
        // Both carry an explicit byte length.
        caculator->rowElementUpdate(value, args->lengths[i]);
        break;
      case REAL_RESULT:
        caculator->rowElementUpdate(value, sizeof(double));
        break;
      case INT_RESULT:
        caculator->rowElementUpdate(value, sizeof(long long));
        break;
      default:
        // Other result types contribute nothing to the row.
        break;
    }
  }

  caculator->rowUpdate();
}
|
|
|
|
/**
 * Produce the aggregate result from the calculator in initid->ptr.
 *
 * Increments gs::udf::udf_counter.hashcheck_counter on every call.
 *
 * @param initid   carries the HashCheckCaculator in ptr
 * @param args     unused
 * @param is_null  set to 1 when no row has been accumulated
 * @param error    unused
 * @return the accumulated digest, or 0 (with *is_null set) when there is
 *         no data
 */
longlong hashcheck(UDF_INIT *initid, UDF_ARGS *args, unsigned char *is_null,
                   unsigned char *error) {
  gs::udf::udf_counter.hashcheck_counter++;

  const HashCheckCaculator *state =
      reinterpret_cast<HashCheckCaculator *>(initid->ptr);

  if (state->hasData()) {
    return static_cast<longlong>(state->getDigest());
  }

  // Nothing was accumulated: report SQL NULL.
  *is_null = 1;
  return 0;
}
|
|
|
|
/**
 * Destroy the calculator allocated by hashcheck_init and clear initid->ptr
 * so that no stale pointer is left behind.
 */
void hashcheck_deinit(UDF_INIT *initid) {
  delete reinterpret_cast<HashCheckCaculator *>(initid->ptr);
  initid->ptr = nullptr;
}
|
|
|
|
/**
 * Fill `def` with the registration data for hashcheck(): an aggregate
 * function named "hashcheck" with an integer result.
 *
 * hashcheck_reset is not registered here; only init, deinit, add, clear and
 * the result function are wired in.
 */
void hashcheck_udf(gs::udf::Udf_definition *def) {
  // Identity of the function.
  def->m_name = const_cast<char *>("hashcheck");
  def->m_type = UDFTYPE_AGGREGATE;
  def->m_result = INT_RESULT;

  // State lifetime.
  def->m_func_init = (Udf_func_init)hashcheck_init;
  def->m_func_deinit = (Udf_func_deinit)hashcheck_deinit;

  // Per-group and per-row callbacks.
  def->m_func_clear = (Udf_func_clear)hashcheck_clear;
  def->m_func_add = (Udf_func_add)hashcheck_add;

  // Result callback.
  def->m_func = (Udf_func_any)hashcheck;
}