509 lines
14 KiB
C++
509 lines
14 KiB
C++
/* Copyright (c) 2018, 2021, Alibaba and/or its affiliates. All rights reserved.
|
|
This program is free software; you can redistribute it and/or modify
|
|
it under the terms of the GNU General Public License, version 2.0,
|
|
as published by the Free Software Foundation.
|
|
This program is also distributed with certain software (including
|
|
but not limited to OpenSSL) that is licensed under separate terms,
|
|
as designated in a particular file or component or in included license
|
|
documentation. The authors of MySQL/PolarDB-X Engine hereby grant you an
|
|
additional permission to link the program and your derivative works with the
|
|
separately licensed software that they have included with
|
|
MySQL/PolarDB-X Engine.
|
|
This program is distributed in the hope that it will be useful,
|
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
GNU General Public License, version 2.0, for more details.
|
|
You should have received a copy of the GNU General Public License
|
|
along with this program; if not, write to the Free Software
|
|
Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */
|
|
|
|
#include "plugin/galaxy/udf/udf.h"
|
|
|
|
#include <vector>
|
|
#include <string>
|
|
#include <math.h>
|
|
#include <stdint.h>
|
|
#include <math.h>
|
|
#include <cstring>
|
|
|
|
#define HLL_P 14 /* The greater is P, the smaller the error. */
#define HLL_Q (64-HLL_P) /* The number of bits of the hash value used for */
                         /* determining the number of leading zeros. */
#define HLL_REGISTERS (1<<HLL_P) /* With P=14, 16384 registers. */
#define HLL_P_MASK (HLL_REGISTERS-1) /* Mask to index register. */
#define HLL_BITS 6 /* Enough to count up to 63 leading zeroes. */
#define HLL_REGISTER_MAX ((1<<HLL_BITS)-1)
#define HLL_ALPHA_INF 0.721347520444481703680 /* constant for 0.5/ln(2) */

/*
 * HyperLogLog cardinality sketch.
 *
 * HashT must be default-constructible and callable as
 *   uint64_t operator()(uint8_t *p, size_t len)
 * The low HLL_P bits of the hash select one of HLL_REGISTERS registers; the
 * remaining HLL_Q bits give the run length that is stored in the register.
 * Registers are HLL_BITS wide and packed back to back in a byte vector.
 */
template<class HashT>
class HyperLogLog {
  /* Packed array of HLL_REGISTERS registers, HLL_BITS bits each. */
  class Registers {
   public:
    Registers() {
      /* Allocate 1 byte more explicitly so that get()/set(), which always
       * touch byte and byte+1, never index past the end of the array. */
      m_regs.resize((HLL_REGISTERS * HLL_BITS / 8) + 1, 0);
    }

    /* Size of the backing storage in bytes, including the padding byte. */
    inline size_t size() const { return m_regs.size(); }

    /* Set every register to zero. */
    inline void reset() { memset(m_regs.data(), 0, size()); }

    /*
     * Load register data from buf.
     *
     * Accepts any len up to size(); bytes not covered by buf (at least the
     * padding byte when len == size() - 1) are zeroed. Previously only
     * len == size() was accepted, which made it impossible to load the
     * size() - 1 byte payload that HyperLogLog::InitRegs() hands over.
     *
     * Returns 0 on success, -1 if buf is NULL or len exceeds the storage.
     */
    inline int init(const uint8_t *buf, size_t len) {
      if (buf == NULL || len > size()) return -1;
      uint8_t *p = m_regs.data();
      memcpy(p, buf, len);
      memset(p + len, 0, size() - len);
      return 0;
    }

    /* Read register pos (0 <= pos < HLL_REGISTERS). */
    inline uint8_t get(uint32_t pos) const {
      const uint32_t first_bit = pos * HLL_BITS;
      const uint32_t byte = first_bit / 8;
      const uint32_t bit = first_bit & 7;
      /* A register may straddle two bytes: low part from byte, high part
       * from byte+1. */
      return ((m_regs[byte] >> bit) | (m_regs[byte+1] << (8-bit))) & HLL_REGISTER_MAX;
    }

    /* Write val (expected to fit in HLL_BITS bits) into register pos. */
    inline void set(uint32_t pos, uint8_t val) {
      const uint32_t first_bit = pos * HLL_BITS;
      const uint32_t byte = first_bit / 8;
      const uint32_t bit = first_bit & 7;
      m_regs[byte] &= ~(HLL_REGISTER_MAX << bit);
      m_regs[byte] |= (val << bit);
      m_regs[byte+1] &= ~(HLL_REGISTER_MAX >> (8-bit));
      m_regs[byte+1] |= (val >> (8-bit));
    }

    /* Store val in register pos only if it is larger than the current one. */
    inline void ReplaceIfGreater(uint32_t pos, uint8_t val) {
      if (get(pos) < val) set(pos, val);
    }

    /* Register-wise maximum of this array and regs, stored in this array. */
    void merge(const Registers &regs) {
      /* 32-bit counter: a 16-bit one could never reach HLL_REGISTERS if
       * HLL_P were raised to 16. */
      for (uint32_t j = 0; j < HLL_REGISTERS; j++) {
        const uint8_t val = regs.get(j);
        if (val > get(j)) {
          set(j, val);
        }
      }
    }

    /*
     * Add, for every register, one to reghisto[register value].
     * reghisto must have at least HLL_REGISTER_MAX + 1 entries; existing
     * counts are incremented, not cleared.
     */
    void GetHistogram(std::vector<int> &reghisto) const {
      if (HLL_BITS == 6 && (HLL_REGISTERS % 4) == 0) {
        /* Fast path: with 6-bit registers every 3 bytes hold exactly 4
         * registers, so decode them group by group without get(). */
        const uint8_t *r = m_regs.data();
        for (uint32_t group = 0; group < HLL_REGISTERS / 4; group++) {
          reghisto[r[0] & 63]++;
          reghisto[(r[0] >> 6 | r[1] << 2) & 63]++;
          reghisto[(r[1] >> 4 | r[2] << 4) & 63]++;
          reghisto[(r[2] >> 2) & 63]++;
          r += 3;
        }
      } else {
        for (uint32_t j = 0; j < HLL_REGISTERS; j++) {
          reghisto[get(j)]++;
        }
      }
    }

    /* Pointer to the raw storage; *len (if given) receives size(). */
    const uint8_t *c_ptr(size_t *len = NULL) const {
      if (len) *len = this->size();
      return m_regs.data();
    }

   private:
    std::vector<uint8_t> m_regs;
  };

 public:
  /* Hash the len bytes at p and record the element in the sketch. */
  void add(uint8_t *p, size_t len) {
    const uint64_t hash = m_hash(p, len);
    m_registers.ReplaceIfGreater(position(hash), ZeroRunLength(hash));
  }

  /*
   * Estimate the number of distinct elements added so far, using the
   * sigma/tau based estimator from Ertl, arXiv:1702.01284.
   * An empty sketch yields 0.
   */
  size_t count() const {
    const double m = HLL_REGISTERS;

    std::vector<int> reghisto(HLL_REGISTER_MAX + 1, 0);
    m_registers.GetHistogram(reghisto);

    double z = m * tau((m-reghisto[HLL_Q+1])/(double)m);
    for (int j = HLL_Q; j >= 1; --j) {
      z += reghisto[j];
      z *= 0.5;
    }
    /* sigma() is INFINITY when all registers are zero, giving an estimate
     * of 0 below. */
    z += m * sigma(reghisto[0]/(double)m);

    const double E = llroundl(HLL_ALPHA_INF*m*m/z);
    return (size_t)E;
  }

  /* Fold another sketch into this one (register-wise maximum). */
  void merge(const HyperLogLog<HashT> &hll) {
    m_registers.merge(hll.m_registers);
  }

  /* Forget all elements. */
  inline void reset() { m_registers.reset(); }

  /* The reg data size should be 1 byte less than its real size. */
  inline size_t GetRegSize() const { return m_registers.size()-1; }

  /*
   * Replace the registers with the first GetRegSize() bytes of buf.
   *
   * Returns 0 on success, -1 if buf is NULL or len < GetRegSize().
   * Bytes beyond GetRegSize() are ignored.
   */
  int InitRegs(uint8_t *buf, size_t len) {
    if (buf == NULL || len < GetRegSize()) return -1;
    /* Propagate the loader's status instead of unconditionally reporting
     * success. */
    return m_registers.init(buf, GetRegSize());
  }

  /*
   * Copy the raw register storage into buf.
   *
   * On entry *len is the capacity of buf; on return it is the number of
   * bytes written, which is the smaller of that capacity and the internal
   * storage size (the internal storage includes the padding byte).
   * Returns 0 on success, -1 if buf or len is NULL or *len is 0.
   */
  int DumpRegs(char *buf, size_t *len) const {
    if (!buf || !len || !*len) return -1;
    size_t reg_size = 0;
    const uint8_t *regs = m_registers.c_ptr(&reg_size);
    if (*len > reg_size) *len = reg_size;
    memcpy(buf, regs, *len);
    return 0;
  }

 private:
  /* Helper function sigma as defined in
   * "New cardinality estimation algorithms for HyperLogLog sketches"
   * Otmar Ertl, arXiv:1702.01284 */
  static double sigma(double x) {
    if (x == 1.) return INFINITY;
    double zPrime;
    double y = 1;
    double z = x;
    /* Iterate until the sum stops changing in double precision. */
    do {
      x *= x;
      zPrime = z;
      z += x * y;
      y += y;
    } while(zPrime != z);
    return z;
  }

  /* Helper function tau as defined in
   * "New cardinality estimation algorithms for HyperLogLog sketches"
   * Otmar Ertl, arXiv:1702.01284 */
  static double tau(double x) {
    if (x == 0. || x == 1.) return 0.;
    double zPrime;
    double y = 1.0;
    double z = 1 - x;
    /* Iterate until the sum stops changing in double precision. */
    do {
      x = sqrt(x);
      zPrime = z;
      y *= 0.5;
      z -= pow(1 - x, 2)*y;
    } while(zPrime != z);
    return z / 3;
  }

  /* Register index: the low HLL_P bits of the full 64-bit hash. */
  static inline uint16_t position(uint64_t hash) {
    return static_cast<uint16_t>(hash & HLL_P_MASK);
  }

  /*
   * Number of trailing zero bits of (hash >> HLL_P), plus one.
   * A sentinel bit at position HLL_Q bounds the result to HLL_Q + 1.
   */
  static uint8_t ZeroRunLength(uint64_t hash) {
    uint8_t rl = 1;
    hash >>= HLL_P;
    hash |= ((uint64_t)1 << HLL_Q);
    while (!(hash & (uint64_t)1)) {
      rl++;
      hash >>= 1;
    }
    return rl;
  }

 private:
  HashT m_hash;
  Registers m_registers;
};
|
|
|
|
/*
 * 64-bit Murmur-style hash functor (multiplier 0xc6a4a7935bd1e995, shift 47)
 * used as the HashT of HyperLogLog.
 *
 * Input bytes are always interpreted in little-endian order, so the result
 * does not depend on host byte order.
 */
class MurmurHash {
 public:
  MurmurHash(uint64_t seed = 0xadc83b19ULL) :
    m_seed(seed) {}
  ~MurmurHash() {}

  /*
   * Hash the len bytes starting at p.
   * p may be NULL only when len is 0. p needs no particular alignment.
   */
  uint64_t operator() (uint8_t *p, size_t len) {
    const uint64_t m = 0xc6a4a7935bd1e995;
    const int r = 47;
    uint64_t h = m_seed ^ (len * m);
    const uint8_t *data = (const uint8_t *) p;
    const uint8_t *end = data + (len-(len&7));

    /* Body: consume the input eight bytes at a time. */
    while(data != end) {
      uint64_t k;

#ifdef WORDS_BIGENDIAN
      k = (uint64_t) data[0];
      k |= (uint64_t) data[1] << 8;
      k |= (uint64_t) data[2] << 16;
      k |= (uint64_t) data[3] << 24;
      k |= (uint64_t) data[4] << 32;
      k |= (uint64_t) data[5] << 40;
      k |= (uint64_t) data[6] << 48;
      k |= (uint64_t) data[7] << 56;
#else
      /* memcpy instead of dereferencing a casted pointer: the buffer is an
       * arbitrary byte pointer, so a direct uint64_t load would be an
       * unaligned / aliasing-violating access. */
      memcpy(&k, data, sizeof(k));
#endif

      k *= m;
      k ^= k >> r;
      k *= m;
      h ^= k;
      h *= m;
      data += 8;
    }

    /* Tail: mix in the remaining 0..7 bytes. */
    switch(len & 7) {
      case 7: h ^= (uint64_t)data[6] << 48; /* fall-thru */
      case 6: h ^= (uint64_t)data[5] << 40; /* fall-thru */
      case 5: h ^= (uint64_t)data[4] << 32; /* fall-thru */
      case 4: h ^= (uint64_t)data[3] << 24; /* fall-thru */
      case 3: h ^= (uint64_t)data[2] << 16; /* fall-thru */
      case 2: h ^= (uint64_t)data[1] << 8; /* fall-thru */
      case 1: h ^= (uint64_t)data[0];
              h *= m;
              break;
      default:
              break;
    }

    /* Final avalanche. */
    h ^= h >> r;
    h *= m;
    h ^= h >> r;
    return h;
  }

 private:
  uint64_t m_seed;
};
|
|
|
|
|
|
/*
|
|
bool hyperloglog_init(UDF_INIT *initid, UDF_ARGS *args, char *message);
|
|
void hyperloglog_deinit(UDF_INIT *initid);
|
|
void hyperloglog_clear(UDF_INIT *initid, char *is_null, char *error);
|
|
void hyperloglog_add(UDF_INIT *initid, UDF_ARGS *args, char *is_null, char *error);
|
|
char *hyperloglog(UDF_INIT *initid, UDF_ARGS *args, char *result,
|
|
unsigned long *length, char *is_null, char *error);
|
|
bool hllndv_init(UDF_INIT *initid, UDF_ARGS *args, char *message);
|
|
void hllndv_deinit(UDF_INIT *initid);
|
|
void hllndv_clear(UDF_INIT *initid, char *is_null, char *error);
|
|
void hllndv_add(UDF_INIT *initid, UDF_ARGS *args, char *is_null, char *error);
|
|
longlong hllndv(UDF_INIT *initid, UDF_ARGS *args, char *is_null, char *error);
|
|
*/
|
|
|
|
/**
  Init callback of the HYPERLOGLOG aggregate UDF.

  Validates that exactly one non-row argument is given and allocates the
  HyperLogLog sketch that the add/clear/result callbacks use through
  initid->ptr.

  @param initid   UDF state; ptr, max_length, maybe_null and const_item are set
  @param args     argument descriptor (count and types are checked)
  @param message  receives an error text on failure

  @return 0 on success, 1 on error (message is filled in)
*/
bool hyperloglog_init(UDF_INIT *initid, UDF_ARGS *args, char *message)
{
  if (args->arg_count != 1) {
    strcpy(message, "HyperLogLog accepts only one argument");
    return 1;
  }

  if (args->arg_type[0] == ROW_RESULT) {
    strcpy(message, "HyperLogLog cannot accept row type argument");
    return 1;
  }

  /* new (std::nothrow) only covers the object itself; the sketch constructor
     allocates its register array and may still throw, so guard that too and
     do not let an exception escape into the caller. */
  HyperLogLog<MurmurHash> *hll = NULL;
  try {
    hll = new (std::nothrow) HyperLogLog<MurmurHash>();
  } catch (const std::bad_alloc &) {
    hll = NULL;
  }
  if (!hll) {
    strcpy(message, "HyperLogLog failed to allocate memory");
    return 1;
  }

  initid->const_item = 0;
  initid->maybe_null = 0;
  initid->max_length = hll->GetRegSize();
  initid->ptr = (char *) hll;
  return 0;
}
|
|
|
|
/**
  Deinit callback of the HYPERLOGLOG UDF: destroys the sketch stored in
  initid->ptr (if any) and clears the pointer.
*/
void hyperloglog_deinit(UDF_INIT *initid)
{
  HyperLogLog<MurmurHash> *sketch =
      reinterpret_cast<HyperLogLog<MurmurHash> *>(initid->ptr);
  if (sketch == NULL) return;

  /* Detach first so initid never holds a dangling pointer. */
  initid->ptr = NULL;
  delete sketch;
}
|
|
|
|
/**
  Clear callback of the HYPERLOGLOG UDF: resets the sketch stored in
  initid->ptr so that it contains no elements.
*/
void hyperloglog_clear(UDF_INIT *initid,
                       char *is_null MY_ATTRIBUTE((unused)),
                       char *error MY_ATTRIBUTE((unused)))
{
  reinterpret_cast<HyperLogLog<MurmurHash> *>(initid->ptr)->reset();
}
|
|
|
|
/**
  Add callback of the HYPERLOGLOG UDF: feeds the current argument value into
  the sketch stored in initid->ptr.

  A NULL argument is added as an empty (NULL, 0) value. String and decimal
  arguments are hashed over args->lengths[0] bytes, integers over
  sizeof(longlong) bytes and reals over sizeof(double) bytes. Any other
  argument type is ignored.
*/
void hyperloglog_add(UDF_INIT *initid, UDF_ARGS *args,
                     char *is_null MY_ATTRIBUTE((unused)),
                     char *error MY_ATTRIBUTE((unused)))
{
  HyperLogLog<MurmurHash> *sketch =
      reinterpret_cast<HyperLogLog<MurmurHash> *>(initid->ptr);
  uint8_t *value = reinterpret_cast<uint8_t *>(args->args[0]);

  if (value == NULL) {
    sketch->add(NULL, 0);
    return;
  }

  /* Work out how many bytes of the argument buffer take part in the hash. */
  size_t nbytes = 0;
  switch (args->arg_type[0]) {
    case STRING_RESULT:
    case DECIMAL_RESULT:
      nbytes = args->lengths[0];
      break;
    case INT_RESULT:
      nbytes = sizeof(longlong);
      break;
    case REAL_RESULT:
      nbytes = sizeof(double);
      break;
    default:
      return;
  }

  sketch->add(value, nbytes);
}
|
|
|
|
/**
  Result callback of the HYPERLOGLOG UDF: serializes the sketch registers
  into the caller-provided result buffer.

  @param initid   UDF state holding the sketch in ptr
  @param result   output buffer
  @param length   in: capacity of result; out: number of bytes written, or
                  the required size when the buffer is too small
  @param error    set to 1 on failure

  @return result on success, NULL on error

  NOTE(review): DumpRegs() may write up to the internal storage size, which
  is one byte more than GetRegSize(), when *length allows it -- confirm that
  this is what consumers of the blob expect.
*/
char *hyperloglog(UDF_INIT *initid MY_ATTRIBUTE((unused)),
                  UDF_ARGS *args MY_ATTRIBUTE((unused)), char *result,
                  unsigned long *length, char *is_null MY_ATTRIBUTE((unused)),
                  char *error) {
  HyperLogLog<MurmurHash> *sketch =
      reinterpret_cast<HyperLogLog<MurmurHash> *>(initid->ptr);

  /* Refuse to produce a truncated blob; report the size that is needed. */
  const bool too_small = *length < sketch->GetRegSize();
  if (too_small) {
    *error = 1;
    *length = sketch->GetRegSize();
    return NULL;
  }

  gs::udf::udf_counter.hyperloglog_counter++;

  const int rc = sketch->DumpRegs(result, length);
  if (rc != 0) {
    *error = 1;
    return NULL;
  }
  return result;
}
|
|
|
|
/**
  Fill in the UDF definition for "hyperloglog": an aggregate function with a
  string result, wired to the hyperloglog_* callbacks.
*/
void hyperloglog_udf(gs::udf::Udf_definition *def) {
  /* Identity of the function. */
  def->m_name = const_cast<char *>("hyperloglog");
  def->m_type = UDFTYPE_AGGREGATE;
  def->m_result = STRING_RESULT;

  /* Lifecycle callbacks. */
  def->m_func_init = (Udf_func_init)hyperloglog_init;
  def->m_func_deinit = (Udf_func_deinit)hyperloglog_deinit;

  /* Aggregation callbacks. */
  def->m_func_clear = (Udf_func_clear)hyperloglog_clear;
  def->m_func_add = (Udf_func_add)hyperloglog_add;
  def->m_func = (Udf_func_any)hyperloglog;
}
|
|
|
|
/**
  Init callback of the HLLNDV aggregate UDF.

  Validates that exactly one non-row argument is given and allocates the
  HyperLogLog sketch that the add/clear/result callbacks use through
  initid->ptr.

  @param initid   UDF state; ptr, max_length, maybe_null and const_item are set
  @param args     argument descriptor (count and types are checked)
  @param message  receives an error text on failure

  @return 0 on success, 1 on error (message is filled in)
*/
bool hllndv_init(UDF_INIT *initid, UDF_ARGS *args, char *message)
{
  if (args->arg_count != 1) {
    strcpy(message, "HLLNDV accepts only one argument");
    return 1;
  }

  if (args->arg_type[0] == ROW_RESULT) {
    strcpy(message, "HLLNDV cannot accept row type argument");
    return 1;
  }

  /* new (std::nothrow) only covers the object itself; the sketch constructor
     allocates its register array and may still throw, so guard that too and
     do not let an exception escape into the caller. */
  HyperLogLog<MurmurHash> *hll = NULL;
  try {
    hll = new (std::nothrow) HyperLogLog<MurmurHash>();
  } catch (const std::bad_alloc &) {
    hll = NULL;
  }
  if (!hll) {
    strcpy(message, "HLLNDV failed to allocate memory");
    return 1;
  }

  initid->const_item = 0;
  initid->maybe_null = 0;
  /* NOTE(review): max_length is the sketch size in bytes although this
     function returns an integer -- kept as is, confirm whether intended. */
  initid->max_length = hll->GetRegSize();
  initid->ptr = (char *) hll;
  return 0;
}
|
|
|
|
/**
  Deinit callback of the HLLNDV UDF: destroys the sketch stored in
  initid->ptr (if any) and clears the pointer.
*/
void hllndv_deinit(UDF_INIT *initid)
{
  HyperLogLog<MurmurHash> *sketch =
      reinterpret_cast<HyperLogLog<MurmurHash> *>(initid->ptr);
  if (sketch == NULL) return;

  /* Detach first so initid never holds a dangling pointer. */
  initid->ptr = NULL;
  delete sketch;
}
|
|
|
|
/**
  Clear callback of the HLLNDV UDF: resets the sketch stored in initid->ptr
  so that it contains no elements.
*/
void hllndv_clear(UDF_INIT *initid,
                  char *is_null MY_ATTRIBUTE((unused)),
                  char *error MY_ATTRIBUTE((unused)))
{
  reinterpret_cast<HyperLogLog<MurmurHash> *>(initid->ptr)->reset();
}
|
|
|
|
/**
  Add callback of the HLLNDV UDF: feeds the current argument value into the
  sketch stored in initid->ptr.

  A NULL argument is added as an empty (NULL, 0) value. String and decimal
  arguments are hashed over args->lengths[0] bytes, integers over
  sizeof(longlong) bytes and reals over sizeof(double) bytes. Any other
  argument type is ignored.
*/
void hllndv_add(UDF_INIT *initid, UDF_ARGS *args,
                char *is_null MY_ATTRIBUTE((unused)),
                char *error MY_ATTRIBUTE((unused)))
{
  HyperLogLog<MurmurHash> *sketch =
      reinterpret_cast<HyperLogLog<MurmurHash> *>(initid->ptr);
  uint8_t *value = reinterpret_cast<uint8_t *>(args->args[0]);

  if (value == NULL) {
    sketch->add(NULL, 0);
    return;
  }

  /* Work out how many bytes of the argument buffer take part in the hash. */
  size_t nbytes = 0;
  switch (args->arg_type[0]) {
    case STRING_RESULT:
    case DECIMAL_RESULT:
      nbytes = args->lengths[0];
      break;
    case INT_RESULT:
      nbytes = sizeof(longlong);
      break;
    case REAL_RESULT:
      nbytes = sizeof(double);
      break;
    default:
      return;
  }

  sketch->add(value, nbytes);
}
|
|
|
|
/**
  Result callback of the HLLNDV UDF: bumps the hllndv usage counter and
  returns the cardinality estimate of the sketch stored in initid->ptr.
*/
longlong hllndv(UDF_INIT *initid MY_ATTRIBUTE((unused)),
                UDF_ARGS *args MY_ATTRIBUTE((unused)),
                char *is_null MY_ATTRIBUTE((unused)),
                char *error MY_ATTRIBUTE((unused))) {
  HyperLogLog<MurmurHash> *sketch =
      reinterpret_cast<HyperLogLog<MurmurHash> *>(initid->ptr);

  gs::udf::udf_counter.hllndv_counter++;

  return static_cast<longlong>(sketch->count());
}
|
|
|
|
/**
  Fill in the UDF definition for "hllndv": an aggregate function with an
  integer result, wired to the hllndv_* callbacks.
*/
void hllndv_udf(gs::udf::Udf_definition *def) {
  /* Identity of the function. */
  def->m_name = const_cast<char *>("hllndv");
  def->m_type = UDFTYPE_AGGREGATE;
  def->m_result = INT_RESULT;

  /* Lifecycle callbacks. */
  def->m_func_init = (Udf_func_init)hllndv_init;
  def->m_func_deinit = (Udf_func_deinit)hllndv_deinit;

  /* Aggregation callbacks. */
  def->m_func_clear = (Udf_func_clear)hllndv_clear;
  def->m_func_add = (Udf_func_add)hllndv_add;
  def->m_func = (Udf_func_any)hllndv;
}
|
|
|