/* Copyright (c) 2018, 2021, Alibaba and/or its affiliates. All rights reserved.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License, version 2.0,
   as published by the Free Software Foundation.

   This program is also distributed with certain software (including
   but not limited to OpenSSL) that is licensed under separate terms,
   as designated in a particular file or component or in included license
   documentation. The authors of MySQL/PolarDB-X Engine hereby grant you an
   additional permission to link the program and your derivative works with
   the separately licensed software that they have included with
   MySQL/PolarDB-X Engine.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
   GNU General Public License, version 2.0, for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */

#include "plugin/galaxy/udf/udf.h"

#include <math.h>
#include <stdint.h>
#include <string.h>

#include <algorithm>
#include <new>
#include <vector>

#define HLL_P 14 /* The greater is P, the smaller the error. */
#define HLL_Q (64 - HLL_P) /* The number of bits of the hash value used for */
                           /* determining the number of leading zeros. */
/*
  NOTE(review): the five definitions below follow the Redis reference
  implementation of HyperLogLog. HLL_REGISTERS == 16384 and HLL_BITS == 6 are
  required by the unrolled histogram loop further down; please confirm
  HLL_ALPHA_INF against the project's upstream copy of this file.
*/
#define HLL_REGISTERS (1 << HLL_P)     /* With P=14, 16384 registers. */
#define HLL_P_MASK (HLL_REGISTERS - 1) /* Mask to index register. */
#define HLL_BITS 6 /* Enough to count up to 63 leading zeroes. */
#define HLL_REGISTER_MAX ((1 << HLL_BITS) - 1)
#define HLL_ALPHA_INF 0.721347520444481703680 /* constant for 0.5/ln(2) */

/**
  HyperLogLog cardinality sketch with HLL_REGISTERS registers of HLL_BITS bits.

  @tparam HashT  default-constructible functor callable as
                 uint64_t(const uint8_t *data, size_t len).
*/
template <typename HashT>
class HyperLogLog {
  /**
    Densely packed array of HLL_BITS-wide registers.

    A register may straddle two bytes, so get()/set() always touch
    byte and byte+1. One spare byte is kept after the payload so that this
    stays in bounds for the last register. The storage is a fixed-size member
    array, so constructing a sketch performs no allocation of its own.
  */
  class Registers {
   public:
    Registers() { reset(); }

    /** Size of the backing storage in bytes, including the spare byte. */
    inline size_t size() const { return sizeof(m_regs); }

    /** Size of the register payload in bytes, excluding the spare byte. */
    inline size_t data_size() const { return sizeof(m_regs) - 1; }

    /** Set every register to zero. */
    inline void reset() { memset(m_regs, 0, sizeof(m_regs)); }

    /**
      Load the register payload from a serialized buffer.

      @param buf  source buffer
      @param len  number of bytes in buf; must equal data_size()
      @retval 0   success
      @retval -1  buf is NULL or len does not match data_size()
    */
    inline int init(const uint8_t *buf, size_t len) {
      if (buf == NULL || len != data_size()) return -1;
      memcpy(m_regs, buf, len);
      /* The spare byte is not part of the payload; keep it zero. */
      m_regs[data_size()] = 0;
      return 0;
    }

    /** Read the register at index pos. */
    inline uint8_t get(uint16_t pos) const {
      uint32_t byte = pos * HLL_BITS / 8;
      uint32_t bit = (pos * HLL_BITS) & 7;
      return ((m_regs[byte] >> bit) | (m_regs[byte + 1] << (8 - bit))) &
             HLL_REGISTER_MAX;
    }

    /** Store val (expected to fit in HLL_BITS bits) at index pos. */
    inline void set(uint16_t pos, uint8_t val) {
      uint32_t byte = pos * HLL_BITS / 8;
      uint32_t bit = (pos * HLL_BITS) & 7;
      /* Low part of the register lives in the upper bits of m_regs[byte]. */
      m_regs[byte] &= ~(HLL_REGISTER_MAX << bit);
      m_regs[byte] |= (val << bit);
      /* Remaining high part, if any, lives in the low bits of the next byte. */
      m_regs[byte + 1] &= ~(HLL_REGISTER_MAX >> (8 - bit));
      m_regs[byte + 1] |= (val >> (8 - bit));
    }

    /** Raise the register at pos to val if val is larger. */
    inline void ReplaceIfGreater(uint16_t pos, uint8_t val) {
      if (get(pos) < val) set(pos, val);
    }

    /** Register-wise maximum of this array and regs, stored into this. */
    void merge(const Registers &regs) {
      for (uint16_t j = 0; j < HLL_REGISTERS; j++) {
        uint8_t val = regs.get(j);
        if (val > get(j)) {
          set(j, val);
        }
      }
    }

    /**
      Count how many registers hold each possible value.

      @param[in,out] reghisto  array of at least 64 counters; reghisto[v] is
                               incremented once per register holding v.
    */
    void GetHistogram(int *reghisto) const {
      if (HLL_REGISTERS == 16384 && HLL_BITS == 6) {
        const uint8_t *r = m_regs;
        uint8_t r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13,
            r14, r15;
        for (int j = 0; j < 1024; j++) {
          /* Handle 16 registers (12 bytes) per iteration. */
          r0 = r[0] & 63;
          r1 = (r[0] >> 6 | r[1] << 2) & 63;
          r2 = (r[1] >> 4 | r[2] << 4) & 63;
          r3 = (r[2] >> 2) & 63;
          r4 = r[3] & 63;
          r5 = (r[3] >> 6 | r[4] << 2) & 63;
          r6 = (r[4] >> 4 | r[5] << 4) & 63;
          r7 = (r[5] >> 2) & 63;
          r8 = r[6] & 63;
          r9 = (r[6] >> 6 | r[7] << 2) & 63;
          r10 = (r[7] >> 4 | r[8] << 4) & 63;
          r11 = (r[8] >> 2) & 63;
          r12 = r[9] & 63;
          r13 = (r[9] >> 6 | r[10] << 2) & 63;
          r14 = (r[10] >> 4 | r[11] << 4) & 63;
          r15 = (r[11] >> 2) & 63;

          reghisto[r0]++;
          reghisto[r1]++;
          reghisto[r2]++;
          reghisto[r3]++;
          reghisto[r4]++;
          reghisto[r5]++;
          reghisto[r6]++;
          reghisto[r7]++;
          reghisto[r8]++;
          reghisto[r9]++;
          reghisto[r10]++;
          reghisto[r11]++;
          reghisto[r12]++;
          reghisto[r13]++;
          reghisto[r14]++;
          reghisto[r15]++;

          r += 12;
        }
      } else {
        for (uint16_t j = 0; j < HLL_REGISTERS; j++) {
          reghisto[get(j)]++;
        }
      }
    }

    /**
      Read-only access to the register payload.

      @param[out] len  if not NULL, receives data_size()
      @return pointer to the first payload byte
    */
    const uint8_t *c_ptr(size_t *len = NULL) const {
      if (len) *len = data_size();
      return m_regs;
    }

   private:
    /* Payload plus one spare byte, see the class comment. */
    uint8_t m_regs[(HLL_REGISTERS * HLL_BITS / 8) + 1];
  };

 public:
  /**
    Add one element to the sketch.

    @param p    element bytes (may be NULL when len is 0)
    @param len  number of bytes at p
  */
  void add(const uint8_t *p, size_t len) {
    uint64_t hash = m_hash(p, len);
    m_registers.ReplaceIfGreater(position(hash), ZeroRunLength(hash));
  }

  /**
    Estimate the number of distinct elements added so far.

    Uses the sigma/tau helper functions from
    "New cardinality estimation algorithms for HyperLogLog sketches",
    Otmar Ertl, arXiv:1702.01284.
  */
  size_t count() const {
    const double m = HLL_REGISTERS;
    /* Stack array: register values are HLL_BITS wide, so 64 buckets. */
    int reghisto[64] = {0};

    m_registers.GetHistogram(reghisto);

    double z = m * tau((m - reghisto[HLL_Q + 1]) / m);
    for (int j = HLL_Q; j >= 1; --j) {
      z += reghisto[j];
      z *= 0.5;
    }
    z += m * sigma(reghisto[0] / m);
    return (size_t)llroundl(HLL_ALPHA_INF * m * m / z);
  }

  /** Fold another sketch into this one (register-wise maximum). */
  void merge(const HyperLogLog &hll) { m_registers.merge(hll.m_registers); }

  /** Forget all elements added so far. */
  void inline reset() { m_registers.reset(); }

  /*
    The reg data size should be 1 byte less than its real size.
  */
  inline size_t GetRegSize() const { return m_registers.data_size(); }

  /**
    Replace the sketch contents with a serialized register payload.

    @param buf  serialized registers, as produced by DumpRegs()
    @param len  bytes available at buf; only the first GetRegSize() are used
    @retval 0   success
    @retval -1  buf is NULL or shorter than GetRegSize()
  */
  int InitRegs(const uint8_t *buf, size_t len) {
    if (buf == NULL || len < GetRegSize()) return -1;
    return m_registers.init(buf, GetRegSize());
  }

  /**
    Serialize the register payload.

    @param[out]    buf  destination buffer
    @param[in,out] len  in: capacity of buf; out: bytes written, which is
                        min(capacity, GetRegSize())
    @retval 0   success
    @retval -1  buf or len is NULL, or the capacity is 0
  */
  int DumpRegs(char *buf, size_t *len) const {
    if (!buf || !len || !*len) return -1;

    size_t reg_size = 0;
    const uint8_t *regs = m_registers.c_ptr(&reg_size);
    *len = std::min(reg_size, *len);
    memcpy(buf, regs, *len);
    return 0;
  }

 private:
  /* Helper function sigma as defined in
   * "New cardinality estimation algorithms for HyperLogLog sketches"
   * Otmar Ertl, arXiv:1702.01284 */
  static double sigma(double x) {
    if (x == 1.) return INFINITY;
    double zPrime;
    double y = 1;
    double z = x;
    /* Iterate until the partial sum stops changing in double precision. */
    do {
      x *= x;
      zPrime = z;
      z += x * y;
      y += y;
    } while (zPrime != z);
    return z;
  }

  /* Helper function tau as defined in
   * "New cardinality estimation algorithms for HyperLogLog sketches"
   * Otmar Ertl, arXiv:1702.01284 */
  static double tau(double x) {
    if (x == 0. || x == 1.) return 0.;
    double zPrime;
    double y = 1.0;
    double z = 1 - x;
    /* Iterate until the partial sum stops changing in double precision. */
    do {
      x = sqrt(x);
      zPrime = z;
      y *= 0.5;
      z -= pow(1 - x, 2) * y;
    } while (zPrime != z);
    return z / 3;
  }

  /** Register index: the low HLL_P bits of the hash. */
  static inline uint16_t position(uint64_t hash) {
    return (uint16_t)(hash & HLL_P_MASK);
  }

  /**
    One plus the number of trailing zero bits of the hash once the HLL_P
    index bits are shifted out. A sentinel bit at position HLL_Q bounds the
    loop, so the result is at most HLL_Q + 1.
  */
  static uint8_t ZeroRunLength(uint64_t hash) {
    uint8_t rl = 1;
    hash >>= HLL_P;
    hash |= ((uint64_t)1 << HLL_Q);
    while (!(hash & (uint64_t)1)) {
      rl++;
      hash >>= 1;
    }
    return rl;
  }

 private:
  HashT m_hash;
  Registers m_registers;
};

/** 64-bit MurmurHash2-style hash functor with a fixed default seed. */
class MurmurHash {
 public:
  MurmurHash(uint64_t seed = 0xadc83b19ULL) : m_seed(seed) {}
  ~MurmurHash() {}

  /**
    Hash len bytes starting at p.

    @param p    input bytes; may be NULL when len is 0
    @param len  number of input bytes
    @return 64-bit hash value
  */
  uint64_t operator()(const uint8_t *p, size_t len) const {
    const uint64_t m = 0xc6a4a7935bd1e995ULL;
    const int r = 47;
    uint64_t h = m_seed ^ (len * m);
    const uint8_t *data = p;
    const uint8_t *end = data + (len - (len & 7));

    /* Body: consume the input 8 bytes at a time. */
    while (data != end) {
      uint64_t k;

#ifdef WORDS_BIGENDIAN
      k = (uint64_t)data[0];
      k |= (uint64_t)data[1] << 8;
      k |= (uint64_t)data[2] << 16;
      k |= (uint64_t)data[3] << 24;
      k |= (uint64_t)data[4] << 32;
      k |= (uint64_t)data[5] << 40;
      k |= (uint64_t)data[6] << 48;
      k |= (uint64_t)data[7] << 56;
#else
      /* memcpy instead of a pointer cast: data need not be 8-byte aligned. */
      memcpy(&k, data, sizeof(k));
#endif

      k *= m;
      k ^= k >> r;
      k *= m;
      h ^= k;
      h *= m;
      data += 8;
    }

    /* Tail: mix in the remaining 0..7 bytes. */
    switch (len & 7) {
      case 7:
        h ^= (uint64_t)data[6] << 48; /* fall-thru */
      case 6:
        h ^= (uint64_t)data[5] << 40; /* fall-thru */
      case 5:
        h ^= (uint64_t)data[4] << 32; /* fall-thru */
      case 4:
        h ^= (uint64_t)data[3] << 24; /* fall-thru */
      case 3:
        h ^= (uint64_t)data[2] << 16; /* fall-thru */
      case 2:
        h ^= (uint64_t)data[1] << 8; /* fall-thru */
      case 1:
        h ^= (uint64_t)data[0];
        h *= m; /* fall-thru */
    };

    /* Finalization. */
    h ^= h >> r;
    h *= m;
    h ^= h >> r;
    return h;
  }

 private:
  uint64_t m_seed;
};

/*
bool hyperloglog_init(UDF_INIT *initid, UDF_ARGS *args, char *message);
void hyperloglog_deinit(UDF_INIT *initid);
void hyperloglog_clear(UDF_INIT *initid, char *is_null, char *error);
void hyperloglog_add(UDF_INIT *initid, UDF_ARGS *args, char *is_null,
                     char *error);
char *hyperloglog(UDF_INIT *initid, UDF_ARGS *args, char *result,
                  unsigned long *length, char *is_null, char *error);

bool hllndv_init(UDF_INIT *initid, UDF_ARGS *args, char *message);
void hllndv_deinit(UDF_INIT *initid);
void hllndv_clear(UDF_INIT *initid, char *is_null, char *error);
void hllndv_add(UDF_INIT *initid, UDF_ARGS *args, char *is_null, char *error);
longlong hllndv(UDF_INIT *initid, UDF_ARGS *args, char *is_null, char *error);
*/

/* Sketch type shared by both aggregate functions in this file. */
typedef HyperLogLog<MurmurHash> Hll_sketch;

/* Sketch stored in initid->ptr by hll_create(). */
static inline Hll_sketch *hll_from(UDF_INIT *initid) {
  return (Hll_sketch *)initid->ptr;
}

/* Allocate a sketch into initid->ptr and fill in the result metadata.
   Returns 0 on success, 1 (with message set) when out of memory. */
static bool hll_create(UDF_INIT *initid, char *message,
                       unsigned long max_length) {
  Hll_sketch *hll = new (std::nothrow) Hll_sketch();
  if (!hll) {
    strcpy(message, "Out of memory while allocating HyperLogLog sketch");
    return 1;
  }

  initid->const_item = 0;
  initid->maybe_null = 0;
  initid->max_length = max_length;
  initid->ptr = (char *)hll;
  return 0;
}

/* Free the sketch held in initid->ptr, if any, and clear the pointer. */
static void hll_destroy(UDF_INIT *initid) {
  if (initid->ptr) {
    Hll_sketch *hll = hll_from(initid);
    initid->ptr = NULL;
    delete hll;
  }
}

/* Feed the single argument of the current row into the sketch.
   A SQL NULL is hashed as zero-length input, i.e. like an empty string. */
static void hll_add_arg(UDF_INIT *initid, UDF_ARGS *args) {
  Hll_sketch *hll = hll_from(initid);

  if (!args->args[0]) {
    hll->add(NULL, 0);
    return;
  }

  switch (args->arg_type[0]) {
    case STRING_RESULT:
    case DECIMAL_RESULT:
      hll->add((const uint8_t *)args->args[0], args->lengths[0]);
      break;
    case INT_RESULT:
      hll->add((const uint8_t *)args->args[0], sizeof(longlong));
      break;
    case REAL_RESULT:
      hll->add((const uint8_t *)args->args[0], sizeof(double));
      break;
    default:
      break;
  }
}

/**
  Init callback of the HYPERLOGLOG aggregate: validates the argument list and
  allocates the sketch.

  @retval 0  success
  @retval 1  error, message describes the reason
*/
bool hyperloglog_init(UDF_INIT *initid, UDF_ARGS *args, char *message) {
  if (args->arg_count != 1) {
    strcpy(message, "HyperLogLog accepts only one argument");
    return 1;
  }

  switch (args->arg_type[0]) {
    case ROW_RESULT:
      strcpy(message, "HyperLogLog cannot accept row type argument");
      return 1;
    default:
      break;
  }

  /* The result is the raw register payload, hence a fixed length. */
  return hll_create(
      initid, message,
      (unsigned long)(HLL_REGISTERS * HLL_BITS / 8));
}

/** Deinit callback of the HYPERLOGLOG aggregate: releases the sketch. */
void hyperloglog_deinit(UDF_INIT *initid) { hll_destroy(initid); }

/** Clear callback of the HYPERLOGLOG aggregate: resets the sketch. */
void hyperloglog_clear(UDF_INIT *initid, char *is_null MY_ATTRIBUTE((unused)),
                       char *error MY_ATTRIBUTE((unused))) {
  hll_from(initid)->reset();
}

/** Add callback of the HYPERLOGLOG aggregate: adds the row's argument. */
void hyperloglog_add(UDF_INIT *initid, UDF_ARGS *args,
                     char *is_null MY_ATTRIBUTE((unused)),
                     char *error MY_ATTRIBUTE((unused))) {
  hll_add_arg(initid, args);
}

/**
  Result callback of the HYPERLOGLOG aggregate: copies the register payload
  into result.

  @param[out]    result  destination buffer
  @param[in,out] length  in: capacity of result; out: bytes written, or the
                         required size when the buffer is too small
  @param[out]    error   set to 1 on failure
  @return result on success, NULL on failure
*/
char *hyperloglog(UDF_INIT *initid, UDF_ARGS *args MY_ATTRIBUTE((unused)),
                  char *result, unsigned long *length,
                  char *is_null MY_ATTRIBUTE((unused)), char *error) {
  Hll_sketch *hll = hll_from(initid);

  if (*length < hll->GetRegSize()) {
    *error = 1;
    *length = hll->GetRegSize();
    return NULL;
  }

  gs::udf::udf_counter.hyperloglog_counter++;

  /* Go through a size_t local: unsigned long and size_t may differ. */
  size_t out_len = *length;
  if (hll->DumpRegs(result, &out_len) != 0) {
    *error = 1;
    return NULL;
  }
  *length = (unsigned long)out_len;
  return result;
}

/** Fill def with the registration data of the HYPERLOGLOG aggregate. */
void hyperloglog_udf(gs::udf::Udf_definition *def) {
  def->m_name = const_cast<char *>("hyperloglog");
  def->m_result = STRING_RESULT;
  def->m_type = UDFTYPE_AGGREGATE;
  def->m_func_init = (Udf_func_init)hyperloglog_init;
  def->m_func_deinit = (Udf_func_deinit)hyperloglog_deinit;
  def->m_func_add = (Udf_func_add)hyperloglog_add;
  def->m_func_clear = (Udf_func_clear)hyperloglog_clear;
  def->m_func = (Udf_func_any)hyperloglog;
}

/**
  Init callback of the HLLNDV aggregate: validates the argument list and
  allocates the sketch.

  @retval 0  success
  @retval 1  error, message describes the reason
*/
bool hllndv_init(UDF_INIT *initid, UDF_ARGS *args, char *message) {
  if (args->arg_count != 1) {
    strcpy(message, "HLLNDV accepts only one argument");
    return 1;
  }

  switch (args->arg_type[0]) {
    case ROW_RESULT:
      strcpy(message, "HLLNDV cannot accept row type argument");
      return 1;
    default:
      break;
  }

  /* Integer result: 21 is the default display width MySQL documents for
     integer loadable functions. */
  return hll_create(initid, message, 21);
}

/** Deinit callback of the HLLNDV aggregate: releases the sketch. */
void hllndv_deinit(UDF_INIT *initid) { hll_destroy(initid); }

/** Clear callback of the HLLNDV aggregate: resets the sketch. */
void hllndv_clear(UDF_INIT *initid, char *is_null MY_ATTRIBUTE((unused)),
                  char *error MY_ATTRIBUTE((unused))) {
  hll_from(initid)->reset();
}

/** Add callback of the HLLNDV aggregate: adds the row's argument. */
void hllndv_add(UDF_INIT *initid, UDF_ARGS *args,
                char *is_null MY_ATTRIBUTE((unused)),
                char *error MY_ATTRIBUTE((unused))) {
  hll_add_arg(initid, args);
}

/** Result callback of the HLLNDV aggregate: the estimated distinct count. */
longlong hllndv(UDF_INIT *initid, UDF_ARGS *args MY_ATTRIBUTE((unused)),
                char *is_null MY_ATTRIBUTE((unused)),
                char *error MY_ATTRIBUTE((unused))) {
  Hll_sketch *hll = hll_from(initid);
  gs::udf::udf_counter.hllndv_counter++;
  return (longlong)hll->count();
}

/** Fill def with the registration data of the HLLNDV aggregate. */
void hllndv_udf(gs::udf::Udf_definition *def) {
  def->m_name = const_cast<char *>("hllndv");
  def->m_result = INT_RESULT;
  def->m_type = UDFTYPE_AGGREGATE;
  def->m_func_init = (Udf_func_init)hllndv_init;
  def->m_func_deinit = (Udf_func_deinit)hllndv_deinit;
  def->m_func_add = (Udf_func_add)hllndv_add;
  def->m_func_clear = (Udf_func_clear)hllndv_clear;
  def->m_func = (Udf_func_any)hllndv;
}