polardbxengine/storage/ndb/include/util/Checksum.hpp

221 lines
5.9 KiB
C++

/*
Copyright (c) 2014, 2017, Oracle and/or its affiliates. All rights reserved.
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License, version 2.0,
as published by the Free Software Foundation.
This program is also distributed with certain software (including
but not limited to OpenSSL) that is licensed under separate terms,
as designated in a particular file or component or in included license
documentation. The authors of MySQL hereby grant you an additional
permission to link the program and your derivative works with the
separately licensed software that they have included with MySQL.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License, version 2.0, for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*/
#ifndef CHECKSUM_HPP
#define CHECKSUM_HPP
/**
 * XOR checksum over a (short) run of Uint32 words.
 *
 * The main loop is deliberately unrolled four words at a time. That
 * reduces the relative loop overhead and encourages use of the parallel
 * arithmetic units found in most modern CPU cores.
 *
 * @param buf    first word to checksum
 * @param words  number of Uint32 words in buf
 * @param sum    checksum to continue from (0 starts a new one)
 * @return       sum XORed with every word in buf[0..words-1]
 */
inline
Uint32
computeXorChecksumShort(const Uint32 *buf, Uint32 words, Uint32 sum = 0)
{
  const Uint32 *const stop = buf + words;
  // End of the part that divides evenly into chunks of 4 words
  const Uint32 *const stop_chunks = stop - (words & 3);

  /**
   * Whole chunks of 4*Uint32 words.
   * Keep this unrolled when editing: the four XORs per iteration are
   * there on purpose, to exploit HW parallelism in the CPU core.
   */
  for (; buf != stop_chunks; buf += 4)
  {
    sum ^= buf[0] ^ buf[1] ^ buf[2] ^ buf[3];
  }

  // The 0..3 words which did not fill a chunk
  for (; buf != stop; ++buf)
  {
    sum ^= *buf;
  }
  return sum;
}
/**
 * Optimized XOR checksum calculation intended for longer runs of words.
 *
 * XOR-sums are aggregated in a temporary Uint64, which is folded into the
 * Uint32 result in the final stage. The main loop is unrolled in chunks
 * of 4*Uint64 in order to take advantage of HW parallelism.
 *
 * Any value of 'words' is accepted: when there are too few words to fill
 * a 4*Uint64 chunk (or even a single Uint64), the unrolled loop is simply
 * skipped and only the wrap-up steps run. The caller therefore does not
 * have to pre-check the length, and nothing outside buf[0..words-1] is
 * read.
 *
 * @param buf    first word to checksum; assumed to be Uint32 aligned, so
 *               that skipping at most one word reaches a Uint64 boundary
 * @param words  number of Uint32 words in buf
 * @param sum    checksum to continue from (0 starts a new one)
 * @return       sum XORed with every word in buf[0..words-1]
 */
inline
Uint32
computeXorChecksumLong(const Uint32 *buf, Uint32 words, Uint32 sum = 0)
{
  // Align to Uint64 boundary to optimize mem. access below.
  // Guarded by 'words' so that an empty buffer is never dereferenced.
  if (words > 0 && ((size_t)(buf) % 8) != 0)
  {
    sum ^= buf[0];
    buf++;
    words--;
  }

  const Uint64 *p = reinterpret_cast<const Uint64*>(buf);
  const Uint32 words64 = words / 2;       // Complete Uint64 words available
  const Uint64 *end = p + (words64 & ~3); // End of the whole 4*Uint64 chunks
  Uint64 sum64 = 0;

  /**
   * Aggregate as chunks of 4*Uint64 words:
   * Take care if rewriting this part: code has intentionally
   * been unrolled in order to take advantage of HW parallelism
   * where there are multiple adders in the CPU core.
   *
   * This is a pre-tested loop on purpose: a post-tested (do-while) loop
   * would consume a full chunk even when fewer than 4 Uint64 words exist.
   */
  while (p < end)
  {
    sum64 ^= p[0] ^ p[1] ^ p[2] ^ p[3];
    p += 4;
  }

  // Wrap up last part which didn't fit in a 4*Uint64 chunk
  end += (words64 % 4);
  while (p < end)
  {
    sum64 ^= p[0];
    p++;
  }

  // Fold temp Uint64 sum into a final Uint32 sum
  sum ^= (Uint32)(sum64 & 0xffffffff) ^
         (Uint32)(sum64 >> 32);

  // Append last odd Uint32 word
  if ((words % 2) != 0)
    sum ^= buf[words - 1];

  return sum;
}
/**
 * XOR checksum of buf[0..words-1], continuing from 'sum'.
 *
 * Picks the short or the long implementation depending on the number of
 * words to checksum.
 */
inline
Uint32
computeXorChecksum(const Uint32 *buf, Uint32 words, Uint32 sum = 0)
{
  // Cut-over point of 16 words was decided by empirical experiments
  const bool use_long_variant = (words >= 16);
  return use_long_variant ? computeXorChecksumLong(buf, words, sum)
                          : computeXorChecksumShort(buf, words, sum);
}
/**
 * Rotate the four bytes of a checksum, in memory order.
 *
 * Byte i of the result is byte (i + byte_steps) % 4 of 'sum'.
 *
 * @param sum         checksum to rotate
 * @param byte_steps  number of byte positions to rotate; asserted to be
 *                    in the range 1..3
 * @return            the rotated checksum
 */
inline
Uint32
rotateChecksum(const Uint32 sum, Uint32 byte_steps)
{
  assert(byte_steps > 0);
  assert(byte_steps < 4);

  // Access both words through their byte representation
  const unsigned char *src =
    static_cast<const unsigned char*>(static_cast<const void*>(&sum));
  Uint32 rotated;
  unsigned char *dst =
    static_cast<unsigned char*>(static_cast<void*>(&rotated));

  int from = byte_steps;
  for (int to = 0; to < 4; to++)
  {
    dst[to] = src[from];
    from = (from + 1) % 4;  // wrap around within the word
  }
  return rotated;
}
/**
 * XOR checksum over a series of bytes with arbitrary alignment and length.
 *
 * The buffer is processed in three parts: the bytes in front of the first
 * Uint32 boundary, the aligned whole words, and the bytes left after the
 * last whole word. Around the word-wise part the checksum is rotated so
 * that its bytes line up with the aligned words, and rotated back after.
 *
 * NOTE: the whole-word count is handed to computeXorChecksum() as a
 * Uint32; a count that does not fit in a Uint32 is narrowed by that
 * conversion.
 *
 * @param buf    series of bytes for which the checksum has to be computed
 * @param bytes  size of buf in bytes, asserted to be non-zero
 * @param sum    checksum to continue from, as returned for the preceding
 *               buffer (0 starts a new one)
 * @return       checksum rotated such that its byte 0 is where the next
 *               byte would be XORed, so that it can be passed in as 'sum'
 *               for the next buffer
 */
inline
Uint32
computeXorChecksumBytes(const unsigned char* buf, size_t bytes, Uint32 sum = 0)
{
  assert(bytes > 0);

  const size_t word_size = sizeof(Uint32);

  // Offset of buf inside its Uint32; also the step count undoing the rotate
  const size_t misalign = (size_t)buf % word_size;
  /**
   * Number of bytes at the start of buf that are not word aligned.
   * Also the index to the original byte 0 in checksum word.
   */
  const size_t head = (word_size - misalign) % word_size;
  const size_t nwords = (bytes > head) ? (bytes - head) / 4 : 0;

  // Byte-wise view of the running checksum, used by both per-byte parts
  unsigned char *sum_bytes =
    static_cast<unsigned char*>(static_cast<void*>(&sum));

  // Part 1: buf[0..head-1], per byte (nothing to do if buf is aligned)
  for (size_t k = 0; k < head && k < bytes; k++)
  {
    sum_bytes[k] ^= buf[k];
  }

  // Part 2: buf[head..head+4*nwords-1], per word
  if (nwords > 0)
  {
    const Uint32 *aligned =
      static_cast<const Uint32*>(static_cast<const void*>(buf + head));
    if (head == 0)
    {
      // Already aligned, no rotation needed
      sum = computeXorChecksum(aligned, nwords, sum);
    }
    else
    {
      // Rotate to match the word alignment, checksum, then rotate back
      sum = rotateChecksum(sum, head);
      sum = computeXorChecksum(aligned, nwords, sum);
      sum = rotateChecksum(sum, misalign);
    }
  }

  // Part 3: buf[head+4*nwords..bytes-1], per byte
  size_t pos = head;
  for (size_t j = head + 4 * nwords; j < bytes; j++)
  {
    sum_bytes[pos] ^= buf[j];
    pos = (pos + 1) % 4;
  }

  /**
   * The 'next byte to XOR' is memorised in the checksum itself, by
   * rotating it so that byte 0 is always next.
   */
  const size_t carry = bytes % 4;
  return (carry > 0) ? rotateChecksum(sum, carry) : sum;
}
#endif // CHECKSUM_HPP