/*
   Copyright (c) 2014, 2017, Oracle and/or its affiliates. All rights reserved.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License, version 2.0,
   as published by the Free Software Foundation.

   This program is also distributed with certain software (including
   but not limited to OpenSSL) that is licensed under separate terms,
   as designated in a particular file or component or in included license
   documentation.  The authors of MySQL hereby grant you an additional
   permission to link the program and your derivative works with the
   separately licensed software that they have included with MySQL.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License, version 2.0, for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301  USA
*/

#ifndef CHECKSUM_HPP
#define CHECKSUM_HPP

/*
  NOTE(review): Uint32, Uint64, size_t and assert() are used below but
  nothing is included here; presumably the including file is expected
  to provide them - confirm against the callers.
*/

/**
  Optimized XOR checksum calculation. Loop unrolling will reduce
  relative loop overhead and encourage usage of parallel
  arithmetic adders which are common on most modern CPUs.

  @param buf    Words to be checksummed.
  @param words  Number of Uint32 words available in 'buf', may be 0.
  @param sum    Start value which the words are XOR'ed into.

  @return 'sum' XOR'ed with each of buf[0..words-1].
*/
inline
Uint32
computeXorChecksumShort(const Uint32 *buf, Uint32 words, Uint32 sum = 0)
{
  const Uint32 *end_unroll = buf + (words & ~3);
  const Uint32 *end = buf + words;

  /**
   * Aggregate as chunks of 4*Uint32 words:
   * Take care if rewriting this part, code has intentionally
   * been unrolled in order to take advantage of HW parallelism
   * where there are multiple adders in the CPU core.
   */
  while (buf < end_unroll)
  {
    sum ^= buf[0] ^ buf[1] ^ buf[2] ^ buf[3];
    buf += 4;
  }

  // Wrap up remaining part
  while (buf < end)
  {
    sum ^= buf[0];
    buf++;
  }
  return sum;
}

/**
  Optimized XOR checksum calculation intended for longer strings.
  Temporary aggregate XOR-sums into Uint64 which are folded into
  Uint32 in the final stage. Also unroll loop as above to take
  advantage of HW parallelism.

  The unrolled Uint64 loop needs at least one complete chunk of
  4*Uint64 words in addition to the words consumed for alignment
  and for seeding the Uint64 sum. Inputs which are too short for
  that are passed on to computeXorChecksumShort() instead of being
  read past their end.

  Only a single Uint32 is consumed to reach a Uint64 boundary, so
  'buf' has to be at least Uint32 aligned for the Uint64 accesses
  to end up aligned.

  @param buf    Words to be checksummed.
  @param words  Number of Uint32 words available in 'buf'.
  @param sum    Start value which the words are XOR'ed into.

  @return 'sum' XOR'ed with each of buf[0..words-1].
*/
inline
Uint32
computeXorChecksumLong(const Uint32 *buf, Uint32 words, Uint32 sum = 0)
{
  /**
   * Worst case need: 1 word to align, 2 words to init sum64 and
   * 8 words (4*Uint64) for the first unrolled chunk.
   */
  if (words < 11)
    return computeXorChecksumShort(buf, words, sum);

  // Align to Uint64 boundary to optimize mem. access below
  if (((size_t)(buf) % 8) != 0)
  {
    sum ^= buf[0];
    buf++;
    words--;
  }

  const Uint64 *p = reinterpret_cast<const Uint64*>(buf);
  Uint64 sum64 = *p++;

  const Uint32 words64 = (words/2) - 1;  // Rem. after init of sum64
  const Uint64 *end = p + (words64 & ~3);

  /**
   * Aggregate as chunks of 4*Uint64 words:
   * Take care if rewriting this part: code has intentionally
   * been unrolled in order to take advantage of HW parallelism
   * where there are multiple adders in the CPU core.
   *
   * The length check above guarantees words64 >= 4, such that the
   * first pass through the loop body stays inside the buffer.
   */
  do
  {
    sum64 ^= p[0] ^ p[1] ^ p[2] ^ p[3];
    p += 4;
  } while (p < end);

  // Wrap up last part which didn't fit in a 4*Uint64 chunk
  end += (words64 % 4);
  while (p < end)
  {
    sum64 ^= p[0];
    p++;
  }

  // Fold temp Uint64 sum into a final Uint32 sum
  sum ^= (Uint32)(sum64 & 0xffffffff) ^
         (Uint32)(sum64 >> 32);

  // Append last odd Uint32 word
  if ((words%2) != 0)
    sum ^= buf[words-1];

  return sum;
}

/**
  XOR checksum of 'words' Uint32 words, selecting the short or the
  long implementation depending on the number of words.

  @param buf    Words to be checksummed.
  @param words  Number of Uint32 words available in 'buf'.
  @param sum    Start value which the words are XOR'ed into.

  @return 'sum' XOR'ed with each of buf[0..words-1].
*/
inline
Uint32
computeXorChecksum(const Uint32 *buf, Uint32 words, Uint32 sum = 0)
{
  if (words < 16)  // Decided by empirical experiments
    return computeXorChecksumShort(buf,words,sum);
  else
    return computeXorChecksumLong(buf,words,sum);
}

/**
  Rotate the bytes of a checksum word, as they are laid out in
  memory, such that byte 'byte_steps' of 'sum' becomes byte 0 of
  the result.

  @param sum         Checksum to rotate.
  @param byte_steps  Number of byte positions to rotate, 1..3.

  @return The rotated checksum.
*/
inline
Uint32
rotateChecksum(const Uint32 sum, Uint32 byte_steps)
{
  assert(byte_steps > 0);
  assert(byte_steps < 4);

  // Access both words through byte pointers, in memory order
  const unsigned char *psum =
    static_cast<const unsigned char*>(static_cast<const void*>(&sum));
  Uint32 rot;
  unsigned char *prot =
    static_cast<unsigned char*>(static_cast<void*>(&rot));

  for (Uint32 i = 0; i < 4; i++)
  {
    prot[i] = psum[(i + byte_steps) % 4];
  }
  return rot;
}

/**
 * Byte oriented XOR checksum: byte number k of the byte stream is
 * XOR'ed into byte (k % 4) of the checksum word, independent of how
 * 'buf' happens to be aligned in memory.
 *
 * @buf    series of bytes for which the checksum has to be computed
 * @bytes  size of buf in bytes, has to be > 0
 * @sum    checksum
 *
 * Returns the checksum rotated by (bytes % 4) byte positions, such
 * that it can be passed in as 'sum' when checksumming the buffer
 * following this one.
 */
inline
Uint32
computeXorChecksumBytes(const unsigned char* buf, size_t bytes, Uint32 sum = 0)
{
  assert(bytes > 0);

  // For undoing rotate
  const size_t rotate_back = (size_t)buf % sizeof(Uint32);

  /**
   * Number of bytes at the start of buf that are not word aligned.
   * Also the index to the original byte 0 in checksum word.
   */
  const size_t rotate = (sizeof(Uint32) - rotate_back) % sizeof(Uint32);
  const size_t words = (bytes > rotate) ? (bytes - rotate) / 4 : 0;

  // checksum buf[0..rotate-1] per byte
  if (rotate > 0)
  {
    unsigned char *psum =
      static_cast<unsigned char*>(static_cast<void*>(&sum));
    for (size_t i = 0; i < rotate && i < bytes; i++)
    {
      psum[i] ^= buf[i];
    }
  }

  // checksum buf[rotate..rotate+4*words-1] per word
  if (words > 0)
  {
    // Rotate sum to match alignment
    if (rotate > 0)
    {
      sum = rotateChecksum(sum, (Uint32)rotate);
    }

    /**
     * computeXorChecksum() takes a Uint32 word count. Feed it in
     * chunks such that a size_t word count is never truncated.
     * The chunk size is even in order to keep any Uint64 alignment
     * of the word pointer from one chunk to the next.
     */
    const size_t max_chunk = (size_t)1 << 31;
    const Uint32 *pword =
      static_cast<const Uint32*>(static_cast<const void*>(buf + rotate));
    size_t words_left = words;
    while (words_left > 0)
    {
      const Uint32 chunk =
        (Uint32)((words_left < max_chunk) ? words_left : max_chunk);
      sum = computeXorChecksum(pword, chunk, sum);
      pword += chunk;
      words_left -= chunk;
    }

    // Rotate back sum
    if (rotate > 0)
    {
      sum = rotateChecksum(sum, (Uint32)rotate_back);
    }
  }

  // checksum buf[rotate+4*words..bytes-1] per byte
  {
    unsigned char *psum =
      static_cast<unsigned char*>(static_cast<void*>(&sum));
    for (size_t i = rotate, j = rotate + 4 * words;
         j < bytes;
         j++, i = (i + 1) % 4)
    {
      psum[i] ^= buf[j];
    }
  }

  /**
   * Return checksum rotated such that it can be passed in as checksum for
   * next buffer. The 'next byte to XOR' can be memorised in the checksum
   * itself by rotating the checksum so that byte 0 is always next.
   */
  {
    const size_t rotate_forward = bytes % 4;
    if (rotate_forward > 0)
    {
      sum = rotateChecksum(sum, (Uint32)rotate_forward);
    }
  }
  return sum;
}

#endif // CHECKSUM_HPP