/* Copyright (c) 2014, 2019, Oracle and/or its affiliates. All rights reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License, version 2.0, as published by the Free Software Foundation. This program is also distributed with certain software (including but not limited to OpenSSL) that is licensed under separate terms, as designated in a particular file or component or in included license documentation. The authors of MySQL hereby grant you an additional permission to link the program and your derivative works with the separately licensed software that they have included with MySQL. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License, version 2.0, for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ /* Bug#16403708 SUBOPTIMAL CODE IN MY_STRNXFRM_SIMPLE() Bug#68476 Suboptimal code in my_strnxfrm_simple() Below we test some alternative implementations for my_strnxfrm_simple. In order to do benchmarking, configure in optimized mode, and generate a separate executable for this file: cmake -DMERGE_UNITTESTS=0 You may want to tweak some constants below: - experiment with num_iterations run './strings_strnxfrm-t --disable-tap-output' to see timing reports for your platform. Benchmarking with gcc and clang indicates that: There is insignificant difference between my_strnxfrm_simple and strnxfrm_new when src != dst my_strnxfrm_simple() is significantly faster than strnxfrm_new when src == dst, especially for long strings. Loop unrolling gives significant speedup for large strings. 
*/ #include #include #include #include #include #include #include #include #include #include "my_inttypes.h" #include "my_sys.h" #include "template_utils.h" #include "unittest/gunit/benchmark.h" #include "unittest/gunit/strnxfrm.h" using std::make_pair; using std::max; using std::pair; using std::string; using std::to_string; using std::unordered_map; namespace strnxfrm_unittest { namespace { // Simply print out an array. void print_array(const uchar *arr, size_t len) { for (size_t i = 0; i < len; ++i) { fprintf(stderr, " %02x", arr[i]); if ((i % 8) == 7 || i == len - 1) fprintf(stderr, "\n"); } fprintf(stderr, "\n"); } // A function to compare two arrays and print them out in its entirety // (for easier context) if they are not equal. void expect_arrays_equal(const uchar *expected, const uchar *got, size_t len) { int num_err = 0; for (size_t i = 0; i < len && num_err < 5; ++i) { EXPECT_EQ(expected[i], got[i]); if (expected[i] != got[i]) ++num_err; } if (num_err) { fprintf(stderr, "Expected:\n"); for (size_t i = 0; i < len; ++i) { fprintf(stderr, " %c%02x", expected[i] != got[i] ? '*' : ' ', expected[i]); if ((i % 8) == 7 || i == len - 1) fprintf(stderr, "\n"); } fprintf(stderr, "\nGot:\n"); for (size_t i = 0; i < len; ++i) { fprintf(stderr, " %c%02x", expected[i] != got[i] ? '*' : ' ', got[i]); if ((i % 8) == 7 || i == len - 1) fprintf(stderr, "\n"); } fprintf(stderr, "\n"); } } CHARSET_INFO *init_collation(const char *name) { MY_CHARSET_LOADER loader; my_charset_loader_init_mysys(&loader); return my_collation_get_by_name(&loader, name, MYF(0)); } int compare_through_strxfrm(CHARSET_INFO *cs, const char *a, const char *b) { uchar abuf[256], bbuf[256]; int alen = my_strnxfrm(cs, abuf, sizeof(abuf), pointer_cast(a), strlen(a)); int blen = my_strnxfrm(cs, bbuf, sizeof(bbuf), pointer_cast(b), strlen(b)); if (false) // Enable this for debugging. 
{ fprintf(stderr, "\n\nstrxfrm for '%s':\n", a); print_array(abuf, alen); fprintf(stderr, "strxfrm for '%s':\n", b); print_array(bbuf, blen); } int cmp = memcmp(abuf, bbuf, std::min(alen, blen)); if (cmp != 0) return cmp; if (alen == blen) { return 0; } else { return (alen < blen) ? -1 : 1; } } } // namespace #if !defined(DBUG_OFF) // There is no point in benchmarking anything in debug mode. const size_t num_iterations = 1ULL; #else // Set this so that each test case takes a few seconds. // And set it back to a small value before pushing!! // const size_t num_iterations= 20000000ULL; const size_t num_iterations = 2ULL; #endif class StrnxfrmTest : public ::testing::TestWithParam { protected: virtual void SetUp() { m_length = GetParam(); m_src.assign(m_length, 0x20); m_dst.assign(m_length, 0x20); } std::vector m_src; std::vector m_dst; size_t m_length; }; size_t test_values[] = {1, 10, 100, 1000}; INSTANTIATE_TEST_CASE_P(Strnxfrm, StrnxfrmTest, ::testing::ValuesIn(test_values)); TEST_P(StrnxfrmTest, OriginalSrcDst) { CHARSET_INFO *cs = init_collation("latin1_swedish_ci"); for (size_t ix = 0; ix < num_iterations; ++ix) strnxfrm_orig(cs, &m_dst[0], m_length, m_length, &m_src[0], m_length, 192); } TEST_P(StrnxfrmTest, OriginalUnrolledSrcDst) { CHARSET_INFO *cs = init_collation("latin1_swedish_ci"); for (size_t ix = 0; ix < num_iterations; ++ix) strnxfrm_orig_unrolled(cs, &m_dst[0], m_length, m_length, &m_src[0], m_length, 192); } TEST_P(StrnxfrmTest, ModifiedSrcDst) { CHARSET_INFO *cs = init_collation("latin1_swedish_ci"); for (size_t ix = 0; ix < num_iterations; ++ix) strnxfrm_new(cs, &m_dst[0], m_length, m_length, &m_src[0], m_length, 192); } TEST_P(StrnxfrmTest, ModifiedUnrolledSrcDst) { CHARSET_INFO *cs = init_collation("latin1_swedish_ci"); for (size_t ix = 0; ix < num_iterations; ++ix) strnxfrm_new_unrolled(cs, &m_dst[0], m_length, m_length, &m_src[0], m_length, 192); } TEST_P(StrnxfrmTest, OriginalSrcSrc) { CHARSET_INFO *cs = init_collation("latin1_swedish_ci"); 
for (size_t ix = 0; ix < num_iterations; ++ix) strnxfrm_orig(cs, &m_src[0], m_length, m_length, &m_src[0], m_length, 192); } TEST_P(StrnxfrmTest, OriginalUnrolledSrcSrc) { CHARSET_INFO *cs = init_collation("latin1_swedish_ci"); for (size_t ix = 0; ix < num_iterations; ++ix) strnxfrm_orig_unrolled(cs, &m_src[0], m_length, m_length, &m_src[0], m_length, 192); } TEST_P(StrnxfrmTest, ModifiedSrcSrc) { CHARSET_INFO *cs = init_collation("latin1_swedish_ci"); for (size_t ix = 0; ix < num_iterations; ++ix) strnxfrm_new(cs, &m_src[0], m_length, m_length, &m_src[0], m_length, 192); } TEST_P(StrnxfrmTest, ModifiedUnrolledSrcSrc) { CHARSET_INFO *cs = init_collation("latin1_swedish_ci"); for (size_t ix = 0; ix < num_iterations; ++ix) strnxfrm_new_unrolled(cs, &m_src[0], m_length, m_length, &m_src[0], m_length, 192); } TEST(StrXfrmTest, SimpleUTF8Correctness) { CHARSET_INFO *cs = init_collation("utf8_bin"); const char *src = "abc æøå 日本語"; unsigned char buf[32]; static const unsigned char full_answer_with_pad[32] = { 0x00, 0x61, 0x00, 0x62, 0x00, 0x63, // abc 0x00, 0x20, // space 0x00, 0xe6, 0x00, 0xf8, 0x00, 0xe5, // æøå 0x00, 0x20, // space 0x65, 0xe5, 0x67, 0x2c, 0x8a, 0x9e, // 日本語 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20 // space for padding }; for (size_t maxlen = 0; maxlen < sizeof(buf); maxlen += 2) { memset(buf, 0xff, sizeof(buf)); my_strnxfrm(cs, buf, maxlen, pointer_cast(src), strlen(src)); expect_arrays_equal(full_answer_with_pad, buf, maxlen); } } TEST(StrXfrmTest, SimpleUTF8MB4Correctness) { CHARSET_INFO *cs = init_collation("utf8mb4_0900_ai_ci"); const char *src = "abc æøå 日本語"; unsigned char buf[30]; static const unsigned char full_answer_with_pad[30] = { 0x1c, 0x47, 0x1c, 0x60, 0x1c, 0x7a, // abc 0x02, 0x09, // space 0x1c, 0x47, 0x1c, 0xaa, 0x1d, 0xdd, 0x1c, 0x47, // æøå 0x02, 0x09, // space 0xfb, 0x40, 0xe5, 0xe5, 0xfb, 0x40, 0xe7, 0x2c, 0xfb, 0x41, 0x8a, 0x9e, // 日本語 }; for (size_t maxlen = 0; maxlen < sizeof(buf); maxlen += 2) { memset(buf, 
0xff, sizeof(buf)); my_strnxfrm(cs, buf, maxlen, pointer_cast(src), strlen(src)); expect_arrays_equal(full_answer_with_pad, buf, maxlen); } } TEST(StrXfrmTest, UTF8MB4Correctness_as_ci) { CHARSET_INFO *cs = init_collation("utf8mb4_0900_as_ci"); const char *src = "abc æøå 日本語"; unsigned char buf[62]; static const unsigned char full_answer_with_pad[62] = { 0x1c, 0x47, 0x1c, 0x60, 0x1c, 0x7a, // abc 0x02, 0x09, // space 0x1c, 0x47, 0x1c, 0xaa, 0x1d, 0xdd, 0x1c, 0x47, // æøå 0x02, 0x09, // space 0xfb, 0x40, 0xe5, 0xe5, 0xfb, 0x40, 0xe7, 0x2c, // 日本語 0xfb, 0x41, 0x8a, 0x9e, 0x00, 0x00, // level separator 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, // abc 0x00, 0x20, // space 0x00, 0x20, 0x01, 0x10, 0x00, 0x20, 0x00, 0x20, // æøå 0x00, 0x2F, 0x00, 0x20, 0x00, 0x29, 0x00, 0x20, // space 0x00, 0x20, 0x00, 0x20, 0x00, 0x20 // 日本語 }; for (size_t maxlen = 0; maxlen < sizeof(buf); maxlen += 2) { memset(buf, 0xff, sizeof(buf)); my_strnxfrm(cs, buf, maxlen, pointer_cast(src), strlen(src)); expect_arrays_equal(full_answer_with_pad, buf, maxlen); } } TEST(StrXfrmTest, UTF8MB4Correctness_as_ci_1) { CHARSET_INFO *cs = init_collation("utf8mb4_0900_as_ci"); // case insensitive EXPECT_EQ(compare_through_strxfrm(cs, "Abc", "aBC"), 0); // accent sensitive EXPECT_NE(compare_through_strxfrm(cs, "ǍḄÇ", "ÁḆĈ"), 0); EXPECT_NE(compare_through_strxfrm(cs, u8"\uA73A", u8"\uA738"), 0); // Hangul decomposition EXPECT_EQ(compare_through_strxfrm(cs, u8"\uAC00", u8"\u326E"), 0); } TEST(StrXfrmTest, JapaneseUTF8MB4) { CHARSET_INFO *cs = init_collation("utf8mb4_ja_0900_as_cs"); const char *src = "\x61\x41\xCA\xAC\xCA\xAD" // latin 'aAʬʭ' // Hiragana and Katakana 'ぁンはばぱ' "\xE3\x81\x81\xE3\x83\xB3\xE3\x81\xAF\xE3\x81\xB0\xE3\x81\xB1" // Japanese Han '亜熙憐' "\xE4\xBA\x9C\xE7\x86\x99\xE6\x86\x90" // Other Han '﨎㐀' "\xEF\xA8\x8E\xE3\x90\x80" // Greek, Coptic etc. 
'αⲁаⳤퟻ' "\xCE\xB1\xE2\xB2\x81\xD0\xB0\xE2\xB3\xA4\xED\x9F\xBB"; static const unsigned char full_answer_with_pad[156] = { // Level 1 0x1C, 0x47, 0x1C, 0x47, 0x1F, 0xB1, 0x1F, 0xB5, // latin 0x1F, 0xB6, 0x1F, 0xE7, 0x1F, 0xD0, 0x1F, 0xD0, // Hiragana and Katakana 0x1F, 0xD0, 0x54, 0xA4, 0x6D, 0x76, 0x60, 0x00, // Japanese Han 0xFB, 0x41, 0xFA, 0x0E, 0xFB, 0x80, 0xB4, 0x00, // Other Han 0xFB, 0x86, 0x1F, 0xB9, 0xFB, 0x86, 0x1F, 0xE6, // Greek, Coptic etc. 0xFB, 0x86, 0x20, 0x22, 0xFB, 0x86, 0x1F, 0xF1, 0xFB, 0x86, 0x1F, 0xE6, 0xFB, 0x86, 0x1F, 0xF0, 0xFB, 0x86, 0x3D, 0x59, 0x00, 0x00, // Level separator // Level 2 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, // latin 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, // Hiragana and Katakana 0x00, 0x37, 0x00, 0x20, 0x00, 0x38, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, // Japanese Han 0x00, 0x20, 0x00, 0x20, // Other Han 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, // Greek, Coptic etc. 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x00, // Level separator // Level 3 0x00, 0x02, 0x00, 0x08, 0x00, 0x02, 0x00, 0x02, // latin 0x00, 0x0D, 0x00, 0x0E, 0x00, 0x0E, 0x00, 0x0E, // Hiragana and Katakana 0x00, 0x02, 0x00, 0x0E, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, // Japanese Han 0x00, 0x02, 0x00, 0x02, // Other Han 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x04, // Greek, Coptic etc. 0x00, 0x04, 0x00, 0x04, 0x00, 0x02}; unsigned char buf[sizeof(full_answer_with_pad)]; for (size_t maxlen = 0; maxlen < sizeof(buf); maxlen += 2) { memset(buf, 0xff, sizeof(buf)); my_strnxfrm(cs, buf, maxlen, pointer_cast(src), strlen(src)); expect_arrays_equal(full_answer_with_pad, buf, maxlen); } } TEST(StrXfrmTest, Japanese_ks_UTF8MB4) { CHARSET_INFO *cs = init_collation("utf8mb4_ja_0900_as_cs_ks"); /* Weights of Japanese Han, Other Han, Greek, Coptic are not changed comparing to the test result of collation utf8mb4_ja_0900_as_cs (in test JapaneseUtf8mb4 above). But additional quaternary weight is added for Hiragana and Katakana characters. 
*/ const char *src = "\x61\x41\xCA\xAC\xCA\xAD" // latin 'aAʬʭ' // Hiragana and Katakana 'ぁンはばぱ' "\xE3\x81\x81\xE3\x83\xB3\xE3\x81\xAF\xE3\x81\xB0\xE3\x81\xB1" // Japanese Han '亜熙憐' "\xE4\xBA\x9C\xE7\x86\x99\xE6\x86\x90" // Other Han '﨎㐀' "\xEF\xA8\x8E\xE3\x90\x80" // Greek, Coptic etc. 'αⲁаⳤퟻ' "\xCE\xB1\xE2\xB2\x81\xD0\xB0\xE2\xB3\xA4\xED\x9F\xBB" // Prefix context 'さー' and 'サー' "\xE3\x81\x95\xE3\x83\xBC\xE3\x82\xB5\xE3\x83\xBC"; static const unsigned char full_answer_with_pad[] = { // Level 1 0x1C, 0x47, 0x1C, 0x47, 0x1F, 0xB1, 0x1F, 0xB5, // latin 0x1F, 0xB6, 0x1F, 0xE7, 0x1F, 0xD0, 0x1F, 0xD0, // Hiragana and Katakana 0x1F, 0xD0, 0x54, 0xA4, 0x6D, 0x76, 0x60, 0x00, // Japanese Han 0xFB, 0x41, 0xFA, 0x0E, 0xFB, 0x80, 0xB4, 0x00, // Other Han 0xFB, 0x86, 0x1F, 0xB9, 0xFB, 0x86, 0x1F, 0xE6, // Greek, Coptic etc. 0xFB, 0x86, 0x20, 0x22, 0xFB, 0x86, 0x1F, 0xF1, 0xFB, 0x86, 0x1F, 0xE6, 0xFB, 0x86, 0x1F, 0xF0, 0xFB, 0x86, 0x3D, 0x59, 0x1F, 0xC1, 0x1F, 0xB6, 0x1F, 0xC1, 0x1F, 0xB6, // Prefix context 0x00, 0x00, // Level separator // Level 2 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, // latin 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, // Hiragana and Katakana 0x00, 0x37, 0x00, 0x20, 0x00, 0x38, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, // Japanese Han 0x00, 0x20, 0x00, 0x20, // Other Han 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, // Greek, Coptic etc. 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, // Prefix context 0x00, 0x00, // Level separator // Level 3 0x00, 0x02, 0x00, 0x08, 0x00, 0x02, 0x00, 0x02, // latin 0x00, 0x0D, 0x00, 0x0E, 0x00, 0x0E, 0x00, 0x0E, // Hiragana and Katakana 0x00, 0x02, 0x00, 0x0E, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, // Japanese Han 0x00, 0x02, 0x00, 0x02, // Other Han 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x04, // Greek, Coptic etc. 
0x00, 0x04, 0x00, 0x04, 0x00, 0x02, 0x00, 0x0E, 0x00, 0x0C, 0x00, 0x21, 0x00, 0x0E, // Prefix context 0x00, 0x0C, 0x00, 0x21, 0x00, 0x00, // Level separator // Level 4 0x00, 0x02, 0x00, 0x08, 0x00, 0x02, 0x00, 0x02, // Hiragana and Katakana 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x08, 0x00, 0x08 // Prefix context }; unsigned char buf[sizeof(full_answer_with_pad)]; for (size_t maxlen = 0; maxlen < sizeof(buf); maxlen += 2) { memset(buf, 0xff, sizeof(buf)); my_strnxfrm(cs, buf, maxlen, pointer_cast(src), strlen(src)); expect_arrays_equal(full_answer_with_pad, buf, maxlen); } CHARSET_INFO *as_cs = init_collation("utf8mb4_ja_0900_as_cs"); CHARSET_INFO *as_cs_ks = init_collation("utf8mb4_ja_0900_as_cs_ks"); // utf8 "にほんご" const char *str1 = "\xE3\x81\xAB\xE3\x81\xBB\xE3\x82\x93\xE3\x81\x94"; // utf8 "ニホンゴ" const char *str2 = "\xE3\x83\x8B\xE3\x83\x9B\xE3\x83\xB3\xE3\x82\xB4"; EXPECT_EQ(compare_through_strxfrm(as_cs, str1, str2), 0); EXPECT_LT(compare_through_strxfrm(as_cs_ks, str1, str2), 0); str1 = "\xE3\x81\xAF\xE3\x81\xAF"; // utf8 "はは" str2 = "\xE3\x81\xAF\xE3\x83\x8F"; // utf8 "はハ" const char *str3 = "\xE3\x83\x8F\xE3\x81\xAF"; // utf8 "ハは" const char *str4 = "\xE3\x83\x8F\xE3\x83\x8F"; // utf8 "ハハ" EXPECT_EQ(compare_through_strxfrm(as_cs, str1, str2), 0); EXPECT_EQ(compare_through_strxfrm(as_cs, str2, str3), 0); EXPECT_EQ(compare_through_strxfrm(as_cs, str3, str4), 0); EXPECT_LT(compare_through_strxfrm(as_cs_ks, str1, str2), 0); EXPECT_LT(compare_through_strxfrm(as_cs_ks, str2, str3), 0); EXPECT_LT(compare_through_strxfrm(as_cs_ks, str3, str4), 0); } TEST(StrXfrmTest, JapaneseUTF8MB4_1) { CHARSET_INFO *cs = init_collation("utf8mb4_ja_0900_as_cs"); // Japanese HE followed with Handakuten mark const char *src1 = "\xE3\x81\xB8\xE3\x82\x99"; // Japanese HE followed with voiced length mark const char *src2 = "\xE3\x81\xB8\xE3\x82\x9E"; /* When the voiced length mark is after 'HE', it should sort before 'HE followed with Handakuten mark'on tertiary level. 
*/ static const unsigned char answer1[] = {0x1F, 0xD3, 0x00, 0x00, 0x00, 0x20, 0x00, 0x37, 0x00, 0x00, 0x00, 0x0E, 0x00, 0x02}; static const unsigned char answer2[] = { 0x1F, 0xD3, 0x1F, 0xD3, 0x00, 0x00, 0x00, 0x20, 0x00, 0x20, 0x00, 0x37, 0x00, 0x00, 0x00, 0x0E, 0x00, 0x0E, 0x00, 0x01, 0x00, 0x21}; unsigned char buf[32]; size_t buf_len = sizeof(answer1); memset(buf, 0xff, sizeof(buf)); my_strnxfrm(cs, buf, buf_len, pointer_cast(src1), strlen(src1)); expect_arrays_equal(answer1, buf, buf_len); buf_len = sizeof(answer2); memset(buf, 0xff, sizeof(buf)); my_strnxfrm(cs, buf, buf_len, pointer_cast(src2), strlen(src2)); expect_arrays_equal(answer2, buf, buf_len); } TEST(StrXfrmTest, UTF8MB4PadCorrectness) { CHARSET_INFO *cs = init_collation("utf8mb4_0900_as_cs"); const char *src = "abc "; unsigned char buf[46]; static const unsigned char full_answer[52] = { 0x1c, 0x47, 0x1c, 0x60, 0x1c, 0x7a, // abc 0x02, 0x09, 0x02, 0x09, 0x02, 0x09, 0x02, 0x09, // Four spaces. 0x00, 0x00, // Level separator. 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, // Accents for abc. 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, // Accents for four spaces. 0x00, 0x00, // Level separator. 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, // Case for abc. 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, // Case for four spaces. }; for (size_t maxlen = 0; maxlen < sizeof(buf); maxlen += 2) { SCOPED_TRACE("maxlen=" + to_string(maxlen) + "/" + to_string(sizeof(buf))); memset(buf, 0xff, sizeof(buf)); my_strnxfrm(cs, buf, maxlen, pointer_cast(src), strlen(src)); expect_arrays_equal(full_answer, buf, maxlen); } } TEST(StrXfrmTest, NullPointer) { CHARSET_INFO *cs = init_collation("utf8mb4_0900_ai_ci"); unsigned char buf[256]; memset(buf, 0x33, sizeof(buf)); cs->coll->strnxfrm(cs, buf, sizeof(buf), sizeof(buf), nullptr, 0, MY_STRXFRM_PAD_TO_MAXLEN); for (size_t i = 0; i < sizeof(buf); ++i) { EXPECT_EQ(0, buf[i]); } } // Benchmark based on reduced test case in Bug #83247 / #24788778. 
// // Note: This benchmark does not exercise any real multibyte characters; // it is mostly exercising padding. If we change the test string to contain // e.g. Japanese characters, performance goes down by ~20%. static void BM_SimpleUTF8(size_t num_iterations) { StopBenchmarkTiming(); CHARSET_INFO *cs = init_collation("utf8_bin"); static constexpr int key_cols = 12; static constexpr int set_key_cols = 6; // Only the first half is set. static constexpr int key_col_chars = 80; static constexpr int bytes_per_char = 3; static constexpr int key_bytes = key_col_chars * bytes_per_char; static constexpr int buffer_bytes = key_cols * key_bytes; unsigned char source[buffer_bytes]; unsigned char dest[buffer_bytes]; const char *content = "PolyFilla27773"; const int len = strlen(content); memset(source, 0, sizeof(source)); for (int k = 0, offset = 0; k < set_key_cols; ++k, offset += key_bytes) { memcpy(source + offset, content, len); } StartBenchmarkTiming(); for (size_t i = 0; i < num_iterations; ++i) { for (int k = 0, offset = 0; k < key_cols; ++k, offset += key_bytes) { if (k < set_key_cols) { my_strnxfrm(cs, dest + offset, key_bytes, source + offset, len); } else { my_strnxfrm(cs, dest + offset, key_bytes, source + offset, 0); } } } StopBenchmarkTiming(); } BENCHMARK(BM_SimpleUTF8) // Verifies using my_charpos to find the length of a string. // hp_hash.c does this extensively. Not really a strnxfrm benchmark, // but belongs to the same optimization effort. static void BM_UTF8MB4StringLength(size_t num_iterations) { StopBenchmarkTiming(); CHARSET_INFO *cs = init_collation("utf8mb4_0900_ai_ci"); // Some English text, then some Norwegian text, then some Japanese, // and then a few emoji (the last with skin tone modifiers). const char *content = "Premature optimization is the root of all evil. " "Våre norske tegn bør æres. 
日本語が少しわかります。 " "✌️🐶👩🏽"; const int len = strlen(content); int tot_len = 0; StartBenchmarkTiming(); for (size_t i = 0; i < num_iterations; ++i) { tot_len += my_charpos(cs, content, content + len, len / cs->mbmaxlen); } StopBenchmarkTiming(); EXPECT_NE(0, tot_len); SetBytesProcessed(num_iterations * strlen(content)); } BENCHMARK(BM_UTF8MB4StringLength) // Benchmark testing the default recommended collation for 8.0, without // stressing padding as much, but still testing only Latin letters. static void BM_SimpleUTF8MB4(size_t num_iterations) { StopBenchmarkTiming(); CHARSET_INFO *cs = init_collation("utf8mb4_0900_ai_ci"); const char *content = "This is a rather long string that contains only " "simple letters that are available in ASCII. This is a common special " "case that warrants a benchmark on its own, even if the character set " "and collation supports much more complicated scenarios."; const int len = strlen(content); // Just recorded from a trial run on the string above. static constexpr uchar expected[] = { 0x1e, 0x95, 0x1d, 0x18, 0x1d, 0x32, 0x1e, 0x71, 0x02, 0x09, 0x1d, 0x32, 0x1e, 0x71, 0x02, 0x09, 0x1c, 0x47, 0x02, 0x09, 0x1e, 0x33, 0x1c, 0x47, 0x1e, 0x95, 0x1d, 0x18, 0x1c, 0xaa, 0x1e, 0x33, 0x02, 0x09, 0x1d, 0x77, 0x1d, 0xdd, 0x1d, 0xb9, 0x1c, 0xf4, 0x02, 0x09, 0x1e, 0x71, 0x1e, 0x95, 0x1e, 0x33, 0x1d, 0x32, 0x1d, 0xb9, 0x1c, 0xf4, 0x02, 0x09, 0x1e, 0x95, 0x1d, 0x18, 0x1c, 0x47, 0x1e, 0x95, 0x02, 0x09, 0x1c, 0x7a, 0x1d, 0xdd, 0x1d, 0xb9, 0x1e, 0x95, 0x1c, 0x47, 0x1d, 0x32, 0x1d, 0xb9, 0x1e, 0x71, 0x02, 0x09, 0x1d, 0xdd, 0x1d, 0xb9, 0x1d, 0x77, 0x1f, 0x0b, 0x02, 0x09, 0x1e, 0x71, 0x1d, 0x32, 0x1d, 0xaa, 0x1e, 0x0c, 0x1d, 0x77, 0x1c, 0xaa, 0x02, 0x09, 0x1d, 0x77, 0x1c, 0xaa, 0x1e, 0x95, 0x1e, 0x95, 0x1c, 0xaa, 0x1e, 0x33, 0x1e, 0x71, 0x02, 0x09, 0x1e, 0x95, 0x1d, 0x18, 0x1c, 0x47, 0x1e, 0x95, 0x02, 0x09, 0x1c, 0x47, 0x1e, 0x33, 0x1c, 0xaa, 0x02, 0x09, 0x1c, 0x47, 0x1e, 0xe3, 0x1c, 0x47, 0x1d, 0x32, 0x1d, 0x77, 0x1c, 0x47, 0x1c, 0x60, 0x1d, 0x77, 0x1c, 0xaa, 
0x02, 0x09, 0x1d, 0x32, 0x1d, 0xb9, 0x02, 0x09, 0x1c, 0x47, 0x1e, 0x71, 0x1c, 0x7a, 0x1d, 0x32, 0x1d, 0x32, 0x02, 0x77, 0x02, 0x09, 0x1e, 0x95, 0x1d, 0x18, 0x1d, 0x32, 0x1e, 0x71, 0x02, 0x09, 0x1d, 0x32, 0x1e, 0x71, 0x02, 0x09, 0x1c, 0x47, 0x02, 0x09, 0x1c, 0x7a, 0x1d, 0xdd, 0x1d, 0xaa, 0x1d, 0xaa, 0x1d, 0xdd, 0x1d, 0xb9, 0x02, 0x09, 0x1e, 0x71, 0x1e, 0x0c, 0x1c, 0xaa, 0x1c, 0x7a, 0x1d, 0x32, 0x1c, 0x47, 0x1d, 0x77, 0x02, 0x09, 0x1c, 0x7a, 0x1c, 0x47, 0x1e, 0x71, 0x1c, 0xaa, 0x02, 0x09, 0x1e, 0x95, 0x1d, 0x18, 0x1c, 0x47, 0x1e, 0x95, 0x02, 0x09, 0x1e, 0xf5, 0x1c, 0x47, 0x1e, 0x33, 0x1e, 0x33, 0x1c, 0x47, 0x1d, 0xb9, 0x1e, 0x95, 0x1e, 0x71, 0x02, 0x09, 0x1c, 0x47, 0x02, 0x09, 0x1c, 0x60, 0x1c, 0xaa, 0x1d, 0xb9, 0x1c, 0x7a, 0x1d, 0x18, 0x1d, 0xaa, 0x1c, 0x47, 0x1e, 0x33, 0x1d, 0x65, 0x02, 0x09, 0x1d, 0xdd, 0x1d, 0xb9, 0x02, 0x09, 0x1d, 0x32, 0x1e, 0x95, 0x1e, 0x71, 0x02, 0x09, 0x1d, 0xdd, 0x1e, 0xf5, 0x1d, 0xb9, 0x02, 0x22, 0x02, 0x09, 0x1c, 0xaa, 0x1e, 0xe3, 0x1c, 0xaa, 0x1d, 0xb9, 0x02, 0x09, 0x1d, 0x32, 0x1c, 0xe5, 0x02, 0x09, 0x1e, 0x95, 0x1d, 0x18, 0x1c, 0xaa, 0x02, 0x09, 0x1c, 0x7a, 0x1d, 0x18, 0x1c, 0x47, 0x1e, 0x33, 0x1c, 0x47, 0x1c, 0x7a, 0x1e, 0x95, 0x1c, 0xaa, 0x1e, 0x33, 0x02, 0x09, 0x1e, 0x71, 0x1c, 0xaa, 0x1e, 0x95, 0x02, 0x09, 0x1c, 0x47, 0x1d, 0xb9, 0x1c, 0x8f, 0x02, 0x09, 0x1c, 0x7a, 0x1d, 0xdd, 0x1d, 0x77, 0x1d, 0x77, 0x1c, 0x47, 0x1e, 0x95, 0x1d, 0x32, 0x1d, 0xdd, 0x1d, 0xb9, 0x02, 0x09, 0x1e, 0x71, 0x1e, 0xb5, 0x1e, 0x0c, 0x1e, 0x0c, 0x1d, 0xdd, 0x1e, 0x33, 0x1e, 0x95, 0x1e, 0x71, 0x02, 0x09, 0x1d, 0xaa, 0x1e, 0xb5, 0x1c, 0x7a, 0x1d, 0x18, 0x02, 0x09, 0x1d, 0xaa, 0x1d, 0xdd, 0x1e, 0x33, 0x1c, 0xaa, 0x02, 0x09, 0x1c, 0x7a, 0x1d, 0xdd, 0x1d, 0xaa, 0x1e, 0x0c, 0x1d, 0x77, 0x1d, 0x32, 0x1c, 0x7a, 0x1c, 0x47, 0x1e, 0x95, 0x1c, 0xaa, 0x1c, 0x8f, 0x02, 0x09, 0x1e, 0x71, 0x1c, 0x7a, 0x1c, 0xaa, 0x1d, 0xb9, 0x1c, 0x47, 0x1e, 0x33, 0x1d, 0x32, 0x1d, 0xdd, 0x1e, 0x71, 0x02, 0x77}; uchar dest[sizeof(expected)]; StartBenchmarkTiming(); for (size_t i = 0; i < 
num_iterations; ++i) { my_strnxfrm(cs, dest, sizeof(dest), reinterpret_cast(content), len); } StopBenchmarkTiming(); expect_arrays_equal(expected, dest, sizeof(dest)); SetBytesProcessed(num_iterations * strlen(content)); } BENCHMARK(BM_SimpleUTF8MB4) // Benchmark testing a wider variety of character sets on a more complicated // collation (the recommended default collation for 8.0), without stressing // padding as much. static void BM_MixedUTF8MB4(size_t num_iterations) { StopBenchmarkTiming(); CHARSET_INFO *cs = init_collation("utf8mb4_0900_ai_ci"); // Some English text, then some Norwegian text, then some Japanese, // and then a few emoji (the last with skin tone modifiers). const char *content = "Premature optimization is the root of all evil. " "Våre norske tegn bør æres. 日本語が少しわかります。 " "✌️🐶👩🏽"; const int len = strlen(content); // Just recorded from a trial run on the string above. static constexpr uchar expected[] = { 0x1e, 0x0c, 0x1e, 0x33, 0x1c, 0xaa, 0x1d, 0xaa, 0x1c, 0x47, 0x1e, 0x95, 0x1e, 0xb5, 0x1e, 0x33, 0x1c, 0xaa, 0x02, 0x09, 0x1d, 0xdd, 0x1e, 0x0c, 0x1e, 0x95, 0x1d, 0x32, 0x1d, 0xaa, 0x1d, 0x32, 0x1f, 0x21, 0x1c, 0x47, 0x1e, 0x95, 0x1d, 0x32, 0x1d, 0xdd, 0x1d, 0xb9, 0x02, 0x09, 0x1d, 0x32, 0x1e, 0x71, 0x02, 0x09, 0x1e, 0x95, 0x1d, 0x18, 0x1c, 0xaa, 0x02, 0x09, 0x1e, 0x33, 0x1d, 0xdd, 0x1d, 0xdd, 0x1e, 0x95, 0x02, 0x09, 0x1d, 0xdd, 0x1c, 0xe5, 0x02, 0x09, 0x1c, 0x47, 0x1d, 0x77, 0x1d, 0x77, 0x02, 0x09, 0x1c, 0xaa, 0x1e, 0xe3, 0x1d, 0x32, 0x1d, 0x77, 0x02, 0x77, 0x02, 0x09, 0x1e, 0xe3, 0x1c, 0x47, 0x1e, 0x33, 0x1c, 0xaa, 0x02, 0x09, 0x1d, 0xb9, 0x1d, 0xdd, 0x1e, 0x33, 0x1e, 0x71, 0x1d, 0x65, 0x1c, 0xaa, 0x02, 0x09, 0x1e, 0x95, 0x1c, 0xaa, 0x1c, 0xf4, 0x1d, 0xb9, 0x02, 0x09, 0x1c, 0x60, 0x1d, 0xdd, 0x1e, 0x33, 0x02, 0x09, 0x1c, 0x47, 0x1c, 0xaa, 0x1e, 0x33, 0x1c, 0xaa, 0x1e, 0x71, 0x02, 0x77, 0x02, 0x09, 0xfb, 0x40, 0xe5, 0xe5, 0xfb, 0x40, 0xe7, 0x2c, 0xfb, 0x41, 0x8a, 0x9e, 0x3d, 0x60, 0xfb, 0x40, 0xdc, 0x11, 0x3d, 0x66, 0x3d, 0x87, 0x3d, 0x60, 0x3d, 
0x83, 0x3d, 0x79, 0x3d, 0x67, 0x02, 0x8a, 0x02, 0x09, 0x0a, 0x2d, 0x13, 0xdf, 0x14, 0x12, 0x13, 0xa6}; uchar dest[sizeof(expected)]; StartBenchmarkTiming(); for (size_t i = 0; i < num_iterations; ++i) { my_strnxfrm(cs, dest, sizeof(dest), reinterpret_cast(content), len); } StopBenchmarkTiming(); expect_arrays_equal(expected, dest, sizeof(dest)); SetBytesProcessed(num_iterations * strlen(content)); } BENCHMARK(BM_MixedUTF8MB4) static void BM_MixedUTF8MB4_AS_CI(size_t num_iterations) { StopBenchmarkTiming(); CHARSET_INFO *cs = init_collation("utf8mb4_0900_as_ci"); // Some English text, then some Norwegian text, then some Japanese, // and then a few emoji (the last with skin tone modifiers). const char *content = "Premature optimization is the root of all evil. " "Våre norske tegn bør æres. 日本語が少しわかります。 " "✌️🐶👩🏽"; const int len = strlen(content); // Just recorded from a trial run on the string above. static constexpr uchar expected[] = { 0x1e, 0x0c, 0x1e, 0x33, 0x1c, 0xaa, 0x1d, 0xaa, 0x1c, 0x47, 0x1e, 0x95, 0x1e, 0xb5, 0x1e, 0x33, 0x1c, 0xaa, 0x02, 0x09, 0x1d, 0xdd, 0x1e, 0x0c, 0x1e, 0x95, 0x1d, 0x32, 0x1d, 0xaa, 0x1d, 0x32, 0x1f, 0x21, 0x1c, 0x47, 0x1e, 0x95, 0x1d, 0x32, 0x1d, 0xdd, 0x1d, 0xb9, 0x02, 0x09, 0x1d, 0x32, 0x1e, 0x71, 0x02, 0x09, 0x1e, 0x95, 0x1d, 0x18, 0x1c, 0xaa, 0x02, 0x09, 0x1e, 0x33, 0x1d, 0xdd, 0x1d, 0xdd, 0x1e, 0x95, 0x02, 0x09, 0x1d, 0xdd, 0x1c, 0xe5, 0x02, 0x09, 0x1c, 0x47, 0x1d, 0x77, 0x1d, 0x77, 0x02, 0x09, 0x1c, 0xaa, 0x1e, 0xe3, 0x1d, 0x32, 0x1d, 0x77, 0x02, 0x77, 0x02, 0x09, 0x1e, 0xe3, 0x1c, 0x47, 0x1e, 0x33, 0x1c, 0xaa, 0x02, 0x09, 0x1d, 0xb9, 0x1d, 0xdd, 0x1e, 0x33, 0x1e, 0x71, 0x1d, 0x65, 0x1c, 0xaa, 0x02, 0x09, 0x1e, 0x95, 0x1c, 0xaa, 0x1c, 0xf4, 0x1d, 0xb9, 0x02, 0x09, 0x1c, 0x60, 0x1d, 0xdd, 0x1e, 0x33, 0x02, 0x09, 0x1c, 0x47, 0x1c, 0xaa, 0x1e, 0x33, 0x1c, 0xaa, 0x1e, 0x71, 0x02, 0x77, 0x02, 0x09, 0xfb, 0x40, 0xe5, 0xe5, 0xfb, 0x40, 0xe7, 0x2c, 0xfb, 0x41, 0x8a, 0x9e, 0x3d, 0x60, 0xfb, 0x40, 0xdc, 0x11, 0x3d, 0x66, 0x3d, 0x87, 0x3d, 
0x60, 0x3d, 0x83, 0x3d, 0x79, 0x3d, 0x67, 0x02, 0x8a, 0x02, 0x09, 0x0a, 0x2d, 0x13, 0xdf, 0x14, 0x12, 0x13, 0xa6, 0x00, 0x00, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x29, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x2F, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x01, 0x10, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x37, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20}; uchar dest[sizeof(expected)]; StartBenchmarkTiming(); for (size_t i = 0; i < num_iterations; ++i) { my_strnxfrm(cs, dest, sizeof(dest), reinterpret_cast(content), len); } StopBenchmarkTiming(); expect_arrays_equal(expected, dest, sizeof(dest)); SetBytesProcessed(num_iterations * strlen(content)); } BENCHMARK(BM_MixedUTF8MB4_AS_CI) // Case-sensitive, accent-sensitive benchmark, using the same string as // BM_SimpleUTF8MB4. This will naturally be slower, since many more weights // need to be generated. 
static void BM_MixedUTF8MB4_AS_CS(size_t num_iterations) { StopBenchmarkTiming(); CHARSET_INFO *cs = init_collation("utf8mb4_0900_as_cs"); // Some English text, then some Norwegian text, then some Japanese, // and then a few emoji (the last with skin tone modifiers). const char *content = "Premature optimization is the root of all evil. " "Våre norske tegn bør æres. 日本語が少しわかります。 " "✌️🐶👩🏽"; const int len = strlen(content); // Just recorded from a trial run on the string above. static constexpr uchar expected[] = { // Primary weights. 0x1e, 0x0c, 0x1e, 0x33, 0x1c, 0xaa, 0x1d, 0xaa, 0x1c, 0x47, 0x1e, 0x95, 0x1e, 0xb5, 0x1e, 0x33, 0x1c, 0xaa, 0x02, 0x09, 0x1d, 0xdd, 0x1e, 0x0c, 0x1e, 0x95, 0x1d, 0x32, 0x1d, 0xaa, 0x1d, 0x32, 0x1f, 0x21, 0x1c, 0x47, 0x1e, 0x95, 0x1d, 0x32, 0x1d, 0xdd, 0x1d, 0xb9, 0x02, 0x09, 0x1d, 0x32, 0x1e, 0x71, 0x02, 0x09, 0x1e, 0x95, 0x1d, 0x18, 0x1c, 0xaa, 0x02, 0x09, 0x1e, 0x33, 0x1d, 0xdd, 0x1d, 0xdd, 0x1e, 0x95, 0x02, 0x09, 0x1d, 0xdd, 0x1c, 0xe5, 0x02, 0x09, 0x1c, 0x47, 0x1d, 0x77, 0x1d, 0x77, 0x02, 0x09, 0x1c, 0xaa, 0x1e, 0xe3, 0x1d, 0x32, 0x1d, 0x77, 0x02, 0x77, 0x02, 0x09, 0x1e, 0xe3, 0x1c, 0x47, 0x1e, 0x33, 0x1c, 0xaa, 0x02, 0x09, 0x1d, 0xb9, 0x1d, 0xdd, 0x1e, 0x33, 0x1e, 0x71, 0x1d, 0x65, 0x1c, 0xaa, 0x02, 0x09, 0x1e, 0x95, 0x1c, 0xaa, 0x1c, 0xf4, 0x1d, 0xb9, 0x02, 0x09, 0x1c, 0x60, 0x1d, 0xdd, 0x1e, 0x33, 0x02, 0x09, 0x1c, 0x47, 0x1c, 0xaa, 0x1e, 0x33, 0x1c, 0xaa, 0x1e, 0x71, 0x02, 0x77, 0x02, 0x09, 0xfb, 0x40, 0xe5, 0xe5, 0xfb, 0x40, 0xe7, 0x2c, 0xfb, 0x41, 0x8a, 0x9e, 0x3d, 0x60, 0xfb, 0x40, 0xdc, 0x11, 0x3d, 0x66, 0x3d, 0x87, 0x3d, 0x60, 0x3d, 0x83, 0x3d, 0x79, 0x3d, 0x67, 0x02, 0x8a, 0x02, 0x09, 0x0a, 0x2d, 0x13, 0xdf, 0x14, 0x12, 0x13, 0xa6, // Level separator. 0x00, 0x00, // Secondary weights. 
0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x29, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x2f, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x01, 0x10, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x37, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, // Level separator. 0x00, 0x00, // Tertiary weights. 
0x00, 0x08, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x08, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x04, 0x00, 0x04, 0x00, 0x04, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x0e, 0x00, 0x02, 0x00, 0x02, 0x00, 0x0e, 0x00, 0x0e, 0x00, 0x0e, 0x00, 0x0e, 0x00, 0x0e, 0x00, 0x0e, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, }; uchar dest[sizeof(expected)]; size_t ret = 0; StartBenchmarkTiming(); for (size_t i = 0; i < num_iterations; ++i) { ret = my_strnxfrm(cs, dest, sizeof(dest), pointer_cast(content), len); } StopBenchmarkTiming(); EXPECT_EQ(sizeof(expected), ret); expect_arrays_equal(expected, dest, ret); SetBytesProcessed(num_iterations * strlen(content)); } BENCHMARK(BM_MixedUTF8MB4_AS_CS) // Specifically benchmark Japanese text. static void BM_JapaneseUTF8MB4(size_t num_iterations) { StopBenchmarkTiming(); CHARSET_INFO *cs = init_collation("utf8mb4_0900_ai_ci"); const char *content = "データの保存とアクセスを行うストレージエンジンがSQLパーサとは" "分離独立しており、用途に応じたストレージエンジンを選択できる" "「マルチストレージエンジン」方式を採用している。"; const int len = strlen(content); // Just recorded from a trial run on the string above. 
static constexpr uchar expected[] = { 0x3d, 0x6d, 0x1c, 0x0e, 0x3d, 0x6a, 0x3d, 0x73, 0xfb, 0x40, 0xcf, 0xdd, 0xfb, 0x40, 0xdb, 0x58, 0x3d, 0x6e, 0x3d, 0x5a, 0x3d, 0x62, 0x3d, 0x68, 0x3d, 0x67, 0x3d, 0x8a, 0xfb, 0x41, 0x88, 0x4c, 0x3d, 0x5c, 0x3d, 0x67, 0x3d, 0x6e, 0x3d, 0x85, 0x1c, 0x0e, 0x3d, 0x66, 0x3d, 0x5e, 0x3d, 0x8b, 0x3d, 0x66, 0x3d, 0x8b, 0x3d, 0x60, 0x1e, 0x71, 0x1e, 0x21, 0x1d, 0x77, 0x3d, 0x74, 0x1c, 0x0e, 0x3d, 0x65, 0x3d, 0x6e, 0x3d, 0x74, 0xfb, 0x40, 0xd2, 0x06, 0xfb, 0x41, 0x96, 0xe2, 0xfb, 0x40, 0xf2, 0xec, 0xfb, 0x40, 0xfa, 0xcb, 0x3d, 0x66, 0x3d, 0x6d, 0x3d, 0x5f, 0x3d, 0x83, 0x02, 0x31, 0xfb, 0x40, 0xf5, 0x28, 0xfb, 0x41, 0x90, 0x14, 0x3d, 0x70, 0xfb, 0x40, 0xdf, 0xdc, 0x3d, 0x66, 0x3d, 0x6a, 0x3d, 0x67, 0x3d, 0x6e, 0x3d, 0x85, 0x1c, 0x0e, 0x3d, 0x66, 0x3d, 0x5e, 0x3d, 0x8b, 0x3d, 0x66, 0x3d, 0x8b, 0x3d, 0x8a, 0xfb, 0x41, 0x90, 0x78, 0xfb, 0x40, 0xe2, 0x9e, 0x3d, 0x6d, 0x3d, 0x61, 0x3d, 0x84, 0x03, 0x73, 0x3d, 0x79, 0x3d, 0x84, 0x3d, 0x6b, 0x3d, 0x67, 0x3d, 0x6e, 0x3d, 0x85, 0x1c, 0x0e, 0x3d, 0x66, 0x3d, 0x5e, 0x3d, 0x8b, 0x3d, 0x66, 0x3d, 0x8b, 0x03, 0x74, 0xfb, 0x40, 0xe5, 0xb9, 0xfb, 0x40, 0xdf, 0x0f, 0x3d, 0x8a, 0xfb, 0x40, 0xe3, 0xa1, 0xfb, 0x40, 0xf5, 0x28, 0x3d, 0x66, 0x3d, 0x6d, 0x3d, 0x5b, 0x3d, 0x84, 0x02, 0x8a}; uchar dest[sizeof(expected)]; StartBenchmarkTiming(); for (size_t i = 0; i < num_iterations; ++i) { my_strnxfrm(cs, dest, sizeof(dest), reinterpret_cast(content), len); } StopBenchmarkTiming(); expect_arrays_equal(expected, dest, sizeof(dest)); SetBytesProcessed(num_iterations * strlen(content)); } BENCHMARK(BM_JapaneseUTF8MB4) /* A benchmark that illustrates the potential perils of not including the range [0x00,0x20) in our fast path; newlines throw us off the fast path and reduce speed. The newlines are spaced a bit randomly in order not to create a perfectly predictable pattern for the branch predictor (benchmark paranoia). 
*/ static void BM_NewlineFilledUTF8MB4(size_t num_iterations) { StopBenchmarkTiming(); CHARSET_INFO *cs = init_collation("utf8mb4_0900_ai_ci"); const char *content = "This is a\n prett\ny unrealist\nic case; a\nn " "Eng\nlish sente\nnce where\n we'\nve added a new\nline every te\nn " "bytes or\n so.\n"; const int len = strlen(content); // Just recorded from a trial run on the string above. static constexpr uchar expected[] = { 0x1e, 0x95, 0x1d, 0x18, 0x1d, 0x32, 0x1e, 0x71, 0x02, 0x09, 0x1d, 0x32, 0x1e, 0x71, 0x02, 0x09, 0x1c, 0x47, 0x02, 0x02, 0x02, 0x09, 0x1e, 0x0c, 0x1e, 0x33, 0x1c, 0xaa, 0x1e, 0x95, 0x1e, 0x95, 0x02, 0x02, 0x1f, 0x0b, 0x02, 0x09, 0x1e, 0xb5, 0x1d, 0xb9, 0x1e, 0x33, 0x1c, 0xaa, 0x1c, 0x47, 0x1d, 0x77, 0x1d, 0x32, 0x1e, 0x71, 0x1e, 0x95, 0x02, 0x02, 0x1d, 0x32, 0x1c, 0x7a, 0x02, 0x09, 0x1c, 0x7a, 0x1c, 0x47, 0x1e, 0x71, 0x1c, 0xaa, 0x02, 0x34, 0x02, 0x09, 0x1c, 0x47, 0x02, 0x02, 0x1d, 0xb9, 0x02, 0x09, 0x1c, 0xaa, 0x1d, 0xb9, 0x1c, 0xf4, 0x02, 0x02, 0x1d, 0x77, 0x1d, 0x32, 0x1e, 0x71, 0x1d, 0x18, 0x02, 0x09, 0x1e, 0x71, 0x1c, 0xaa, 0x1d, 0xb9, 0x1e, 0x95, 0x1c, 0xaa, 0x02, 0x02, 0x1d, 0xb9, 0x1c, 0x7a, 0x1c, 0xaa, 0x02, 0x09, 0x1e, 0xf5, 0x1d, 0x18, 0x1c, 0xaa, 0x1e, 0x33, 0x1c, 0xaa, 0x02, 0x02, 0x02, 0x09, 0x1e, 0xf5, 0x1c, 0xaa, 0x03, 0x05, 0x02, 0x02, 0x1e, 0xe3, 0x1c, 0xaa, 0x02, 0x09, 0x1c, 0x47, 0x1c, 0x8f, 0x1c, 0x8f, 0x1c, 0xaa, 0x1c, 0x8f, 0x02, 0x09, 0x1c, 0x47, 0x02, 0x09, 0x1d, 0xb9, 0x1c, 0xaa, 0x1e, 0xf5, 0x02, 0x02, 0x1d, 0x77, 0x1d, 0x32, 0x1d, 0xb9, 0x1c, 0xaa, 0x02, 0x09, 0x1c, 0xaa, 0x1e, 0xe3, 0x1c, 0xaa, 0x1e, 0x33, 0x1f, 0x0b, 0x02, 0x09, 0x1e, 0x95, 0x1c, 0xaa, 0x02, 0x02, 0x1d, 0xb9, 0x02, 0x09, 0x1c, 0x60, 0x1f, 0x0b, 0x1e, 0x95, 0x1c, 0xaa, 0x1e, 0x71, 0x02, 0x09, 0x1d, 0xdd, 0x1e, 0x33, 0x02, 0x02, 0x02, 0x09, 0x1e, 0x71, 0x1d, 0xdd, 0x02, 0x77, 0x02, 0x02}; uchar dest[sizeof(expected)]; StartBenchmarkTiming(); for (size_t i = 0; i < num_iterations; ++i) { my_strnxfrm(cs, dest, sizeof(dest), reinterpret_cast(content), 
len);
  }
  StopBenchmarkTiming();

  expect_arrays_equal(expected, dest, sizeof(dest));
  SetBytesProcessed(num_iterations * strlen(content));
}
BENCHMARK(BM_NewlineFilledUTF8MB4)

/*
  Benchmark the hash_sort callback of utf8mb4_0900_ai_ci on a long string
  that consists of ASCII letters, spaces and punctuation only.

  @param num_iterations  How many times the string is hashed inside the
                         timed region.
*/
static void BM_HashSimpleUTF8MB4(size_t num_iterations) {
  StopBenchmarkTiming();

  CHARSET_INFO *cs = init_collation("utf8mb4_0900_ai_ci");

  const char *content =
      "This is a rather long string that contains only "
      "simple letters that are available in ASCII. This is a common special "
      "case that warrants a benchmark on its own, even if the character set "
      "and collation supports much more complicated scenarios.";
  // strlen() yields a size_t; keep it as such instead of narrowing to int.
  const size_t len = strlen(content);

  // Hash state; hash_sort receives both values by pointer on every iteration.
  uint64 nr1 = 1, nr2 = 4;
  StartBenchmarkTiming();
  for (size_t i = 0; i < num_iterations; ++i) {
    cs->coll->hash_sort(cs, reinterpret_cast<const uchar *>(content), len,
                        &nr1, &nr2);
  }
  StopBenchmarkTiming();

  /*
    Just to keep the compiler from optimizing away everything; this is
    highly unlikely to ever happen given hash function that's not totally
    broken. Don't test for an exact value; it will vary by platform and
    number of iterations.
  */
  EXPECT_FALSE(nr1 == 0 && nr2 == 0);

  // Report throughput, as the my_strnxfrm benchmarks in this file do.
  SetBytesProcessed(num_iterations * len);
}
BENCHMARK(BM_HashSimpleUTF8MB4)

/*
  Test a non-trivial collation with contractions, to highlight the performance
  difference.
*/
static void BM_Hungarian_AS_CS(size_t num_iterations) {
  StopBenchmarkTiming();

  CHARSET_INFO *cs = init_collation("utf8mb4_hu_0900_as_cs");

  // Text snippet from Wikipedia.
  const char *content =
      "A MySQL adatbázisok adminisztrációjára a mellékelt "
      "parancssori eszközöket (mysql és mysqladmin) használhatjuk.";
  const int len = strlen(content);

  // Just recorded from a trial run on the string above.
static constexpr uchar expected[] = { 0x1c, 0x47, 0x02, 0x09, 0x1d, 0xaa, 0x1f, 0x0b, 0x1e, 0x71, 0x1e, 0x21, 0x1d, 0x77, 0x02, 0x09, 0x1c, 0x47, 0x1c, 0x8f, 0x1c, 0x47, 0x1e, 0x95, 0x1c, 0x60, 0x1c, 0x47, 0x1f, 0x21, 0x1d, 0x32, 0x1e, 0x71, 0x1d, 0xdd, 0x1d, 0x65, 0x02, 0x09, 0x1c, 0x47, 0x1c, 0x8f, 0x1d, 0xaa, 0x1d, 0x32, 0x1d, 0xb9, 0x1d, 0x32, 0x1e, 0x71, 0x54, 0xa5, 0x1e, 0x95, 0x1e, 0x33, 0x1c, 0x47, 0x1c, 0x7a, 0x1d, 0x32, 0x1d, 0xdd, 0x1d, 0x4c, 0x1c, 0x47, 0x1e, 0x33, 0x1c, 0x47, 0x02, 0x09, 0x1c, 0x47, 0x02, 0x09, 0x1d, 0xaa, 0x1c, 0xaa, 0x1d, 0x77, 0x1d, 0x77, 0x1c, 0xaa, 0x1d, 0x65, 0x1c, 0xaa, 0x1d, 0x77, 0x1e, 0x95, 0x02, 0x09, 0x1e, 0x0c, 0x1c, 0x47, 0x1e, 0x33, 0x1c, 0x47, 0x1d, 0xb9, 0x1c, 0x7a, 0x54, 0xa5, 0x1e, 0x71, 0x1d, 0xdd, 0x1e, 0x33, 0x1d, 0x32, 0x02, 0x09, 0x1c, 0xaa, 0x1e, 0x71, 0x54, 0xa5, 0x1d, 0x65, 0x1d, 0xdd, 0x54, 0xa5, 0x1f, 0x21, 0x1d, 0xdd, 0x54, 0xa5, 0x1d, 0x65, 0x1c, 0xaa, 0x1e, 0x95, 0x02, 0x09, 0x03, 0x17, 0x1d, 0xaa, 0x1f, 0x0b, 0x1e, 0x71, 0x1e, 0x21, 0x1d, 0x77, 0x02, 0x09, 0x1c, 0xaa, 0x1e, 0x71, 0x02, 0x09, 0x1d, 0xaa, 0x1f, 0x0b, 0x1e, 0x71, 0x1e, 0x21, 0x1d, 0x77, 0x1c, 0x47, 0x1c, 0x8f, 0x1d, 0xaa, 0x1d, 0x32, 0x1d, 0xb9, 0x03, 0x18, 0x02, 0x09, 0x1d, 0x18, 0x1c, 0x47, 0x1e, 0x71, 0x54, 0xa5, 0x1d, 0xb9, 0x1c, 0x47, 0x1d, 0x77, 0x1d, 0x18, 0x1c, 0x47, 0x1e, 0x95, 0x1d, 0x4c, 0x1e, 0xb5, 0x1d, 0x65, 0x02, 0x77, 0x00, 0x00, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x24, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x24, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x24, 0x00, 0x20, 0x00, 0x20, 0x00, 0x24, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x24, 0x00, 
0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x24, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x24, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x00, 0x00, 0x08, 0x00, 0x02, 0x00, 0x08, 0x00, 0x02, 0x00, 0x08, 0x00, 0x08, 0x00, 0x08, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x08, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x08, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x08, 0x00, 0x02, 0x00, 0x08, 0x00, 0x02, 0x00, 0x08, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x08, 0x00, 0x02, 
0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02}; uchar dest[sizeof(expected)] = {0}; size_t ret = 0; StartBenchmarkTiming(); for (size_t i = 0; i < num_iterations; ++i) { ret = my_strnxfrm(cs, dest, sizeof(dest), pointer_cast(content), len); } StopBenchmarkTiming(); EXPECT_EQ(sizeof(expected), ret); expect_arrays_equal(expected, dest, ret); SetBytesProcessed(num_iterations * strlen(content)); } BENCHMARK(BM_Hungarian_AS_CS) static void BM_Japanese_AS_CS(size_t num_iterations) { StopBenchmarkTiming(); CHARSET_INFO *cs = init_collation("utf8mb4_ja_0900_as_cs"); const char *content = "サーバー SQL モードの設定方法。この設定は、たとえば" "別のデータベースシステムからのコードとの互換性を保ったり、特定の状況に" "ついてのエラー処理を制御したりするために、SQL の構文およびセマンティクス" "の特定の側面を変更します。"; const int len = strlen(content); // Just recorded from a trial run on the string above. static constexpr uchar expected[] = { 0x1F, 0xC1, 0x1F, 0xB6, 0x1F, 0xD0, 0x1F, 0xB6, 0x02, 0x09, 0x1E, 0x71, 0x1E, 0x21, 0x1D, 0x77, 0x02, 0x09, 0x1F, 0xD9, 0x1F, 0xBB, 0x1F, 0xCA, 0x1F, 0xCF, 0x5A, 0xC2, 0x5C, 0x45, 0x5E, 0x8C, 0x5E, 0x8E, 0x02, 0x8A, 0x1F, 0xC0, 0x1F, 0xCF, 0x5A, 0xC2, 0x5C, 0x45, 0x1F, 0xD0, 0x02, 0x31, 0x1F, 0xC6, 0x1F, 0xCA, 0x1F, 0xBA, 0x1F, 0xD0, 0x5E, 0x5B, 0x1F, 0xCF, 0x1F, 0xC9, 0x1F, 0xBA, 0x1F, 0xC6, 0x1F, 0xD3, 0x1F, 0xBA, 0x1F, 0xC3, 0x1F, 0xC2, 0x1F, 0xC3, 0x1F, 0xC9, 0x1F, 0xD7, 0x1F, 0xBC, 0x1F, 0xDE, 0x1F, 0xCF, 0x1F, 0xC0, 0x1F, 0xBB, 0x1F, 0xCA, 0x1F, 0xCA, 0x1F, 0xCF, 0x57, 0xD2, 0x56, 0x34, 0x5A, 0x90, 0x1F, 0xE6, 0x5E, 0x6C, 0x1F, 0xC8, 0x1F, 0xC6, 0x1F, 0xDF, 0x02, 0x31, 0x5C, 0xDA, 0x5C, 0x45, 0x1F, 0xCF, 0x5A, 0x1C, 0x56, 0xEE, 0x1F, 0xCC, 0x1F, 0xC8, 0x1F, 0xB7, 0x1F, 0xC9, 0x1F, 0xCF, 0x1F, 0xBA, 0x1F, 0xDE, 0x1F, 0xB6, 0x59, 0xB1, 0x5F, 0xA6, 0x1F, 0xE6, 0x5A, 0x8C, 0x57, 0xD9, 0x1F, 0xC2, 0x1F, 0xC6, 0x1F, 0xDF, 0x1F, 0xC3, 0x1F, 0xE0, 0x1F, 0xC6, 0x1F, 0xD8, 0x1F, 0xCC, 0x02, 0x31, 0x1E, 0x71, 0x1E, 0x21, 0x1D, 0x77, 0x02, 0x09, 0x1F, 0xCF, 0x58, 0x0E, 0x5E, 
0x47, 0x1F, 0xBB, 0x1F, 0xDD, 0x1F, 0xD1, 0x1F, 0xC4, 0x1F, 0xD5, 0x1F, 0xE7, 0x1F, 0xC9, 0x1F, 0xB7, 0x1F, 0xBE, 0x1F, 0xC3, 0x1F, 0xCF, 0x5C, 0xDA, 0x5C, 0x45, 0x1F, 0xCF, 0x5B, 0x45, 0x5F, 0x17, 0x1F, 0xE6, 0x5E, 0x60, 0x58, 0x0A, 0x1F, 0xC2, 0x1F, 0xD5, 0x1F, 0xC3, 0x02, 0x8A, 0x00, 0x00, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x37, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x37, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x37, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x37, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x37, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x37, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x37, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x00, 0x00, 0x0E, 0x00, 0x0C, 0x00, 0x21, 0x00, 0x0E, 0x00, 0x02, 0x00, 0x0C, 0x00, 0x21, 0x00, 0x02, 0x00, 0x08, 0x00, 0x08, 0x00, 0x08, 0x00, 0x02, 0x00, 0x0E, 0x00, 0x0C, 0x00, 0x21, 0x00, 0x0E, 0x00, 0x02, 0x00, 0x0E, 0x00, 0x02, 0x00, 0x02, 
0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x0E, 0x00, 0x0E, 0x00, 0x02, 0x00, 0x02, 0x00, 0x0E, 0x00, 0x02, 0x00, 0x0E, 0x00, 0x0E, 0x00, 0x0E, 0x00, 0x0E, 0x00, 0x02, 0x00, 0x02, 0x00, 0x0E, 0x00, 0x0E, 0x00, 0x02, 0x00, 0x0C, 0x00, 0x21, 0x00, 0x0E, 0x00, 0x0E, 0x00, 0x02, 0x00, 0x0C, 0x00, 0x21, 0x00, 0x0E, 0x00, 0x0E, 0x00, 0x0E, 0x00, 0x0E, 0x00, 0x0E, 0x00, 0x0E, 0x00, 0x0E, 0x00, 0x0E, 0x00, 0x0E, 0x00, 0x0C, 0x00, 0x21, 0x00, 0x0E, 0x00, 0x02, 0x00, 0x0E, 0x00, 0x0E, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x0E, 0x00, 0x02, 0x00, 0x0D, 0x00, 0x0E, 0x00, 0x0E, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x0E, 0x00, 0x02, 0x00, 0x02, 0x00, 0x0E, 0x00, 0x0E, 0x00, 0x0E, 0x00, 0x0E, 0x00, 0x0E, 0x00, 0x0E, 0x00, 0x0E, 0x00, 0x0C, 0x00, 0x21, 0x00, 0x02, 0x00, 0x02, 0x00, 0x0E, 0x00, 0x02, 0x00, 0x02, 0x00, 0x0E, 0x00, 0x0E, 0x00, 0x0E, 0x00, 0x0E, 0x00, 0x0E, 0x00, 0x0E, 0x00, 0x0E, 0x00, 0x0E, 0x00, 0x02, 0x00, 0x08, 0x00, 0x08, 0x00, 0x08, 0x00, 0x02, 0x00, 0x0E, 0x00, 0x02, 0x00, 0x02, 0x00, 0x0E, 0x00, 0x0E, 0x00, 0x0E, 0x00, 0x02, 0x00, 0x0E, 0x00, 0x0E, 0x00, 0x0E, 0x00, 0x0E, 0x00, 0x0D, 0x00, 0x0E, 0x00, 0x0E, 0x00, 0x0E, 0x00, 0x02, 0x00, 0x02, 0x00, 0x0E, 0x00, 0x02, 0x00, 0x02, 0x00, 0x0E, 0x00, 0x02, 0x00, 0x02, 0x00, 0x0E, 0x00, 0x0E, 0x00, 0x0E, 0x00, 0x02}; uchar dest[sizeof(expected)]; StartBenchmarkTiming(); for (size_t i = 0; i < num_iterations; ++i) { my_strnxfrm(cs, dest, sizeof(dest), reinterpret_cast(content), len); } StopBenchmarkTiming(); expect_arrays_equal(expected, dest, sizeof(dest)); SetBytesProcessed(num_iterations * strlen(content)); } BENCHMARK(BM_Japanese_AS_CS) static void BM_Japanese_AS_CS_KS(size_t num_iterations) { StopBenchmarkTiming(); CHARSET_INFO *cs = init_collation("utf8mb4_ja_0900_as_cs_ks"); const char *content = "サーバー SQL モードの設定方法。この設定は、たとえば" "別のデータベースシステムからのコードとの互換性を保ったり、特定の状況に" "ついてのエラー処理を制御したりするために、SQL の構文およびセマンティクス" "の特定の側面を変更します。"; const int len = strlen(content); // Just recorded from a trial run on the string 
above. static constexpr uchar expected[] = { 0x1F, 0xC1, 0x1F, 0xB6, 0x1F, 0xD0, 0x1F, 0xB6, 0x02, 0x09, 0x1E, 0x71, 0x1E, 0x21, 0x1D, 0x77, 0x02, 0x09, 0x1F, 0xD9, 0x1F, 0xBB, 0x1F, 0xCA, 0x1F, 0xCF, 0x5A, 0xC2, 0x5C, 0x45, 0x5E, 0x8C, 0x5E, 0x8E, 0x02, 0x8A, 0x1F, 0xC0, 0x1F, 0xCF, 0x5A, 0xC2, 0x5C, 0x45, 0x1F, 0xD0, 0x02, 0x31, 0x1F, 0xC6, 0x1F, 0xCA, 0x1F, 0xBA, 0x1F, 0xD0, 0x5E, 0x5B, 0x1F, 0xCF, 0x1F, 0xC9, 0x1F, 0xBA, 0x1F, 0xC6, 0x1F, 0xD3, 0x1F, 0xBA, 0x1F, 0xC3, 0x1F, 0xC2, 0x1F, 0xC3, 0x1F, 0xC9, 0x1F, 0xD7, 0x1F, 0xBC, 0x1F, 0xDE, 0x1F, 0xCF, 0x1F, 0xC0, 0x1F, 0xBB, 0x1F, 0xCA, 0x1F, 0xCA, 0x1F, 0xCF, 0x57, 0xD2, 0x56, 0x34, 0x5A, 0x90, 0x1F, 0xE6, 0x5E, 0x6C, 0x1F, 0xC8, 0x1F, 0xC6, 0x1F, 0xDF, 0x02, 0x31, 0x5C, 0xDA, 0x5C, 0x45, 0x1F, 0xCF, 0x5A, 0x1C, 0x56, 0xEE, 0x1F, 0xCC, 0x1F, 0xC8, 0x1F, 0xB7, 0x1F, 0xC9, 0x1F, 0xCF, 0x1F, 0xBA, 0x1F, 0xDE, 0x1F, 0xB6, 0x59, 0xB1, 0x5F, 0xA6, 0x1F, 0xE6, 0x5A, 0x8C, 0x57, 0xD9, 0x1F, 0xC2, 0x1F, 0xC6, 0x1F, 0xDF, 0x1F, 0xC3, 0x1F, 0xE0, 0x1F, 0xC6, 0x1F, 0xD8, 0x1F, 0xCC, 0x02, 0x31, 0x1E, 0x71, 0x1E, 0x21, 0x1D, 0x77, 0x02, 0x09, 0x1F, 0xCF, 0x58, 0x0E, 0x5E, 0x47, 0x1F, 0xBB, 0x1F, 0xDD, 0x1F, 0xD1, 0x1F, 0xC4, 0x1F, 0xD5, 0x1F, 0xE7, 0x1F, 0xC9, 0x1F, 0xB7, 0x1F, 0xBE, 0x1F, 0xC3, 0x1F, 0xCF, 0x5C, 0xDA, 0x5C, 0x45, 0x1F, 0xCF, 0x5B, 0x45, 0x5F, 0x17, 0x1F, 0xE6, 0x5E, 0x60, 0x58, 0x0A, 0x1F, 0xC2, 0x1F, 0xD5, 0x1F, 0xC3, 0x02, 0x8A, 0x00, 0x00, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x37, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x37, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x37, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x37, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x37, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 
0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x37, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x37, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x00, 0x00, 0x0E, 0x00, 0x0C, 0x00, 0x21, 0x00, 0x0E, 0x00, 0x02, 0x00, 0x0C, 0x00, 0x21, 0x00, 0x02, 0x00, 0x08, 0x00, 0x08, 0x00, 0x08, 0x00, 0x02, 0x00, 0x0E, 0x00, 0x0C, 0x00, 0x21, 0x00, 0x0E, 0x00, 0x02, 0x00, 0x0E, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x0E, 0x00, 0x0E, 0x00, 0x02, 0x00, 0x02, 0x00, 0x0E, 0x00, 0x02, 0x00, 0x0E, 0x00, 0x0E, 0x00, 0x0E, 0x00, 0x0E, 0x00, 0x02, 0x00, 0x02, 0x00, 0x0E, 0x00, 0x0E, 0x00, 0x02, 0x00, 0x0C, 0x00, 0x21, 0x00, 0x0E, 0x00, 0x0E, 0x00, 0x02, 0x00, 0x0C, 0x00, 0x21, 0x00, 0x0E, 0x00, 0x0E, 0x00, 0x0E, 0x00, 0x0E, 0x00, 0x0E, 0x00, 0x0E, 0x00, 0x0E, 0x00, 0x0E, 0x00, 0x0E, 0x00, 0x0C, 0x00, 0x21, 0x00, 0x0E, 0x00, 0x02, 0x00, 0x0E, 0x00, 0x0E, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x0E, 0x00, 0x02, 0x00, 0x0D, 0x00, 0x0E, 0x00, 0x0E, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x0E, 0x00, 0x02, 0x00, 0x02, 0x00, 0x0E, 0x00, 0x0E, 0x00, 0x0E, 0x00, 0x0E, 0x00, 0x0E, 0x00, 0x0E, 0x00, 0x0E, 0x00, 0x0C, 0x00, 0x21, 0x00, 0x02, 0x00, 0x02, 0x00, 0x0E, 0x00, 0x02, 0x00, 0x02, 0x00, 0x0E, 0x00, 0x0E, 0x00, 0x0E, 0x00, 0x0E, 
0x00, 0x0E, 0x00, 0x0E, 0x00, 0x0E, 0x00, 0x0E, 0x00, 0x02, 0x00, 0x08, 0x00, 0x08, 0x00, 0x08, 0x00, 0x02, 0x00, 0x0E, 0x00, 0x02, 0x00, 0x02, 0x00, 0x0E, 0x00, 0x0E, 0x00, 0x0E, 0x00, 0x02, 0x00, 0x0E, 0x00, 0x0E, 0x00, 0x0E, 0x00, 0x0E, 0x00, 0x0D, 0x00, 0x0E, 0x00, 0x0E, 0x00, 0x0E, 0x00, 0x02, 0x00, 0x02, 0x00, 0x0E, 0x00, 0x02, 0x00, 0x02, 0x00, 0x0E, 0x00, 0x02, 0x00, 0x02, 0x00, 0x0E, 0x00, 0x0E, 0x00, 0x0E, 0x00, 0x02, 0x00, 0x00, 0x00, 0x08, 0x00, 0x08, 0x00, 0x08, 0x00, 0x08, 0x00, 0x08, 0x00, 0x08, 0x00, 0x08, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x08, 0x00, 0x08, 0x00, 0x08, 0x00, 0x08, 0x00, 0x08, 0x00, 0x08, 0x00, 0x08, 0x00, 0x08, 0x00, 0x08, 0x00, 0x08, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x08, 0x00, 0x08, 0x00, 0x08, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x08, 0x00, 0x08, 0x00, 0x08, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x08, 0x00, 0x08, 0x00, 0x08, 0x00, 0x08, 0x00, 0x08, 0x00, 0x08, 0x00, 0x08, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02}; uchar dest[sizeof(expected)]; StartBenchmarkTiming(); for (size_t i = 0; i < num_iterations; ++i) { my_strnxfrm(cs, dest, sizeof(dest), reinterpret_cast(content), len); } StopBenchmarkTiming(); expect_arrays_equal(expected, dest, sizeof(dest)); SetBytesProcessed(num_iterations * strlen(content)); } BENCHMARK(BM_Japanese_AS_CS_KS) TEST(StrXfrmTest, ChineseUTF8MB4) { CHARSET_INFO *cs = init_collation("utf8mb4_zh_0900_as_cs"); const char *src = "\xE9\x98\xBF\xE5\x92\x97" // The first and last Han character in zh.xml "\xF0\xAC\xBA\xA1" // The last Han character "\xC4\x81\x61\x62\xC5\xAB\x75\x55\xC7\x96\x5A" // Some latin characters // are used as Bopomofo. 
"\xF0\x94\x99\x86" // The last character that has explicit weight // in the DUCET. /* Non-Han characters that have implicit weight. */ "\xF0\x97\x86\xA0\xF0\xAC\xBA\xA2\xF0\xAE\xAF\xA0\xF0\xB3\x8C\xB3"; static const unsigned char full_answer_with_pad[116] = { // level 1 0x1C, 0x47, 0xBD, 0xBE, // The first and last Han character in zh.xml 0xBD, 0xC3, 0xCE, 0xA1, // The last Han character /* Latin characters. Some are used as Bopomofo. */ 0xBD, 0xC4, 0xBD, 0xC4, 0xBD, 0xDD, 0xC0, 0x32, 0xC0, 0x32, 0xC0, 0x32, 0xC0, 0x32, 0xC0, 0x9E, 0xF6, 0x20, // The last character that has explicit weight in the DUCET. /* Non-Han characters that have implicit weight. */ 0xF6, 0x21, 0x81, 0xA0, 0xF6, 0x27, 0xCE, 0xA2, 0xF6, 0x27, 0xEB, 0xE0, 0xF6, 0x28, 0xB3, 0x33, // level separator. 0x00, 0x00, // level 2 0x00, 0x20, 0x00, 0x20, // The first and last Han character in zh.xml 0x00, 0x20, // The last Han character /* Latin characters. Some are used as Bopomofo. */ 0x00, 0x1F, 0x01, 0x16, 0x00, 0x20, 0x00, 0x20, 0x00, 0x1F, 0x01, 0x16, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x01, 0x16, 0x00, 0x20, 0x00, 0x20, // The last character that has explicit weight in the DUCET. /* Non-Han characters that have implicit weight. */ 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, // level separator. 0x00, 0x00, // level 3 0x00, 0x02, 0x00, 0x02, // The first and last Han character in zh.xml 0x00, 0x02, // The last Han character /* Latin characters. Some are used as Bopomofo. */ 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x08, 0x00, 0x08, 0x00, 0x08, 0x00, 0x02, // The last character that has explicit weight in the DUCET. /* Non-Han characters that have implicit weight. 
*/ 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02}; unsigned char buf[sizeof(full_answer_with_pad)]; for (size_t maxlen = 0; maxlen < sizeof(buf); maxlen += 2) { memset(buf, 0xff, sizeof(buf)); my_strnxfrm(cs, buf, maxlen, pointer_cast(src), strlen(src)); expect_arrays_equal(full_answer_with_pad, buf, maxlen); } } static void BM_Chinese_AS_CS(size_t num_iterations) { StopBenchmarkTiming(); CHARSET_INFO *cs = init_collation("utf8mb4_zh_0900_as_cs"); const char *content = "春江潮水连海平,海上明月共潮生。" "滟滟随波千万里,何处春江无月明!" "江流宛转绕芳甸,月照花林皆似霰;" "空里流霜不觉飞,汀上白沙看不见。" "江天一色无纤尘,皎皎空中孤月轮。" "江畔何人初见月?江月何年初照人?" "人生代代无穷已,江月年年只相似。" "不知江月待何人,但见长江送流水。" "白云一片去悠悠,青枫浦上不胜愁。" "谁家今夜扁舟子?何处相思明月楼?"; const int len = strlen(content); // Just recorded from a trial run on the string above. static constexpr uchar expected[] = { 0x2C, 0xD0, 0x4F, 0xF1, 0x28, 0x08, 0x87, 0xE8, 0x60, 0x4C, 0x42, 0xEF, 0x75, 0x93, 0x02, 0x22, 0x42, 0xEF, 0x83, 0x8A, 0x6C, 0x4F, 0xAF, 0x96, 0x3F, 0x58, 0x28, 0x08, 0x84, 0xCF, 0x02, 0x8A, 0xA3, 0xA4, 0xA3, 0xA4, 0x8A, 0x5F, 0x23, 0x71, 0x78, 0xA8, 0x93, 0x1A, 0x5E, 0xD9, 0x02, 0x22, 0x44, 0xAC, 0x2B, 0xD5, 0x2C, 0xD0, 0x4F, 0xF1, 0x96, 0x31, 0xAF, 0x96, 0x6C, 0x4F, 0x02, 0x60, 0x4F, 0xF1, 0x63, 0x7B, 0x92, 0xDD, 0xBA, 0x2E, 0x7F, 0x07, 0x39, 0x15, 0x32, 0xB2, 0x02, 0x22, 0xAF, 0x96, 0xB4, 0x41, 0x47, 0xD7, 0x62, 0x27, 0x51, 0x4C, 0x85, 0xE9, 0x81, 0x86, 0x02, 0x34, 0x59, 0x09, 0x5E, 0xD9, 0x63, 0x7B, 0x87, 0xBA, 0x24, 0x78, 0x56, 0x5A, 0x39, 0x48, 0x02, 0x22, 0x8F, 0x74, 0x83, 0x8A, 0x1E, 0x4D, 0x82, 0x46, 0x57, 0xD9, 0x24, 0x78, 0x4F, 0x79, 0x02, 0x8A, 0x4F, 0xF1, 0x8E, 0x8A, 0xA6, 0x3E, 0x81, 0xEE, 0x96, 0x31, 0x99, 0x9E, 0x28, 0x97, 0x02, 0x22, 0x50, 0xC2, 0x50, 0xC2, 0x59, 0x09, 0xB8, 0x20, 0x3F, 0xCC, 0xAF, 0x96, 0x66, 0xC9, 0x02, 0x8A, 0x4F, 0xF1, 0x72, 0xB6, 0x44, 0xAC, 0x7F, 0x11, 0x2B, 0x7B, 0x4F, 0x79, 0xAF, 0x96, 0x02, 0x66, 0x4F, 0xF1, 0xAF, 0x96, 0x44, 0xAC, 0x6F, 0xD5, 0x2B, 0x7B, 0xB4, 0x41, 0x7F, 0x11, 0x02, 0x66, 0x7F, 0x11, 0x84, 0xCF, 0x2F, 0xE2, 0x2F, 0xE2, 0x96, 
0x31, 0x7B, 0xE1, 0xA7, 0x41, 0x02, 0x22, 0x4F, 0xF1, 0xAF, 0x96, 0x6F, 0xD5, 0x6F, 0xD5, 0xB6, 0xC3, 0x9B, 0x15, 0x85, 0xE9, 0x02, 0x8A, 0x24, 0x78, 0xB6, 0x2E, 0x4F, 0xF1, 0xAF, 0x96, 0x2F, 0xF4, 0x44, 0xAC, 0x7F, 0x11, 0x02, 0x22, 0x30, 0x86, 0x4F, 0x79, 0xB3, 0xDD, 0x4F, 0xF1, 0x89, 0x2A, 0x63, 0x7B, 0x87, 0xE8, 0x02, 0x8A, 0x1E, 0x4D, 0xB0, 0x1B, 0xA6, 0x3E, 0x75, 0x00, 0x7D, 0x93, 0xAB, 0xAF, 0xAB, 0xAF, 0x02, 0x22, 0x7B, 0x7D, 0x3A, 0x63, 0x76, 0xA2, 0x83, 0x8A, 0x24, 0x78, 0x85, 0x16, 0x2B, 0x2D, 0x02, 0x8A, 0x84, 0x30, 0x4D, 0xF3, 0x52, 0x63, 0xA5, 0xC7, 0x21, 0xE0, 0xB8, 0x87, 0xBC, 0x16, 0x02, 0x66, 0x44, 0xAC, 0x2B, 0xD5, 0x9B, 0x15, 0x88, 0x52, 0x6C, 0x4F, 0xAF, 0x96, 0x64, 0xA1, 0x02, 0x66, 0x00, 0x00, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 
0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x00, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x03, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x03, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x03, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x03, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x03, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x03, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x03, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x03, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x03, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x03, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 
0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x03, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x03, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x03, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x03}; uchar dest[sizeof(expected)]; StartBenchmarkTiming(); for (size_t i = 0; i < num_iterations; ++i) { my_strnxfrm(cs, dest, sizeof(dest), reinterpret_cast(content), len); } StopBenchmarkTiming(); expect_arrays_equal(expected, dest, sizeof(dest)); SetBytesProcessed(num_iterations * strlen(content)); } BENCHMARK(BM_Chinese_AS_CS) static void BM_UTF8MB4_bin(size_t num_iterations) { StopBenchmarkTiming(); CHARSET_INFO *cs = init_collation("utf8mb4_bin"); const char *content = "Premature optimization is the root of all evil. " "Våre norske tegn bør æres. 日本語が少しわかります。 " "✌️🐶👩🏽"; const int len = strlen(content); // Just recorded from a trial run on the string above. 
static constexpr uchar expected[] = { 0x00, 0x00, 0x50, 0x00, 0x00, 0x72, 0x00, 0x00, 0x65, 0x00, 0x00, 0x6D, 0x00, 0x00, 0x61, 0x00, 0x00, 0x74, 0x00, 0x00, 0x75, 0x00, 0x00, 0x72, 0x00, 0x00, 0x65, 0x00, 0x00, 0x20, 0x00, 0x00, 0x6F, 0x00, 0x00, 0x70, 0x00, 0x00, 0x74, 0x00, 0x00, 0x69, 0x00, 0x00, 0x6D, 0x00, 0x00, 0x69, 0x00, 0x00, 0x7A, 0x00, 0x00, 0x61, 0x00, 0x00, 0x74, 0x00, 0x00, 0x69, 0x00, 0x00, 0x6F, 0x00, 0x00, 0x6E, 0x00, 0x00, 0x20, 0x00, 0x00, 0x69, 0x00, 0x00, 0x73, 0x00, 0x00, 0x20, 0x00, 0x00, 0x74, 0x00, 0x00, 0x68, 0x00, 0x00, 0x65, 0x00, 0x00, 0x20, 0x00, 0x00, 0x72, 0x00, 0x00, 0x6F, 0x00, 0x00, 0x6F, 0x00, 0x00, 0x74, 0x00, 0x00, 0x20, 0x00, 0x00, 0x6F, 0x00, 0x00, 0x66, 0x00, 0x00, 0x20, 0x00, 0x00, 0x61, 0x00, 0x00, 0x6C, 0x00, 0x00, 0x6C, 0x00, 0x00, 0x20, 0x00, 0x00, 0x65, 0x00, 0x00, 0x76, 0x00, 0x00, 0x69, 0x00, 0x00, 0x6C, 0x00, 0x00, 0x2E, 0x00, 0x00, 0x20, 0x00, 0x00, 0x56, 0x00, 0x00, 0xE5, 0x00, 0x00, 0x72, 0x00, 0x00, 0x65, 0x00, 0x00, 0x20, 0x00, 0x00, 0x6E, 0x00, 0x00, 0x6F, 0x00, 0x00, 0x72, 0x00, 0x00, 0x73, 0x00, 0x00, 0x6B, 0x00, 0x00, 0x65, 0x00, 0x00, 0x20, 0x00, 0x00, 0x74, 0x00, 0x00, 0x65, 0x00, 0x00, 0x67, 0x00, 0x00, 0x6E, 0x00, 0x00, 0x20, 0x00, 0x00, 0x62, 0x00, 0x00, 0xF8, 0x00, 0x00, 0x72, 0x00, 0x00, 0x20, 0x00, 0x00, 0xE6, 0x00, 0x00, 0x72, 0x00, 0x00, 0x65, 0x00, 0x00, 0x73, 0x00, 0x00, 0x2E, 0x00, 0x00, 0x20, 0x00, 0x65, 0xE5, 0x00, 0x67, 0x2C, 0x00, 0x8A, 0x9E, 0x00, 0x30, 0x4C, 0x00, 0x5C, 0x11, 0x00, 0x30, 0x57, 0x00, 0x30, 0x8F, 0x00, 0x30, 0x4B, 0x00, 0x30, 0x8A, 0x00, 0x30, 0x7E, 0x00, 0x30, 0x59, 0x00, 0x30, 0x02, 0x00, 0x00, 0x20, 0x00, 0x27, 0x0C, 0x00, 0xFE, 0x0F, 0x01, 0xF4, 0x36, 0x01, 0xF4, 0x69, 0x01, 0xF3, 0xFD}; uchar dest[sizeof(expected)]; StartBenchmarkTiming(); for (size_t i = 0; i < num_iterations; ++i) { my_strnxfrm(cs, dest, sizeof(dest), reinterpret_cast(content), len); } StopBenchmarkTiming(); expect_arrays_equal(expected, dest, sizeof(dest)); SetBytesProcessed(num_iterations * 
strlen(content));
}
BENCHMARK(BM_UTF8MB4_bin)

/*
  Benchmarks my_strnxfrm() with the utf8mb4_0900_bin collation on a mixed
  ASCII / Latin / CJK / emoji string. Collation setup and result verification
  happen while benchmark timing is stopped.
*/
static void BM_UTF8MB4_0900_bin(size_t num_iterations) {
  StopBenchmarkTiming();

  CHARSET_INFO *cs = init_collation("utf8mb4_0900_bin");

  const char *content =
      "Premature optimization is the root of all evil. "
      "Våre norske tegn bør æres. 日本語が少しわかります。 "
      "✌️🐶👩🏽";
  // size_t, not int: every consumer below takes a size_t length.
  const size_t len = strlen(content);
  uchar *dest = new uchar[len];

  StartBenchmarkTiming();
  for (size_t i = 0; i < num_iterations; ++i) {
    my_strnxfrm(cs, dest, len, reinterpret_cast<const uchar *>(content), len);
  }
  StopBenchmarkTiming();

  /*
    utf8mb4_0900_bin produces a weight string with the same bytes and the
    same length as the source string, so the input itself serves as the
    expected output.
  */
  expect_arrays_equal(pointer_cast<const uchar *>(content), dest, len);
  delete[] dest;

  SetBytesProcessed(num_iterations * len);
}
BENCHMARK(BM_UTF8MB4_0900_bin)

/*
  The classic MySQL latin1 collation, for reference.

  Calls the collation's strnxfrm() directly with MY_STRXFRM_PAD_TO_MAXLEN and
  checks both the returned length and the produced bytes against a recording.
*/
static void BM_Latin1_CI(size_t num_iterations) {
  StopBenchmarkTiming();

  CHARSET_INFO *cs = init_collation("latin1_swedish_ci");

  const char *content =
      "Alla människor är födda fria och lika i värde "
      "och rättigheter. De är utrustade med förnuft och samvete och bör "
      "handla gentemot varandra i en anda av broderskap.";
  const size_t len = strlen(content);

  /*
    Just recorded from a trial run on the string above. The entire last row
    is padding.
  */
  static constexpr uchar expected[] = {
      0x41, 0x4c, 0x4c, 0x41, 0x20, 0x4d, 0x41, 0xa4,
      0x4e, 0x4e, 0x49, 0x53, 0x4b, 0x4f, 0x52, 0x20,
      0x41, 0xa4, 0x52, 0x20, 0x46, 0x41, 0xb6, 0x44,
      0x44, 0x41, 0x20, 0x46, 0x52, 0x49, 0x41, 0x20,
      0x4f, 0x43, 0x48, 0x20, 0x4c, 0x49, 0x4b, 0x41,
      0x20, 0x49, 0x20, 0x56, 0x41, 0xa4, 0x52, 0x44,
      0x45, 0x20, 0x4f, 0x43, 0x48, 0x20, 0x52, 0x41,
      0xa4, 0x54, 0x54, 0x49, 0x47, 0x48, 0x45, 0x54,
      0x45, 0x52, 0x2e, 0x20, 0x44, 0x45, 0x20, 0x41,
      0xa4, 0x52, 0x20, 0x55, 0x54, 0x52, 0x55, 0x53,
      0x54, 0x41, 0x44, 0x45, 0x20, 0x4d, 0x45, 0x44,
      0x20, 0x46, 0x41, 0xb6, 0x52, 0x4e, 0x55, 0x46,
      0x54, 0x20, 0x4f, 0x43, 0x48, 0x20, 0x53, 0x41,
      0x4d, 0x56, 0x45, 0x54, 0x45, 0x20, 0x4f, 0x43,
      0x48, 0x20, 0x42, 0x41, 0xb6, 0x52, 0x20, 0x48,
      0x41, 0x4e, 0x44, 0x4c, 0x41, 0x20, 0x47, 0x45,
      0x4e, 0x54, 0x45, 0x4d, 0x4f, 0x54, 0x20, 0x56,
      0x41, 0x52, 0x41, 0x4e, 0x44, 0x52, 0x41, 0x20,
      0x49, 0x20, 0x45, 0x4e, 0x20, 0x41, 0x4e, 0x44,
      0x41, 0x20, 0x41, 0x56, 0x20, 0x42, 0x52, 0x4f,
      0x44, 0x45, 0x52, 0x53, 0x4b, 0x41, 0x50, 0x2e,
      0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
  };
  uchar dest[sizeof(expected)];

  size_t ret = 0;
  StartBenchmarkTiming();
  for (size_t i = 0; i < num_iterations; ++i) {
    // sizeof(dest) is passed both as the buffer size and as the weight count.
    ret = cs->coll->strnxfrm(cs, dest, sizeof(dest), sizeof(dest),
                             pointer_cast<const uchar *>(content), len,
                             MY_STRXFRM_PAD_TO_MAXLEN);
  }
  StopBenchmarkTiming();

  EXPECT_EQ(sizeof(expected), ret);
  expect_arrays_equal(expected, dest, ret);

  SetBytesProcessed(num_iterations * len);
}
BENCHMARK(BM_Latin1_CI)

// Since the UCA collations are NO PAD, strnncollsp should heed spaces.
TEST(PadCollationTest, BasicTest) { constexpr char foo[] = "foo"; constexpr char foosp[] = "foo "; constexpr char bar[] = "bar"; constexpr char foobar[] = "foobar"; CHARSET_INFO *cs = init_collation("utf8mb4_0900_ai_ci"); auto my_strnncollsp = cs->coll->strnncollsp; // "foo" == "foo" EXPECT_EQ(my_strnncollsp(cs, pointer_cast(foo), strlen(foo), pointer_cast(foo), strlen(foo)), 0); // "foo" < "foo " EXPECT_LT(my_strnncollsp(cs, pointer_cast(foo), strlen(foo), pointer_cast(foosp), strlen(foosp)), 0); // "foo" > "bar" EXPECT_GT(my_strnncollsp(cs, pointer_cast(foo), strlen(foo), pointer_cast(bar), strlen(bar)), 0); // "foo" < "foobar". EXPECT_LT(my_strnncollsp(cs, pointer_cast(foo), strlen(foo), pointer_cast(foobar), strlen(foobar)), 0); // Exactly the same tests in reverse. // "foo " > "foo" EXPECT_GT( my_strnncollsp(cs, pointer_cast(foosp), strlen(foosp), pointer_cast(foo), strlen(foo)), 0); // "bar" < "foo" EXPECT_LT(my_strnncollsp(cs, pointer_cast(bar), strlen(bar), pointer_cast(foo), strlen(foo)), 0); // "foobar" > "foo". EXPECT_GT( my_strnncollsp(cs, pointer_cast(foobar), strlen(foobar), pointer_cast(foo), strlen(foo)), 0); } TEST(StrxfrmTest, NoPadCollation) { CHARSET_INFO *ai_ci = init_collation("utf8mb4_0900_ai_ci"); CHARSET_INFO *as_cs = init_collation("utf8mb4_0900_as_cs"); CHARSET_INFO *as_ci = init_collation("utf8mb4_0900_as_ci"); // Basic sanity checks. EXPECT_EQ(compare_through_strxfrm(ai_ci, "abc", "abc"), 0); EXPECT_NE(compare_through_strxfrm(as_ci, "abc", "Ǎḅç"), 0); EXPECT_NE(compare_through_strxfrm(ai_ci, "abc", "def"), 0); EXPECT_NE(compare_through_strxfrm(as_ci, "abc", "def"), 0); // Spaces from the end should matter, no matter the collation. EXPECT_LT(compare_through_strxfrm(ai_ci, "abc", "abc "), 0); EXPECT_LT(compare_through_strxfrm(as_ci, "abc", "Ǎḅç "), 0); EXPECT_LT(compare_through_strxfrm(as_cs, "abc", "abc "), 0); EXPECT_LT(compare_through_strxfrm(as_cs, "abc", "Abc "), 0); // Same with other types of spaces. 
EXPECT_LT(compare_through_strxfrm(ai_ci, "abc", u8"abc \u00a0"), 0); // Non-breaking space should compare _equal_ to space in ai_ci and as_ci, // but _after_ in as_cs. EXPECT_EQ(compare_through_strxfrm(ai_ci, "abc ", u8"abc\u00a0"), 0); EXPECT_EQ(compare_through_strxfrm(as_ci, "abc ", u8"abc\u00a0"), 0); EXPECT_LT(compare_through_strxfrm(as_cs, "abc ", u8"abc\u00a0"), 0); // Also in the middle of the string. EXPECT_EQ(compare_through_strxfrm(ai_ci, "a c", u8"a\u00a0c"), 0); EXPECT_EQ(compare_through_strxfrm(as_ci, "a c", u8"a\u00a0c"), 0); EXPECT_LT(compare_through_strxfrm(as_cs, "a c", u8"a\u00a0c"), 0); // Verify that space in the middle of the string isn't stripped. EXPECT_LT(compare_through_strxfrm(ai_ci, "ab c", "abc"), 0); EXPECT_LT(compare_through_strxfrm(as_ci, "ab c", "abc"), 0); EXPECT_LT(compare_through_strxfrm(as_cs, "ab c", "abc"), 0); // Whitespace ordering as specified by DUCET. EXPECT_GT(compare_through_strxfrm(as_ci, " ", "\t"), 0); EXPECT_GT(compare_through_strxfrm(as_cs, " ", "\t"), 0); EXPECT_LT(compare_through_strxfrm(as_cs, "", "\t"), 0); } TEST(StrxfrmTest, Contractions) { CHARSET_INFO *hu_ai_ci = init_collation("utf8mb4_hu_0900_ai_ci"); // Basic sanity checks. EXPECT_EQ(compare_through_strxfrm(hu_ai_ci, "abc", "abc"), 0); EXPECT_NE(compare_through_strxfrm(hu_ai_ci, "abc", "def"), 0); EXPECT_EQ(compare_through_strxfrm(hu_ai_ci, "abc", "Abc"), 0); // "cs" counts as a separate letter, where c < cs < d, so: EXPECT_LT(compare_through_strxfrm(hu_ai_ci, "c", "cs"), 0); EXPECT_LT(compare_through_strxfrm(hu_ai_ci, "cs", "d"), 0); EXPECT_LT(compare_through_strxfrm(hu_ai_ci, "ct", "cst"), 0); EXPECT_LT(compare_through_strxfrm(hu_ai_ci, "cst", "dt"), 0); // Wikipedia gives this as an example. EXPECT_LT(compare_through_strxfrm(hu_ai_ci, "cukor", "csak"), 0); } /* This test is disabled by default since it needs ~10 seconds to run, even in optimized mode. 
*/ TEST(BitfiddlingTest, DISABLED_FastOutOfRange) { unsigned char bytes[4]; for (int a = 0; a < 256; ++a) { bytes[0] = a; for (int b = 0; b < 256; ++b) { bytes[1] = b; for (int c = 0; c < 256; ++c) { bytes[2] = c; for (int d = 0; d < 256; ++d) { bytes[3] = d; bool any_out_of_range_slow = (a < 0x20 || a > 0x7e) || (b < 0x20 || b > 0x7e) || (c < 0x20 || c > 0x7e) || (d < 0x20 || d > 0x7e); uint32 four_bytes; memcpy(&four_bytes, bytes, sizeof(four_bytes)); bool any_out_of_range_fast = (((four_bytes + 0x01010101u) & 0x80808080) || ((four_bytes - 0x20202020u) & 0x80808080)); EXPECT_EQ(any_out_of_range_slow, any_out_of_range_fast); } } } } } /* A version of FastOutOfRange that tests the analogous trick for 16-bit integers instead (much, much faster). */ TEST(BitfiddlingTest, FastOutOfRange16) { unsigned char bytes[2]; for (int a = 0; a < 256; ++a) { bytes[0] = a; for (int b = 0; b < 256; ++b) { bytes[1] = b; bool any_out_of_range_slow = (a < 0x20 || a > 0x7e) || (b < 0x20 || b > 0x7e); uint16 two_bytes; memcpy(&two_bytes, bytes, sizeof(two_bytes)); bool any_out_of_range_fast = (((two_bytes + uint16{0x0101}) & uint16{0x8080}) || ((two_bytes - uint16{0x2020}) & uint16{0x8080})); EXPECT_EQ(any_out_of_range_slow, any_out_of_range_fast); } } } uint64 hash(CHARSET_INFO *cs, const char *str) { uint64 nr1 = 1, nr2 = 4; cs->coll->hash_sort(cs, pointer_cast(str), strlen(str), &nr1, &nr2); return nr1; } /* NOTE: In this entire test, there's an infinitesimal chance that something that we expect doesn't match, still matches by pure accident. */ TEST(PadCollationTest, HashSort) { CHARSET_INFO *ai_ci = init_collation("utf8mb4_0900_ai_ci"); CHARSET_INFO *as_cs = init_collation("utf8mb4_0900_as_cs"); // Basic sanity checks. EXPECT_EQ(hash(ai_ci, "abc"), hash(ai_ci, "abc")); EXPECT_NE(hash(ai_ci, "abc"), hash(ai_ci, "def")); // Spaces from the end should matter, no matter the collation. 
EXPECT_NE(hash(ai_ci, "abc"), hash(ai_ci, "abc ")); EXPECT_NE(hash(as_cs, "abc"), hash(as_cs, "abc ")); EXPECT_NE(hash(as_cs, "abc"), hash(as_cs, "Abc ")); // Same with other types of spaces. EXPECT_NE(hash(ai_ci, "abc"), hash(ai_ci, u8"abc \u00a0")); // Non-breaking space should compare _equal_ to space in ai_ci, // but _inequal_ in as_cs. EXPECT_EQ(hash(ai_ci, "abc "), hash(ai_ci, u8"abc\u00a0")); EXPECT_NE(hash(as_cs, "abc "), hash(as_cs, u8"abc\u00a0")); EXPECT_NE(hash(as_cs, "abc"), hash(as_cs, u8"abc\u00a0")); // Also in the middle of the string. EXPECT_EQ(hash(ai_ci, "a c"), hash(ai_ci, u8"a\u00a0c")); EXPECT_NE(hash(as_cs, "a c"), hash(as_cs, u8"a\u00a0c")); // Verify that space in the middle of the string isn't stripped. EXPECT_NE(hash(ai_ci, "ab c"), hash(ai_ci, "abc")); EXPECT_NE(hash(as_cs, "ab c"), hash(as_cs, "abc")); } TEST(HashTest, NullPointer) { CHARSET_INFO *cs = init_collation("utf8mb4_0900_ai_ci"); uint64 nr1 = 1, nr2 = 4; /* We should get the same hash from the empty string no matter what the pointer is. */ cs->coll->hash_sort(cs, nullptr, 0, &nr1, &nr2); EXPECT_EQ(nr1, hash(cs, "")); cs->coll->hash_sort(cs, pointer_cast(" "), 8, &nr1, &nr2); // Don't care what the values are, just that we don't crash. } namespace { // Test that strnxfrmlen() holds for all single characters. void test_strnxfrmlen(CHARSET_INFO *cs) { pair longest{0, 0}; uchar inbuf[16], outbuf[256]; // Ought to be enough for anyone. const size_t max_len = cs->coll->strnxfrmlen(cs, cs->mbmaxlen); for (my_wc_t ch = 0; ch <= 0x10ffff; ++ch) { size_t in_len = cs->cset->wc_mb(cs, ch, inbuf, inbuf + sizeof(inbuf)); if (in_len <= 0) { continue; // Not representable in this character set. 
} size_t out_len = cs->coll->strnxfrm(cs, outbuf, sizeof(outbuf), 1, inbuf, in_len, 0); EXPECT_LE(out_len, max_len); if (out_len > max_len) { fprintf(stderr, "U+%04lX needed more room than strnxfrmlen() claimed\n", ch); fprintf(stderr, "Weight string:"); for (size_t i = 0; i < out_len; ++i) { fprintf(stderr, " %02x", outbuf[i]); } fprintf(stderr, "\n\n"); } longest = max(longest, make_pair(out_len, ch)); } fprintf(stderr, "Longest character in '%s': U+%04lX, %d bytes (strnxfrm_len=%d)\n", cs->name, longest.second, static_cast(longest.first), static_cast(max_len)); } } // namespace TEST(StrxfrmLenTest, StrnxfrmLenIsLongEnoughForAllCharacters) { // Load one collation to get everything going. init_collation("utf8mb4_0900_ai_ci"); for (CHARSET_INFO *cs : all_charsets) { if (cs && (cs->state & MY_CS_AVAILABLE)) { SCOPED_TRACE(cs->name); test_strnxfrmlen(init_collation(cs->name)); } } } // Golden hashes for a test string. These may be stored on disk, so we need to // make sure that they never change. struct GoldenHashResult { pair hash_value; }; TEST(StrmxfrmHashTest, HashStability) { // Load one collation to get everything going. init_collation("utf8mb4_0900_ai_ci"); // Reference values. Please keep this list sorted. 
unordered_map expected = { {"armscii8_bin", {{0xb6240d9a0a0f7efcLL, 0x000002b0LL}}}, {"armscii8_general_ci", {{0xdae43ea5cabac97cLL, 0x000002b0LL}}}, {"ascii_bin", {{0xb6240d9a0a0f7efcLL, 0x000002b0LL}}}, {"ascii_general_ci", {{0xdae43ea5cabac97cLL, 0x000002b0LL}}}, {"big5_bin", {{0xb6240d9a0a0f7efcLL, 0x000002b0LL}}}, {"big5_chinese_ci", {{0xdae43ea5cabac97cLL, 0x000002b0LL}}}, {"binary", {{0xb6240d9a0a0f7efcLL, 0x000002b0LL}}}, {"cp1250_bin", {{0xb6240d9a0a0f7efcLL, 0x000002b0LL}}}, {"cp1250_croatian_ci", {{0xe25aa32298f78f4aLL, 0x000002b0LL}}}, {"cp1250_czech_cs", {{0xb6240d9a0a0f7efcLL, 0x000002b0LL}}}, {"cp1250_general_ci", {{0x81c46f6c6b06f8fcLL, 0x000002b0LL}}}, {"cp1250_polish_ci", {{0xe25aa32298f78f4aLL, 0x000002b0LL}}}, {"cp1251_bin", {{0xb6240d9a0a0f7efcLL, 0x000002b0LL}}}, {"cp1251_bulgarian_ci", {{0xdae43ea5cabac97cLL, 0x000002b0LL}}}, {"cp1251_general_ci", {{0xce71da5364c300a4LL, 0x000002b0LL}}}, {"cp1251_general_cs", {{0xff44ce45c6d3d142LL, 0x000002b0LL}}}, {"cp1251_ukrainian_ci", {{0xdae43ea5cabac97cLL, 0x000002b0LL}}}, {"cp1256_bin", {{0xb6240d9a0a0f7efcLL, 0x000002b0LL}}}, {"cp1256_general_ci", {{0x44ed84e7ad4a6c1cLL, 0x000002b0LL}}}, {"cp1257_bin", {{0xb6240d9a0a0f7efcLL, 0x000002b0LL}}}, {"cp1257_general_ci", {{0x15219f243a38ad58LL, 0x000002b0LL}}}, {"cp1257_lithuanian_ci", {{0xaa3ef638e5e056e8LL, 0x000002b0LL}}}, {"cp850_bin", {{0xb6240d9a0a0f7efcLL, 0x000002b0LL}}}, {"cp850_general_ci", {{0xf32b1cf4087a0b08LL, 0x000002b0LL}}}, {"cp852_bin", {{0xb6240d9a0a0f7efcLL, 0x000002b0LL}}}, {"cp852_general_ci", {{0x60dce9bffdeccd52LL, 0x000002b0LL}}}, {"cp866_bin", {{0xb6240d9a0a0f7efcLL, 0x000002b0LL}}}, {"cp866_general_ci", {{0xce71da5364c300a4LL, 0x000002b0LL}}}, {"cp932_bin", {{0xb6240d9a0a0f7efcLL, 0x000002b0LL}}}, {"cp932_japanese_ci", {{0xdae43ea5cabac97cLL, 0x000002b0LL}}}, {"dec8_bin", {{0xb6240d9a0a0f7efcLL, 0x000002b0LL}}}, {"dec8_swedish_ci", {{0xdae43ea5cabac97cLL, 0x000002b0LL}}}, {"eucjpms_bin", {{0xb6240d9a0a0f7efcLL, 0x000002b0LL}}}, 
{"eucjpms_japanese_ci", {{0xdae43ea5cabac97cLL, 0x000002b0LL}}}, {"euckr_bin", {{0xb6240d9a0a0f7efcLL, 0x000002b0LL}}}, {"euckr_korean_ci", {{0xdae43ea5cabac97cLL, 0x000002b0LL}}}, {"gb18030_bin", {{0xb6240d9a0a0f7efcLL, 0x000002b0LL}}}, {"gb18030_chinese_ci", {{0xb7b6676124243e73LL, 0x00000abdLL}}}, {"gb18030_unicode_520_ci", {{0x5c1f019a21e3d464LL, 0x0000055fLL}}}, {"gb2312_bin", {{0xb6240d9a0a0f7efcLL, 0x000002b0LL}}}, {"gb2312_chinese_ci", {{0xdae43ea5cabac97cLL, 0x000002b0LL}}}, {"gbk_bin", {{0xb6240d9a0a0f7efcLL, 0x000002b0LL}}}, {"gbk_chinese_ci", {{0xdae43ea5cabac97cLL, 0x000002b0LL}}}, {"geostd8_bin", {{0xb6240d9a0a0f7efcLL, 0x000002b0LL}}}, {"geostd8_general_ci", {{0xdae43ea5cabac97cLL, 0x000002b0LL}}}, {"greek_bin", {{0xb6240d9a0a0f7efcLL, 0x000002b0LL}}}, {"greek_general_ci", {{0xdae43ea5cabac97cLL, 0x000002b0LL}}}, {"hebrew_bin", {{0xb6240d9a0a0f7efcLL, 0x000002b0LL}}}, {"hebrew_general_ci", {{0xdae43ea5cabac97cLL, 0x000002b0LL}}}, {"hp8_bin", {{0xb6240d9a0a0f7efcLL, 0x000002b0LL}}}, {"hp8_english_ci", {{0xdae43ea5cabac97cLL, 0x000002b0LL}}}, {"keybcs2_bin", {{0xb6240d9a0a0f7efcLL, 0x000002b0LL}}}, {"keybcs2_general_ci", {{0xd2d54c0201229650LL, 0x000002b0LL}}}, {"koi8r_bin", {{0xb6240d9a0a0f7efcLL, 0x000002b0LL}}}, {"koi8r_general_ci", {{0xdae43ea5cabac97cLL, 0x000002b0LL}}}, {"koi8u_bin", {{0xb6240d9a0a0f7efcLL, 0x000002b0LL}}}, {"koi8u_general_ci", {{0xdae43ea5cabac97cLL, 0x000002b0LL}}}, {"latin1_bin", {{0xb6240d9a0a0f7efcLL, 0x000002b0LL}}}, {"latin1_danish_ci", {{0xdae43ea5cabac97cLL, 0x000002b0LL}}}, {"latin1_general_ci", {{0xd7d424d55cb8f402LL, 0x000002b0LL}}}, {"latin1_general_cs", {{0x96b2a3f94ffe41f9LL, 0x000002b0LL}}}, {"latin1_german1_ci", {{0xdae43ea5cabac97cLL, 0x000002b0LL}}}, {"latin1_german2_ci", {{0xdae43ea5cabac97cLL, 0x000002b0LL}}}, {"latin1_spanish_ci", {{0xd7d424d55cb8f402LL, 0x000002b0LL}}}, {"latin1_swedish_ci", {{0xdae43ea5cabac97cLL, 0x000002b0LL}}}, {"latin2_bin", {{0xb6240d9a0a0f7efcLL, 0x000002b0LL}}}, 
{"latin2_croatian_ci", {{0xe25aa32298f78f4aLL, 0x000002b0LL}}}, {"latin2_czech_cs", {{0xba89a4855c3a88b6LL, 0x000002b0LL}}}, {"latin2_general_ci", {{0xd9179195a5ddebf8LL, 0x000002b0LL}}}, {"latin2_hungarian_ci", {{0xba89a4855c3a88b6LL, 0x000002b0LL}}}, {"latin5_bin", {{0xb6240d9a0a0f7efcLL, 0x000002b0LL}}}, {"latin5_turkish_ci", {{0x68989a162aab9f1cLL, 0x000002b0LL}}}, {"latin7_bin", {{0xb6240d9a0a0f7efcLL, 0x000002b0LL}}}, {"latin7_estonian_cs", {{0xa281f3df87b89fe1LL, 0x000002b0LL}}}, {"latin7_general_ci", {{0xc6808727382ffb41LL, 0x000002b0LL}}}, {"latin7_general_cs", {{0xf70d2b9f0d640804LL, 0x000002b0LL}}}, {"macce_bin", {{0xb6240d9a0a0f7efcLL, 0x000002b0LL}}}, {"macce_general_ci", {{0xb27ca521eb9b7492LL, 0x000002b0LL}}}, {"macroman_bin", {{0xb6240d9a0a0f7efcLL, 0x000002b0LL}}}, {"macroman_general_ci", {{0x3254bac0fa3625efLL, 0x000002b0LL}}}, {"sjis_bin", {{0xb6240d9a0a0f7efcLL, 0x000002b0LL}}}, {"sjis_japanese_ci", {{0xdae43ea5cabac97cLL, 0x000002b0LL}}}, {"swe7_bin", {{0xb6240d9a0a0f7efcLL, 0x000002b0LL}}}, {"swe7_swedish_ci", {{0xdae43ea5cabac97cLL, 0x000002b0LL}}}, {"tis620_bin", {{0xb6240d9a0a0f7efcLL, 0x000002b0LL}}}, {"tis620_thai_ci", {{0xdae43ea5cabac97cLL, 0x000002b0LL}}}, {"ucs2_bin", {{0x1877f0a25b18b4c6LL, 0x0000055fLL}}}, {"ucs2_croatian_ci", {{0x3acdfaa93364f55cLL, 0x0000055fLL}}}, {"ucs2_czech_ci", {{0x1dc65c2738ed47c0LL, 0x00000553LL}}}, {"ucs2_danish_ci", {{0x3acdfaa93364f55cLL, 0x0000055fLL}}}, {"ucs2_esperanto_ci", {{0x3acdfaa93364f55cLL, 0x0000055fLL}}}, {"ucs2_estonian_ci", {{0x3acdfaa93364f55cLL, 0x0000055fLL}}}, {"ucs2_general_ci", {{0xfb66c3f2301bd579LL, 0x0000055fLL}}}, {"ucs2_general_mysql500_ci", {{0xfb66c3f2301bd579LL, 0x0000055fLL}}}, {"ucs2_german2_ci", {{0x3acdfaa93364f55cLL, 0x0000055fLL}}}, {"ucs2_hungarian_ci", {{0x3acdfaa93364f55cLL, 0x0000055fLL}}}, {"ucs2_icelandic_ci", {{0x3acdfaa93364f55cLL, 0x0000055fLL}}}, {"ucs2_latvian_ci", {{0x6473871765c3455cLL, 0x0000055fLL}}}, {"ucs2_lithuanian_ci", {{0xccb8395ef1969f40LL, 
0x00000553LL}}}, {"ucs2_persian_ci", {{0x3acdfaa93364f55cLL, 0x0000055fLL}}}, {"ucs2_polish_ci", {{0x3acdfaa93364f55cLL, 0x0000055fLL}}}, {"ucs2_roman_ci", {{0xf40d4b3c957fccdcLL, 0x0000055fLL}}}, {"ucs2_romanian_ci", {{0x3acdfaa93364f55cLL, 0x0000055fLL}}}, {"ucs2_sinhala_ci", {{0x3acdfaa93364f55cLL, 0x0000055fLL}}}, {"ucs2_slovak_ci", {{0x1dc65c2738ed47c0LL, 0x00000553LL}}}, {"ucs2_slovenian_ci", {{0x3acdfaa93364f55cLL, 0x0000055fLL}}}, {"ucs2_spanish2_ci", {{0x3e79d9277da1beb4LL, 0x00000547LL}}}, {"ucs2_spanish_ci", {{0x3acdfaa93364f55cLL, 0x0000055fLL}}}, {"ucs2_swedish_ci", {{0x3acdfaa93364f55cLL, 0x0000055fLL}}}, {"ucs2_turkish_ci", {{0x3fb28acb6e515c9cLL, 0x0000055fLL}}}, {"ucs2_unicode_520_ci", {{0x5c1f019a21e3d464LL, 0x0000055fLL}}}, {"ucs2_unicode_ci", {{0x3acdfaa93364f55cLL, 0x0000055fLL}}}, {"ucs2_vietnamese_ci", {{0x3acdfaa93364f55cLL, 0x0000055fLL}}}, {"ujis_bin", {{0xb6240d9a0a0f7efcLL, 0x000002b0LL}}}, {"ujis_japanese_ci", {{0xdae43ea5cabac97cLL, 0x000002b0LL}}}, {"utf16_bin", {{0x1877f0a25b18b4c6LL, 0x0000055fLL}}}, {"utf16_croatian_ci", {{0x3acdfaa93364f55cLL, 0x0000055fLL}}}, {"utf16_czech_ci", {{0x1dc65c2738ed47c0LL, 0x00000553LL}}}, {"utf16_danish_ci", {{0x3acdfaa93364f55cLL, 0x0000055fLL}}}, {"utf16_esperanto_ci", {{0x3acdfaa93364f55cLL, 0x0000055fLL}}}, {"utf16_estonian_ci", {{0x3acdfaa93364f55cLL, 0x0000055fLL}}}, {"utf16_general_ci", {{0xfb66c3f2301bd579LL, 0x0000055fLL}}}, {"utf16_german2_ci", {{0x3acdfaa93364f55cLL, 0x0000055fLL}}}, {"utf16_hungarian_ci", {{0x3acdfaa93364f55cLL, 0x0000055fLL}}}, {"utf16_icelandic_ci", {{0x3acdfaa93364f55cLL, 0x0000055fLL}}}, {"utf16_latvian_ci", {{0x6473871765c3455cLL, 0x0000055fLL}}}, {"utf16_lithuanian_ci", {{0xccb8395ef1969f40LL, 0x00000553LL}}}, {"utf16_persian_ci", {{0x3acdfaa93364f55cLL, 0x0000055fLL}}}, {"utf16_polish_ci", {{0x3acdfaa93364f55cLL, 0x0000055fLL}}}, {"utf16_roman_ci", {{0xf40d4b3c957fccdcLL, 0x0000055fLL}}}, {"utf16_romanian_ci", {{0x3acdfaa93364f55cLL, 0x0000055fLL}}}, 
{"utf16_sinhala_ci", {{0x3acdfaa93364f55cLL, 0x0000055fLL}}}, {"utf16_slovak_ci", {{0x1dc65c2738ed47c0LL, 0x00000553LL}}}, {"utf16_slovenian_ci", {{0x3acdfaa93364f55cLL, 0x0000055fLL}}}, {"utf16_spanish2_ci", {{0x3e79d9277da1beb4LL, 0x00000547LL}}}, {"utf16_spanish_ci", {{0x3acdfaa93364f55cLL, 0x0000055fLL}}}, {"utf16_swedish_ci", {{0x3acdfaa93364f55cLL, 0x0000055fLL}}}, {"utf16_turkish_ci", {{0x3fb28acb6e515c9cLL, 0x0000055fLL}}}, {"utf16_unicode_520_ci", {{0x5c1f019a21e3d464LL, 0x0000055fLL}}}, {"utf16_unicode_ci", {{0x3acdfaa93364f55cLL, 0x0000055fLL}}}, {"utf16_vietnamese_ci", {{0x3acdfaa93364f55cLL, 0x0000055fLL}}}, {"utf16le_bin", {{0x3da26ce08ecbfaf9LL, 0x0000055fLL}}}, {"utf16le_general_ci", {{0xfb66c3f2301bd579LL, 0x0000055fLL}}}, {"utf32_bin", {{0x353330032692faLL, 0x00000abdLL}}}, {"utf32_croatian_ci", {{0x3acdfaa93364f55cLL, 0x0000055fLL}}}, {"utf32_czech_ci", {{0x1dc65c2738ed47c0LL, 0x00000553LL}}}, {"utf32_danish_ci", {{0x3acdfaa93364f55cLL, 0x0000055fLL}}}, {"utf32_esperanto_ci", {{0x3acdfaa93364f55cLL, 0x0000055fLL}}}, {"utf32_estonian_ci", {{0x3acdfaa93364f55cLL, 0x0000055fLL}}}, {"utf32_general_ci", {{0x353330032692faLL, 0x00000abdLL}}}, {"utf32_german2_ci", {{0x3acdfaa93364f55cLL, 0x0000055fLL}}}, {"utf32_hungarian_ci", {{0x3acdfaa93364f55cLL, 0x0000055fLL}}}, {"utf32_icelandic_ci", {{0x3acdfaa93364f55cLL, 0x0000055fLL}}}, {"utf32_latvian_ci", {{0x6473871765c3455cLL, 0x0000055fLL}}}, {"utf32_lithuanian_ci", {{0xccb8395ef1969f40LL, 0x00000553LL}}}, {"utf32_persian_ci", {{0x3acdfaa93364f55cLL, 0x0000055fLL}}}, {"utf32_polish_ci", {{0x3acdfaa93364f55cLL, 0x0000055fLL}}}, {"utf32_roman_ci", {{0xf40d4b3c957fccdcLL, 0x0000055fLL}}}, {"utf32_romanian_ci", {{0x3acdfaa93364f55cLL, 0x0000055fLL}}}, {"utf32_sinhala_ci", {{0x3acdfaa93364f55cLL, 0x0000055fLL}}}, {"utf32_slovak_ci", {{0x1dc65c2738ed47c0LL, 0x00000553LL}}}, {"utf32_slovenian_ci", {{0x3acdfaa93364f55cLL, 0x0000055fLL}}}, {"utf32_spanish2_ci", {{0x3e79d9277da1beb4LL, 0x00000547LL}}}, 
{"utf32_spanish_ci", {{0x3acdfaa93364f55cLL, 0x0000055fLL}}}, {"utf32_swedish_ci", {{0x3acdfaa93364f55cLL, 0x0000055fLL}}}, {"utf32_turkish_ci", {{0x3fb28acb6e515c9cLL, 0x0000055fLL}}}, {"utf32_unicode_520_ci", {{0x5c1f019a21e3d464LL, 0x0000055fLL}}}, {"utf32_unicode_ci", {{0x3acdfaa93364f55cLL, 0x0000055fLL}}}, {"utf32_vietnamese_ci", {{0x3acdfaa93364f55cLL, 0x0000055fLL}}}, {"utf8_bin", {{0xb6240d9a0a0f7efcLL, 0x000002b0LL}}}, {"utf8_croatian_ci", {{0x3acdfaa93364f55cLL, 0x0000055fLL}}}, {"utf8_czech_ci", {{0x1dc65c2738ed47c0LL, 0x00000553LL}}}, {"utf8_danish_ci", {{0x3acdfaa93364f55cLL, 0x0000055fLL}}}, {"utf8_esperanto_ci", {{0x3acdfaa93364f55cLL, 0x0000055fLL}}}, {"utf8_estonian_ci", {{0x3acdfaa93364f55cLL, 0x0000055fLL}}}, {"utf8_general_ci", {{0xfb66c3f2301bd579LL, 0x0000055fLL}}}, {"utf8_general_mysql500_ci", {{0xfb66c3f2301bd579LL, 0x0000055fLL}}}, {"utf8_german2_ci", {{0x3acdfaa93364f55cLL, 0x0000055fLL}}}, {"utf8_hungarian_ci", {{0x3acdfaa93364f55cLL, 0x0000055fLL}}}, {"utf8_icelandic_ci", {{0x3acdfaa93364f55cLL, 0x0000055fLL}}}, {"utf8_latvian_ci", {{0x6473871765c3455cLL, 0x0000055fLL}}}, {"utf8_lithuanian_ci", {{0xccb8395ef1969f40LL, 0x00000553LL}}}, {"utf8_persian_ci", {{0x3acdfaa93364f55cLL, 0x0000055fLL}}}, {"utf8_polish_ci", {{0x3acdfaa93364f55cLL, 0x0000055fLL}}}, {"utf8_roman_ci", {{0xf40d4b3c957fccdcLL, 0x0000055fLL}}}, {"utf8_romanian_ci", {{0x3acdfaa93364f55cLL, 0x0000055fLL}}}, {"utf8_sinhala_ci", {{0x3acdfaa93364f55cLL, 0x0000055fLL}}}, {"utf8_slovak_ci", {{0x1dc65c2738ed47c0LL, 0x00000553LL}}}, {"utf8_slovenian_ci", {{0x3acdfaa93364f55cLL, 0x0000055fLL}}}, {"utf8_spanish2_ci", {{0x3e79d9277da1beb4LL, 0x00000547LL}}}, {"utf8_spanish_ci", {{0x3acdfaa93364f55cLL, 0x0000055fLL}}}, {"utf8_swedish_ci", {{0x3acdfaa93364f55cLL, 0x0000055fLL}}}, {"utf8_tolower_ci", {{0x8eab9a2c403c8eb9LL, 0x0000055fLL}}}, {"utf8_turkish_ci", {{0x3fb28acb6e515c9cLL, 0x0000055fLL}}}, {"utf8_unicode_520_ci", {{0x5c1f019a21e3d464LL, 0x0000055fLL}}}, {"utf8_unicode_ci", 
{{0x3acdfaa93364f55cLL, 0x0000055fLL}}}, {"utf8_vietnamese_ci", {{0x3acdfaa93364f55cLL, 0x0000055fLL}}}, {"utf8mb4_0900_ai_ci", {{0x3329a425d0f7f8d3LL, 0x00000001LL}}}, {"utf8mb4_0900_as_ci", {{0xfc978781d49d0d9bLL, 0x00000001LL}}}, {"utf8mb4_0900_as_cs", {{0xcfb3e3073c9f5a19LL, 0x00000001LL}}}, {"utf8mb4_0900_bin", {{0xb6240d9a0a0f7efcLL, 0x000002b0LL}}}, {"utf8mb4_bin", {{0xb6240d9a0a0f7efcLL, 0x000002b0LL}}}, {"utf8mb4_croatian_ci", {{0x3acdfaa93364f55cLL, 0x0000055fLL}}}, {"utf8mb4_cs_0900_ai_ci", {{0x36582be4fafa0bbbLL, 0x00000001LL}}}, {"utf8mb4_cs_0900_as_cs", {{0xac403419684d8c71LL, 0x00000001LL}}}, {"utf8mb4_czech_ci", {{0x1dc65c2738ed47c0LL, 0x00000553LL}}}, {"utf8mb4_da_0900_ai_ci", {{0x3329a425d0f7f8d3LL, 0x00000001LL}}}, {"utf8mb4_da_0900_as_cs", {{0xbd24fdcb7b0cf519LL, 0x00000001LL}}}, {"utf8mb4_danish_ci", {{0x3acdfaa93364f55cLL, 0x0000055fLL}}}, {"utf8mb4_de_pb_0900_ai_ci", {{0x3329a425d0f7f8d3LL, 0x00000001LL}}}, {"utf8mb4_de_pb_0900_as_cs", {{0xcfb3e3073c9f5a19LL, 0x00000001LL}}}, {"utf8mb4_eo_0900_ai_ci", {{0x3329a425d0f7f8d3LL, 0x00000001LL}}}, {"utf8mb4_eo_0900_as_cs", {{0xcfb3e3073c9f5a19LL, 0x00000001LL}}}, {"utf8mb4_es_0900_ai_ci", {{0x3329a425d0f7f8d3LL, 0x00000001LL}}}, {"utf8mb4_es_0900_as_cs", {{0xcfb3e3073c9f5a19LL, 0x00000001LL}}}, {"utf8mb4_es_trad_0900_ai_ci", {{0x555a77b8a263f17fLL, 0x00000001LL}}}, {"utf8mb4_es_trad_0900_as_cs", {{0xae993a138c5c030dLL, 0x00000001LL}}}, {"utf8mb4_esperanto_ci", {{0x3acdfaa93364f55cLL, 0x0000055fLL}}}, {"utf8mb4_estonian_ci", {{0x3acdfaa93364f55cLL, 0x0000055fLL}}}, {"utf8mb4_et_0900_ai_ci", {{0x3329a425d0f7f8d3LL, 0x00000001LL}}}, {"utf8mb4_et_0900_as_cs", {{0xcfb3e3073c9f5a19LL, 0x00000001LL}}}, {"utf8mb4_general_ci", {{0xfb66c3f2301bd579LL, 0x0000055fLL}}}, {"utf8mb4_german2_ci", {{0x3acdfaa93364f55cLL, 0x0000055fLL}}}, {"utf8mb4_hr_0900_ai_ci", {{0x3329a425d0f7f8d3LL, 0x00000001LL}}}, {"utf8mb4_hr_0900_as_cs", {{0xcfb3e3073c9f5a19LL, 0x00000001LL}}}, {"utf8mb4_hu_0900_ai_ci", 
{{0x3162e9e9cebb9148LL, 0x00000001LL}}}, {"utf8mb4_hu_0900_as_cs", {{0x88842661c548eec1LL, 0x00000001LL}}}, {"utf8mb4_hungarian_ci", {{0x3acdfaa93364f55cLL, 0x0000055fLL}}}, {"utf8mb4_icelandic_ci", {{0x3acdfaa93364f55cLL, 0x0000055fLL}}}, {"utf8mb4_is_0900_ai_ci", {{0x3329a425d0f7f8d3LL, 0x00000001LL}}}, {"utf8mb4_is_0900_as_cs", {{0xcfb3e3073c9f5a19LL, 0x00000001LL}}}, {"utf8mb4_ja_0900_as_cs", {{0xcfb3e3073c9f5a19LL, 0x00000001LL}}}, {"utf8mb4_ja_0900_as_cs_ks", {{0xcfb3e3073c9f5a19LL, 0x00000001LL}}}, {"utf8mb4_la_0900_ai_ci", {{0x2928cd07bca9a85dLL, 0x00000001LL}}}, {"utf8mb4_la_0900_as_cs", {{0x29a7f3eb43a9819LL, 0x00000001LL}}}, {"utf8mb4_latvian_ci", {{0x6473871765c3455cLL, 0x0000055fLL}}}, {"utf8mb4_lithuanian_ci", {{0xccb8395ef1969f40LL, 0x00000553LL}}}, {"utf8mb4_lt_0900_ai_ci", {{0xcd5ce469f67f6792LL, 0x00000001LL}}}, {"utf8mb4_lt_0900_as_cs", {{0xe2e6dc41a4d6b3c1LL, 0x00000001LL}}}, {"utf8mb4_lv_0900_ai_ci", {{0xcd5ce469f67f6792LL, 0x00000001LL}}}, {"utf8mb4_lv_0900_as_cs", {{0xfe377cec9551f0f4LL, 0x00000001LL}}}, {"utf8mb4_persian_ci", {{0x3acdfaa93364f55cLL, 0x0000055fLL}}}, {"utf8mb4_pl_0900_ai_ci", {{0x3329a425d0f7f8d3LL, 0x00000001LL}}}, {"utf8mb4_pl_0900_as_cs", {{0xcfb3e3073c9f5a19LL, 0x00000001LL}}}, {"utf8mb4_polish_ci", {{0x3acdfaa93364f55cLL, 0x0000055fLL}}}, {"utf8mb4_ro_0900_ai_ci", {{0x3329a425d0f7f8d3LL, 0x00000001LL}}}, {"utf8mb4_ro_0900_as_cs", {{0xcfb3e3073c9f5a19LL, 0x00000001LL}}}, {"utf8mb4_roman_ci", {{0xf40d4b3c957fccdcLL, 0x0000055fLL}}}, {"utf8mb4_romanian_ci", {{0x3acdfaa93364f55cLL, 0x0000055fLL}}}, {"utf8mb4_ru_0900_ai_ci", {{0xb55bc2bf5ab2bf53LL, 0x00000001LL}}}, {"utf8mb4_ru_0900_as_cs", {{0x36f5a31292841899LL, 0x00000001LL}}}, {"utf8mb4_sinhala_ci", {{0x3acdfaa93364f55cLL, 0x0000055fLL}}}, {"utf8mb4_sk_0900_ai_ci", {{0x36582be4fafa0bbbLL, 0x00000001LL}}}, {"utf8mb4_sk_0900_as_cs", {{0xac403419684d8c71LL, 0x00000001LL}}}, {"utf8mb4_sl_0900_ai_ci", {{0x3329a425d0f7f8d3LL, 0x00000001LL}}}, {"utf8mb4_sl_0900_as_cs", 
{{0xcfb3e3073c9f5a19LL, 0x00000001LL}}}, {"utf8mb4_slovak_ci", {{0x1dc65c2738ed47c0LL, 0x00000553LL}}}, {"utf8mb4_slovenian_ci", {{0x3acdfaa93364f55cLL, 0x0000055fLL}}}, {"utf8mb4_spanish2_ci", {{0x3e79d9277da1beb4LL, 0x00000547LL}}}, {"utf8mb4_spanish_ci", {{0x3acdfaa93364f55cLL, 0x0000055fLL}}}, {"utf8mb4_sv_0900_ai_ci", {{0x3329a425d0f7f8d3LL, 0x00000001LL}}}, {"utf8mb4_sv_0900_as_cs", {{0xcfb3e3073c9f5a19LL, 0x00000001LL}}}, {"utf8mb4_swedish_ci", {{0x3acdfaa93364f55cLL, 0x0000055fLL}}}, {"utf8mb4_tr_0900_ai_ci", {{0x7ea67be76364740fLL, 0x00000001LL}}}, {"utf8mb4_tr_0900_as_cs", {{0xfa4556e24336675eLL, 0x00000001LL}}}, {"utf8mb4_turkish_ci", {{0x3fb28acb6e515c9cLL, 0x0000055fLL}}}, {"utf8mb4_unicode_520_ci", {{0x5c1f019a21e3d464LL, 0x0000055fLL}}}, {"utf8mb4_unicode_ci", {{0x3acdfaa93364f55cLL, 0x0000055fLL}}}, {"utf8mb4_vi_0900_ai_ci", {{0x3329a425d0f7f8d3LL, 0x00000001LL}}}, {"utf8mb4_vi_0900_as_cs", {{0xcfb3e3073c9f5a19LL, 0x00000001LL}}}, {"utf8mb4_vietnamese_ci", {{0x3acdfaa93364f55cLL, 0x0000055fLL}}}, {"utf8mb4_zh_0900_as_cs", {{0x23c370d9ac589d1fLL, 0x00000001LL}}}, }; string test_str = "This is a fairly long string. It does not contain any special " "characters since they are probably not universally supported across all " "character sets, but should at least be enough to make the nr1 value go " "up past the 32-bit mark."; for (CHARSET_INFO *cs : all_charsets) { if (cs && (cs->state & MY_CS_AVAILABLE)) { init_collation(cs->name); char buf[4096]; uint errors; size_t len = my_convert(buf, sizeof(buf), cs, test_str.data(), test_str.size(), &my_charset_utf8mb4_0900_ai_ci, &errors); ASSERT_EQ(0, errors); uint64 nr1 = 4, nr2 = 1; cs->coll->hash_sort(cs, pointer_cast(buf), len, &nr1, &nr2); // Change this from false to true to output source code you can paste // into “expected” above. 
if (false) { printf(" {\"%s\", {{0x%016" PRIx64 "LL, 0x%" PRIx64 "LL}}},\n", cs->name, nr1, nr2); continue; } ASSERT_EQ(1, expected.count(cs->name)) << "Character set " << cs->name << " is missing in the database"; SCOPED_TRACE(cs->name); EXPECT_EQ(expected[cs->name].hash_value.first, nr1); EXPECT_EQ(expected[cs->name].hash_value.second, nr2); } } } } // namespace strnxfrm_unittest