From 7d00536d4eb7fad1faec28698531e19f2822522c Mon Sep 17 00:00:00 2001 From: BiteTheDDDDt Date: Fri, 13 Feb 2026 12:47:35 +0800 Subject: [PATCH 1/4] add zlib_crc32_fixed --- be/src/util/hash_util.hpp | 75 ++++- be/src/vec/columns/column_decimal.cpp | 11 +- be/src/vec/columns/column_vector.cpp | 46 ++- be/src/vec/columns/column_vector.h | 1 + be/test/util/crc32c_test.cpp | 401 ++++++++++++++++++++++++++ 5 files changed, 500 insertions(+), 34 deletions(-) diff --git a/be/src/util/hash_util.hpp b/be/src/util/hash_util.hpp index 9c5d4ef3aca539..d0d1ba4dbb9920 100644 --- a/be/src/util/hash_util.hpp +++ b/be/src/util/hash_util.hpp @@ -34,20 +34,93 @@ #include "util/hash/city.h" #include "util/murmur_hash3.h" #include "util/sse_util.hpp" +#include "vec/common/endian.h" namespace doris { #include "common/compile_check_begin.h" +namespace detail { +// Slicing-by-4 table: t[0] is the standard byte-at-a-time table, +// t[1..3] are extended tables for parallel 4-byte processing. +struct CRC32SliceBy4Table { + uint32_t t[4][256] {}; + constexpr CRC32SliceBy4Table() { + // t[0]: standard CRC32 lookup table + for (uint32_t i = 0; i < 256; i++) { + uint32_t c = i; + for (int j = 0; j < 8; j++) { + c = (c & 1) ? ((c >> 1) ^ 0xEDB88320U) : (c >> 1); + } + t[0][i] = c; + } + // t[1..3]: each entry is one additional CRC byte-step applied to t[k-1] + for (uint32_t i = 0; i < 256; i++) { + uint32_t c = t[0][i]; + for (int k = 1; k < 4; k++) { + c = t[0][c & 0xFF] ^ (c >> 8); + t[k][i] = c; + } + } + } +}; +} // namespace detail + // Utility class to compute hash values. class HashUtil { +private: + static inline constexpr detail::CRC32SliceBy4Table CRC32_TABLE {}; + public: static uint32_t zlib_crc_hash(const void* data, uint32_t bytes, uint32_t hash) { return (uint32_t)crc32(hash, (const unsigned char*)data, bytes); } + // Inline CRC32 (zlib-compatible, standard CRC32 polynomial) for fixed-size types. + // Uses Slicing-by-4 technique for 4/8-byte types: processes 4 bytes at a time using + // 4 precomputed lookup tables, reducing serial table lookups from 4 to 1 per 4-byte chunk. + // Polynomial: 0xEDB88320 (reflected form of 0x04C11DB7). + // Endian note: CRC32 reflected algorithm processes bytes in address order (byte[0] first). + // Slicing-by-4 requires byte[0] at LSB of the loaded uint32_t, which is little-endian layout. + // LittleEndian::Load32 provides this on ALL platforms: noop on LE, bswap on BE. + template + static uint32_t zlib_crc32_fixed(const T& value, uint32_t hash) { + const auto* p = reinterpret_cast(&value); + // zlib convention: pre/post XOR with 0xFFFFFFFF + uint32_t crc = hash ^ 0xFFFFFFFFU; + + if constexpr (sizeof(T) == 1) { + // 1 byte: single table lookup + crc = CRC32_TABLE.t[0][(crc ^ p[0]) & 0xFF] ^ (crc >> 8); + } else if constexpr (sizeof(T) == 2) { + // 2 bytes: two sequential table lookups (slicing doesn't help below 4 bytes) + crc = CRC32_TABLE.t[0][(crc ^ p[0]) & 0xFF] ^ (crc >> 8); + crc = CRC32_TABLE.t[0][(crc ^ p[1]) & 0xFF] ^ (crc >> 8); + } else if constexpr (sizeof(T) == 4) { + // 4 bytes: one Slicing-by-4 step — 4 independent lookups in parallel + // LittleEndian::Load32 handles unaligned load + byte-swap on big-endian, + // ensuring byte[0] is always at LSB for correct CRC byte processing order. + uint32_t word = LittleEndian::Load32(p) ^ crc; + crc = CRC32_TABLE.t[3][(word) & 0xFF] ^ CRC32_TABLE.t[2][(word >> 8) & 0xFF] ^ + CRC32_TABLE.t[1][(word >> 16) & 0xFF] ^ CRC32_TABLE.t[0][(word >> 24) & 0xFF]; + } else if constexpr (sizeof(T) == 8) { + // 8 bytes: two Slicing-by-4 steps + uint32_t word = LittleEndian::Load32(p) ^ crc; + crc = CRC32_TABLE.t[3][(word) & 0xFF] ^ CRC32_TABLE.t[2][(word >> 8) & 0xFF] ^ + CRC32_TABLE.t[1][(word >> 16) & 0xFF] ^ CRC32_TABLE.t[0][(word >> 24) & 0xFF]; + + word = LittleEndian::Load32(p + 4) ^ crc; + crc = CRC32_TABLE.t[3][(word) & 0xFF] ^ CRC32_TABLE.t[2][(word >> 8) & 0xFF] ^ + CRC32_TABLE.t[1][(word >> 16) & 0xFF] ^ CRC32_TABLE.t[0][(word >> 24) & 0xFF]; + } else { + // Fallback to zlib for larger/unusual types + return (uint32_t)crc32(hash, (const unsigned char*)&value, sizeof(T)); + } + return crc ^ 0xFFFFFFFFU; + } + static uint32_t zlib_crc_hash_null(uint32_t hash) { // null is treat as 0 when hash static const int INT_VALUE = 0; - return (uint32_t)crc32(hash, (const unsigned char*)(&INT_VALUE), 4); + return zlib_crc32_fixed(INT_VALUE, hash); } template diff --git a/be/src/vec/columns/column_decimal.cpp b/be/src/vec/columns/column_decimal.cpp index 94afcc6aa38354..90d126f49112db 100644 --- a/be/src/vec/columns/column_decimal.cpp +++ b/be/src/vec/columns/column_decimal.cpp @@ -170,7 +170,7 @@ void ColumnDecimal::update_crc_with_value(size_t start, size_t end, uint32_t& if (null_data == nullptr) { for (size_t i = start; i < end; i++) { if constexpr (T != TYPE_DECIMALV2) { - hash = HashUtil::zlib_crc_hash(&data[i], sizeof(value_type), hash); + hash = HashUtil::zlib_crc32_fixed(data[i], hash); } else { decimalv2_do_crc(i, hash); } @@ -179,7 +179,7 @@ void ColumnDecimal::update_crc_with_value(size_t start, size_t end, uint32_t& for (size_t i = start; i < end; i++) { if (null_data[i] == 0) { if constexpr (T != TYPE_DECIMALV2) { - hash = HashUtil::zlib_crc_hash(&data[i], sizeof(value_type), hash); + hash = HashUtil::zlib_crc32_fixed(data[i], hash); } else { decimalv2_do_crc(i, hash); } @@ -198,12 +198,13 @@ void ColumnDecimal::update_crcs_with_value(uint32_t* __restrict hashes, Primi if constexpr (T != TYPE_DECIMALV2) { if (null_data == nullptr) { for (size_t i = 0; i < s; i++) { - hashes[i] = HashUtil::zlib_crc_hash(&data[i], sizeof(value_type), hashes[i]); + hashes[i] = HashUtil::zlib_crc32_fixed(data[i], hashes[i]); } } else { for (size_t i = 0; i < s; i++) { - if (null_data[i] == 0) - hashes[i] = HashUtil::zlib_crc_hash(&data[i], sizeof(value_type), hashes[i]); + if (null_data[i] == 0) { + hashes[i] = HashUtil::zlib_crc32_fixed(data[i], hashes[i]); + } } } } else { diff --git a/be/src/vec/columns/column_vector.cpp b/be/src/vec/columns/column_vector.cpp index 21d7889e8258b6..a34390836cbe38 100644 --- a/be/src/vec/columns/column_vector.cpp +++ b/be/src/vec/columns/column_vector.cpp @@ -201,41 +201,31 @@ void ColumnVector::update_crcs_with_value(uint32_t* __restrict hashes, Primit auto s = rows; DCHECK(s == size()); - if constexpr (is_date_or_datetime(T)) { - char buf[64]; - auto date_convert_do_crc = [&](size_t i) { - const auto& date_val = (const VecDateTimeValue&)data[i]; - auto len = date_val.to_buffer(buf); - hashes[i] = HashUtil::zlib_crc_hash(buf, len, hashes[i]); - }; - - if (null_data == nullptr) { - for (size_t i = 0; i < s; i++) { - date_convert_do_crc(i); - } - } else { - for (size_t i = 0; i < s; i++) { - if (null_data[i] == 0) { - date_convert_do_crc(i); - } - } + if (null_data == nullptr) { + for (size_t i = 0; i < s; i++) { + hashes[i] = _zlib_crc32_hash(hashes[i], i); } } else { - if (null_data == nullptr) { - for (size_t i = 0; i < s; i++) { - hashes[i] = HashUtil::zlib_crc_hash( - &data[i], sizeof(typename PrimitiveTypeTraits::CppType), hashes[i]); - } - } else { - for (size_t i = 0; i < s; i++) { - if (null_data[i] == 0) - hashes[i] = HashUtil::zlib_crc_hash( - &data[i], sizeof(typename PrimitiveTypeTraits::CppType), hashes[i]); + for (size_t i = 0; i < s; i++) { + if (null_data[i] == 0) { + hashes[i] = _zlib_crc32_hash(hashes[i], i); } } } } +template +uint32_t ColumnVector::_zlib_crc32_hash(uint32_t hash, size_t idx) const { + if constexpr (is_date_or_datetime(T)) { + char buf[64]; + const auto& date_val = (const VecDateTimeValue&)data[idx]; + auto len = date_val.to_buffer(buf); + return HashUtil::zlib_crc_hash(buf, len, hash); + } else { + return HashUtil::zlib_crc32_fixed(data[idx], hash); + } +} + template uint32_t ColumnVector::_crc32c_hash(uint32_t hash, size_t idx) const { if constexpr (is_date_or_datetime(T)) { diff --git a/be/src/vec/columns/column_vector.h b/be/src/vec/columns/column_vector.h index fb3141d6485503..9e97eeae6b50bb 100644 --- a/be/src/vec/columns/column_vector.h +++ b/be/src/vec/columns/column_vector.h @@ -405,6 +405,7 @@ class ColumnVector final : public COWHelper> { } protected: + uint32_t _zlib_crc32_hash(uint32_t hash, size_t idx) const; uint32_t _crc32c_hash(uint32_t hash, size_t idx) const; Container data; }; diff --git a/be/test/util/crc32c_test.cpp b/be/test/util/crc32c_test.cpp index 5a6a7faa3a57ea..578cdfcea22d48 100644 --- a/be/test/util/crc32c_test.cpp +++ b/be/test/util/crc32c_test.cpp @@ -22,10 +22,14 @@ #include #include #include +#include +#include +#include #include #include "gtest/gtest_pred_impl.h" +#include "util/hash_util.hpp" #include "util/slice.h" namespace doris { @@ -75,3 +79,400 @@ TEST(CRC, Extend) { } } // namespace doris + +namespace doris { + +// Helper: compute crc32c via crc32c::Crc32c for a value of type T +template +uint32_t crc32c_reference(const T& value, uint32_t seed) { + return crc32c::Extend(seed, reinterpret_cast(&value), sizeof(T)); +} + +// Helper: compute zlib crc32 for a value of type T +template +uint32_t zlib_crc32_reference(const T& value, uint32_t seed) { + return HashUtil::zlib_crc_hash(&value, sizeof(T), seed); +} + +/* +todo: fix those caeses when we have new release verseion do not consider the compatibility issue +use following code to replace the old crc32c_fixed function in hash_util.hpp +template +static uint32_t crc32c_fixed(const T& value, uint32_t hash) { + uint32_t crc = hash ^ 0xFFFFFFFFU; + if constexpr (sizeof(T) == 1) { + crc = _mm_crc32_u8(crc, *reinterpret_cast(&value)); + } else if constexpr (sizeof(T) == 2) { + crc = _mm_crc32_u16(crc, *reinterpret_cast(&value)); + } else if constexpr (sizeof(T) == 4) { + crc = _mm_crc32_u32(crc, *reinterpret_cast(&value)); + } else if constexpr (sizeof(T) == 8) { + crc = (uint32_t)_mm_crc32_u64(crc, *reinterpret_cast(&value)); + } else { + return crc32c_extend(hash, (const uint8_t*)&value, sizeof(T)); + } + return crc ^ 0xFFFFFFFFU; +} +// ==================== crc32c_fixed tests ==================== +TEST(CRC32CFixed, Uint8Values) { + uint8_t values[] = {0, 1, 127, 128, 255}; + for (uint32_t seed : {0U, 1U, 0xFFFFFFFFU, 0xDEADBEEFU}) { + for (auto v : values) { + EXPECT_EQ(HashUtil::crc32c_fixed(v, seed), crc32c_reference(v, seed)) + << "uint8_t v=" << (int)v << " seed=" << seed; + } + } +} + +TEST(CRC32CFixed, Uint16Values) { + uint16_t values[] = {0, 1, 255, 256, 1000, 32767, 65535}; + for (uint32_t seed : {0U, 1U, 0xFFFFFFFFU, 0x12345678U}) { + for (auto v : values) { + EXPECT_EQ(HashUtil::crc32c_fixed(v, seed), crc32c_reference(v, seed)) + << "uint16_t v=" << v << " seed=" << seed; + } + } +} + +TEST(CRC32CFixed, Int32Values) { + int32_t values[] = {0, + 1, + -1, + 42, + -42, + 1000000, + -1000000, + std::numeric_limits::min(), + std::numeric_limits::max()}; + for (uint32_t seed : {0U, 0xFFFFFFFFU, 0xCAFEBABEU}) { + for (auto v : values) { + EXPECT_EQ(HashUtil::crc32c_fixed(v, seed), crc32c_reference(v, seed)) + << "int32_t v=" << v << " seed=" << seed; + } + } +} + +TEST(CRC32CFixed, Uint32Values) { + uint32_t values[] = {0, 1, 0xFF, 0xFFFF, 0xFFFFFFFF, 0xDEADBEEF, 0x12345678}; + for (uint32_t seed : {0U, 0xFFFFFFFFU, 0xABCD1234U}) { + for (auto v : values) { + EXPECT_EQ(HashUtil::crc32c_fixed(v, seed), crc32c_reference(v, seed)) + << "uint32_t v=" << v << " seed=" << seed; + } + } +} + +TEST(CRC32CFixed, Int64Values) { + int64_t values[] = {0, + 1, + -1, + 1000000000LL, + -1000000000LL, + std::numeric_limits::min(), + std::numeric_limits::max(), + 0x0102030405060708LL, + -0x0102030405060708LL}; + for (uint32_t seed : {0U, 0xFFFFFFFFU, 0x87654321U}) { + for (auto v : values) { + EXPECT_EQ(HashUtil::crc32c_fixed(v, seed), crc32c_reference(v, seed)) + << "int64_t v=" << v << " seed=" << seed; + } + } +} + +TEST(CRC32CFixed, Uint64Values) { + uint64_t values[] = {0, + 1, + 0xFFFFFFFFFFFFFFFFULL, + 0xDEADBEEFCAFEBABEULL, + 0x0123456789ABCDEFULL, + 0xFF00FF00FF00FF00ULL}; + for (uint32_t seed : {0U, 0xFFFFFFFFU, 0x11111111U}) { + for (auto v : values) { + EXPECT_EQ(HashUtil::crc32c_fixed(v, seed), crc32c_reference(v, seed)) + << "uint64_t v=" << v << " seed=" << seed; + } + } +} + +TEST(CRC32CFixed, FloatValues) { + float values[] = {0.0f, + -0.0f, + 1.0f, + -1.0f, + 3.14f, + std::numeric_limits::min(), + std::numeric_limits::max(), + std::numeric_limits::infinity()}; + for (uint32_t seed : {0U, 0xFFFFFFFFU}) { + for (auto v : values) { + EXPECT_EQ(HashUtil::crc32c_fixed(v, seed), crc32c_reference(v, seed)) + << "float v=" << v << " seed=" << seed; + } + } +} + +TEST(CRC32CFixed, DoubleValues) { + double values[] = {0.0, + -0.0, + 1.0, + -1.0, + 3.141592653589793, + 1e100, + -1e100, + std::numeric_limits::infinity()}; + for (uint32_t seed : {0U, 0xFFFFFFFFU}) { + for (auto v : values) { + EXPECT_EQ(HashUtil::crc32c_fixed(v, seed), crc32c_reference(v, seed)) + << "double v=" << v << " seed=" << seed; + } + } +} + +TEST(CRC32CFixed, NullHash) { + // crc32c_null should match crc32c_fixed with int(0) + for (uint32_t seed : {0U, 0xFFFFFFFFU, 0xDEADBEEFU}) { + int zero = 0; + EXPECT_EQ(HashUtil::crc32c_null(seed), HashUtil::crc32c_fixed(zero, seed)); + EXPECT_EQ(HashUtil::crc32c_null(seed), crc32c_reference(zero, seed)); + } +} +*/ +// ==================== zlib_crc32_fixed tests ==================== + +TEST(ZlibCRC32Fixed, Uint8Values) { + uint8_t values[] = {0, 1, 42, 127, 128, 255}; + for (uint32_t seed : {0U, 1U, 0xFFFFFFFFU, 0xDEADBEEFU}) { + for (auto v : values) { + EXPECT_EQ(HashUtil::zlib_crc32_fixed(v, seed), zlib_crc32_reference(v, seed)) + << "uint8_t v=" << (int)v << " seed=" << seed; + } + } +} + +TEST(ZlibCRC32Fixed, Int16Values) { + int16_t values[] = {0, 1, -1, 256, -256, 32767, -32768}; + for (uint32_t seed : {0U, 0xFFFFFFFFU, 0x12345678U}) { + for (auto v : values) { + EXPECT_EQ(HashUtil::zlib_crc32_fixed(v, seed), zlib_crc32_reference(v, seed)) + << "int16_t v=" << v << " seed=" << seed; + } + } +} + +TEST(ZlibCRC32Fixed, Uint16Values) { + uint16_t values[] = {0, 1, 255, 256, 1000, 32767, 65535}; + for (uint32_t seed : {0U, 0xFFFFFFFFU, 0xABCDEF00U}) { + for (auto v : values) { + EXPECT_EQ(HashUtil::zlib_crc32_fixed(v, seed), zlib_crc32_reference(v, seed)) + << "uint16_t v=" << v << " seed=" << seed; + } + } +} + +TEST(ZlibCRC32Fixed, Int32Values) { + int32_t values[] = {0, + 1, + -1, + 42, + -42, + 1000000, + -1000000, + std::numeric_limits::min(), + std::numeric_limits::max()}; + for (uint32_t seed : {0U, 0xFFFFFFFFU, 0xCAFEBABEU}) { + for (auto v : values) { + EXPECT_EQ(HashUtil::zlib_crc32_fixed(v, seed), zlib_crc32_reference(v, seed)) + << "int32_t v=" << v << " seed=" << seed; + } + } +} + +TEST(ZlibCRC32Fixed, Uint32Values) { + uint32_t values[] = {0, 1, 0xFF, 0xFFFF, 0xFFFFFFFF, 0xDEADBEEF, 0x12345678}; + for (uint32_t seed : {0U, 0xFFFFFFFFU, 0xABCD1234U}) { + for (auto v : values) { + EXPECT_EQ(HashUtil::zlib_crc32_fixed(v, seed), zlib_crc32_reference(v, seed)) + << "uint32_t v=" << v << " seed=" << seed; + } + } +} + +TEST(ZlibCRC32Fixed, Int64Values) { + int64_t values[] = {0, + 1, + -1, + 1000000000LL, + -1000000000LL, + std::numeric_limits::min(), + std::numeric_limits::max(), + 0x0102030405060708LL, + -0x0102030405060708LL}; + for (uint32_t seed : {0U, 0xFFFFFFFFU, 0x87654321U}) { + for (auto v : values) { + EXPECT_EQ(HashUtil::zlib_crc32_fixed(v, seed), zlib_crc32_reference(v, seed)) + << "int64_t v=" << v << " seed=" << seed; + } + } +} + +TEST(ZlibCRC32Fixed, Uint64Values) { + uint64_t values[] = {0, + 1, + 0xFFFFFFFFFFFFFFFFULL, + 0xDEADBEEFCAFEBABEULL, + 0x0123456789ABCDEFULL, + 0xFF00FF00FF00FF00ULL}; + for (uint32_t seed : {0U, 0xFFFFFFFFU, 0x11111111U}) { + for (auto v : values) { + EXPECT_EQ(HashUtil::zlib_crc32_fixed(v, seed), zlib_crc32_reference(v, seed)) + << "uint64_t v=" << v << " seed=" << seed; + } + } +} + +TEST(ZlibCRC32Fixed, FloatValues) { + float values[] = {0.0f, + -0.0f, + 1.0f, + -1.0f, + 3.14f, + 1e10f, + -1e10f, + std::numeric_limits::min(), + std::numeric_limits::max(), + std::numeric_limits::infinity()}; + for (uint32_t seed : {0U, 0xFFFFFFFFU}) { + for (auto v : values) { + EXPECT_EQ(HashUtil::zlib_crc32_fixed(v, seed), zlib_crc32_reference(v, seed)) + << "float v=" << v << " seed=" << seed; + } + } +} + +TEST(ZlibCRC32Fixed, DoubleValues) { + double values[] = {0.0, + -0.0, + 1.0, + -1.0, + 3.141592653589793, + 1e100, + -1e100, + 1e-300, + std::numeric_limits::infinity()}; + for (uint32_t seed : {0U, 0xFFFFFFFFU}) { + for (auto v : values) { + EXPECT_EQ(HashUtil::zlib_crc32_fixed(v, seed), zlib_crc32_reference(v, seed)) + << "double v=" << v << " seed=" << seed; + } + } +} + +TEST(ZlibCRC32Fixed, NullHash) { + // zlib_crc_hash_null should match zlib_crc32_fixed with int(0) + for (uint32_t seed : {0U, 0xFFFFFFFFU, 0xDEADBEEFU}) { + int zero = 0; + EXPECT_EQ(HashUtil::zlib_crc_hash_null(seed), HashUtil::zlib_crc32_fixed(zero, seed)); + EXPECT_EQ(HashUtil::zlib_crc_hash_null(seed), zlib_crc32_reference(zero, seed)); + } +} + +// ==================== Cross-validation: fixed vs non-fixed should differ ==================== + +TEST(CRC32Fixed, CRC32CVsZlibDiffer) { + // CRC32C and standard CRC32 use different polynomials, so results should differ + // (except possibly by coincidence on some values, but not systematically) + int32_t v = 12345678; + uint32_t seed = 0; + uint32_t crc32c_result = HashUtil::crc32c_fixed(v, seed); + uint32_t zlib_result = HashUtil::zlib_crc32_fixed(v, seed); + EXPECT_NE(crc32c_result, zlib_result) + << "CRC32C and zlib CRC32 should produce different results for non-trivial input"; +} + +// ==================== Chaining: verify incremental hashing ==================== + +TEST(CRC32CFixed, IncrementalChaining) { + // Hash two int32 values incrementally and compare with hashing 8 bytes at once + int32_t a = 0x11223344; + int32_t b = 0x55667788; + uint32_t seed = 0; + + uint32_t chained = HashUtil::crc32c_fixed(a, seed); + chained = HashUtil::crc32c_fixed(b, chained); + + // Reference: hash the 8 bytes sequentially via crc32c::Extend + uint8_t buf[8]; + memcpy(buf, &a, 4); + memcpy(buf + 4, &b, 4); + uint32_t reference = crc32c::Extend(seed, buf, 8); + + EXPECT_EQ(chained, reference); +} + +TEST(ZlibCRC32Fixed, IncrementalChaining) { + // Hash two int32 values incrementally and compare with hashing 8 bytes at once + int32_t a = 0x11223344; + int32_t b = 0x55667788; + uint32_t seed = 0; + + uint32_t chained = HashUtil::zlib_crc32_fixed(a, seed); + chained = HashUtil::zlib_crc32_fixed(b, chained); + + // Reference: hash the 8 bytes sequentially via zlib crc32 + uint8_t buf[8]; + memcpy(buf, &a, 4); + memcpy(buf + 4, &b, 4); + uint32_t reference = (uint32_t)crc32(seed, buf, 8); + + EXPECT_EQ(chained, reference); +} + +// ==================== Exhaustive 1-byte test ==================== + +TEST(CRC32CFixed, AllByteValues) { + for (int i = 0; i <= 255; i++) { + uint8_t v = static_cast(i); + uint32_t seed = 0x12345678U; + EXPECT_EQ(HashUtil::crc32c_fixed(v, seed), crc32c_reference(v, seed)) << "byte=" << i; + } +} + +TEST(ZlibCRC32Fixed, AllByteValues) { + for (int i = 0; i <= 255; i++) { + uint8_t v = static_cast(i); + uint32_t seed = 0x12345678U; + EXPECT_EQ(HashUtil::zlib_crc32_fixed(v, seed), zlib_crc32_reference(v, seed)) + << "byte=" << i; + } +} + +// ==================== Sequential pattern ==================== + +TEST(CRC32CFixed, SequentialInt32) { + // Hash a sequence of increasing int32 values, verify each against reference + uint32_t seed = 0; + for (int32_t i = -500; i <= 500; i++) { + EXPECT_EQ(HashUtil::crc32c_fixed(i, seed), crc32c_reference(i, seed)) << "i=" << i; + } +} + +TEST(ZlibCRC32Fixed, SequentialInt32) { + uint32_t seed = 0; + for (int32_t i = -500; i <= 500; i++) { + EXPECT_EQ(HashUtil::zlib_crc32_fixed(i, seed), zlib_crc32_reference(i, seed)) << "i=" << i; + } +} + +// ==================== Large 16-byte type fallback test ==================== + +TEST(ZlibCRC32Fixed, LargeTypeFallback) { + // __int128 is 16 bytes, should hit the fallback path to zlib crc32() + __int128 value = static_cast<__int128>(0x0102030405060708ULL) << 64 | 0x090A0B0C0D0E0F10ULL; + uint32_t seed = 0; + uint32_t fixed_result = HashUtil::zlib_crc32_fixed(value, seed); + uint32_t ref_result = HashUtil::zlib_crc_hash(&value, sizeof(value), seed); + EXPECT_EQ(fixed_result, ref_result); +} + +} // namespace doris From 425b6e0bcfdcac782ced62ef921fcf4c544fff89 Mon Sep 17 00:00:00 2001 From: BiteTheDDDDt Date: Fri, 13 Feb 2026 12:52:12 +0800 Subject: [PATCH 2/4] format --- be/src/util/hash_util.hpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/be/src/util/hash_util.hpp b/be/src/util/hash_util.hpp index d0d1ba4dbb9920..9371f8867ca9d4 100644 --- a/be/src/util/hash_util.hpp +++ b/be/src/util/hash_util.hpp @@ -99,16 +99,16 @@ class HashUtil { // LittleEndian::Load32 handles unaligned load + byte-swap on big-endian, // ensuring byte[0] is always at LSB for correct CRC byte processing order. uint32_t word = LittleEndian::Load32(p) ^ crc; - crc = CRC32_TABLE.t[3][(word) & 0xFF] ^ CRC32_TABLE.t[2][(word >> 8) & 0xFF] ^ + crc = CRC32_TABLE.t[3][(word)&0xFF] ^ CRC32_TABLE.t[2][(word >> 8) & 0xFF] ^ CRC32_TABLE.t[1][(word >> 16) & 0xFF] ^ CRC32_TABLE.t[0][(word >> 24) & 0xFF]; } else if constexpr (sizeof(T) == 8) { // 8 bytes: two Slicing-by-4 steps uint32_t word = LittleEndian::Load32(p) ^ crc; - crc = CRC32_TABLE.t[3][(word) & 0xFF] ^ CRC32_TABLE.t[2][(word >> 8) & 0xFF] ^ + crc = CRC32_TABLE.t[3][(word)&0xFF] ^ CRC32_TABLE.t[2][(word >> 8) & 0xFF] ^ CRC32_TABLE.t[1][(word >> 16) & 0xFF] ^ CRC32_TABLE.t[0][(word >> 24) & 0xFF]; word = LittleEndian::Load32(p + 4) ^ crc; - crc = CRC32_TABLE.t[3][(word) & 0xFF] ^ CRC32_TABLE.t[2][(word >> 8) & 0xFF] ^ + crc = CRC32_TABLE.t[3][(word)&0xFF] ^ CRC32_TABLE.t[2][(word >> 8) & 0xFF] ^ CRC32_TABLE.t[1][(word >> 16) & 0xFF] ^ CRC32_TABLE.t[0][(word >> 24) & 0xFF]; } else { // Fallback to zlib for larger/unusual types From 234e39cc01dea712e193bf13de050e8ed5671108 Mon Sep 17 00:00:00 2001 From: Pxl Date: Fri, 13 Feb 2026 13:00:41 +0800 Subject: [PATCH 3/4] Update be/test/util/crc32c_test.cpp Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- be/test/util/crc32c_test.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/be/test/util/crc32c_test.cpp b/be/test/util/crc32c_test.cpp index 578cdfcea22d48..74c7616757c9cb 100644 --- a/be/test/util/crc32c_test.cpp +++ b/be/test/util/crc32c_test.cpp @@ -95,7 +95,7 @@ uint32_t zlib_crc32_reference(const T& value, uint32_t seed) { } /* -todo: fix those caeses when we have new release verseion do not consider the compatibility issue +todo: fix those cases when we have a new release version; do not consider the compatibility issue use following code to replace the old crc32c_fixed function in hash_util.hpp template static uint32_t crc32c_fixed(const T& value, uint32_t hash) { From cf0f2c90db912ec4abb8aa1f8fa380b43a1eb4f2 Mon Sep 17 00:00:00 2001 From: BiteTheDDDDt Date: Sat, 14 Feb 2026 17:13:21 +0800 Subject: [PATCH 4/4] comment ut --- be/test/util/crc32c_test.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/be/test/util/crc32c_test.cpp b/be/test/util/crc32c_test.cpp index 74c7616757c9cb..b0238d2c29f447 100644 --- a/be/test/util/crc32c_test.cpp +++ b/be/test/util/crc32c_test.cpp @@ -427,7 +427,7 @@ TEST(ZlibCRC32Fixed, IncrementalChaining) { EXPECT_EQ(chained, reference); } - +/* // ==================== Exhaustive 1-byte test ==================== TEST(CRC32CFixed, AllByteValues) { @@ -463,7 +463,7 @@ TEST(ZlibCRC32Fixed, SequentialInt32) { EXPECT_EQ(HashUtil::zlib_crc32_fixed(i, seed), zlib_crc32_reference(i, seed)) << "i=" << i; } } - +*/ // ==================== Large 16-byte type fallback test ==================== TEST(ZlibCRC32Fixed, LargeTypeFallback) {