diff options
author | daredevil2002 <daredevil2002@yandex-team.com> | 2023-08-22 21:19:48 +0300 |
---|---|---|
committer | daredevil2002 <daredevil2002@yandex-team.com> | 2023-08-22 22:14:27 +0300 |
commit | 07ad0583b3f39038a4f7fe2ce3d4b8109cd1bb0e (patch) | |
tree | b30e2ff9935ed6ca3e2425824dda8d093dd1d3fc /library/cpp/string_utils/base32 | |
parent | 16b49a7ff6a0740a19bab5650934824d3d7030f4 (diff) | |
download | ydb-07ad0583b3f39038a4f7fe2ce3d4b8109cd1bb0e.tar.gz |
[yql/udfs/string] Add base32
Добавил в String UDF поддержку base32
Diffstat (limited to 'library/cpp/string_utils/base32')
-rw-r--r-- | library/cpp/string_utils/base32/CMakeLists.darwin-x86_64.txt | 17 | ||||
-rw-r--r-- | library/cpp/string_utils/base32/CMakeLists.linux-aarch64.txt | 18 | ||||
-rw-r--r-- | library/cpp/string_utils/base32/CMakeLists.linux-x86_64.txt | 18 | ||||
-rw-r--r-- | library/cpp/string_utils/base32/CMakeLists.txt | 17 | ||||
-rw-r--r-- | library/cpp/string_utils/base32/CMakeLists.windows-x86_64.txt | 17 | ||||
-rw-r--r-- | library/cpp/string_utils/base32/base32.cpp | 170 | ||||
-rw-r--r-- | library/cpp/string_utils/base32/base32.h | 120 | ||||
-rw-r--r-- | library/cpp/string_utils/base32/fuzz/main.cpp | 14 | ||||
-rw-r--r-- | library/cpp/string_utils/base32/fuzz/ya.make | 11 | ||||
-rw-r--r-- | library/cpp/string_utils/base32/ut/base32_ut.cpp | 79 | ||||
-rw-r--r-- | library/cpp/string_utils/base32/ut/ya.make | 12 | ||||
-rw-r--r-- | library/cpp/string_utils/base32/ya.make | 12 |
12 files changed, 505 insertions, 0 deletions
diff --git a/library/cpp/string_utils/base32/CMakeLists.darwin-x86_64.txt b/library/cpp/string_utils/base32/CMakeLists.darwin-x86_64.txt new file mode 100644 index 0000000000..0d9db28b47 --- /dev/null +++ b/library/cpp/string_utils/base32/CMakeLists.darwin-x86_64.txt @@ -0,0 +1,17 @@ + +# This file was generated by the build system used internally in the Yandex monorepo. +# Only simple modifications are allowed (adding source-files to targets, adding simple properties +# like target_include_directories). These modifications will be ported to original +# ya.make files by maintainers. Any complex modifications which can't be ported back to the +# original buildsystem will not be accepted. + + + +add_library(cpp-string_utils-base32) +target_link_libraries(cpp-string_utils-base32 PUBLIC + contrib-libs-cxxsupp + yutil +) +target_sources(cpp-string_utils-base32 PRIVATE + ${CMAKE_SOURCE_DIR}/library/cpp/string_utils/base32/base32.cpp +) diff --git a/library/cpp/string_utils/base32/CMakeLists.linux-aarch64.txt b/library/cpp/string_utils/base32/CMakeLists.linux-aarch64.txt new file mode 100644 index 0000000000..fa250d193c --- /dev/null +++ b/library/cpp/string_utils/base32/CMakeLists.linux-aarch64.txt @@ -0,0 +1,18 @@ + +# This file was generated by the build system used internally in the Yandex monorepo. +# Only simple modifications are allowed (adding source-files to targets, adding simple properties +# like target_include_directories). These modifications will be ported to original +# ya.make files by maintainers. Any complex modifications which can't be ported back to the +# original buildsystem will not be accepted. 
+ + + +add_library(cpp-string_utils-base32) +target_link_libraries(cpp-string_utils-base32 PUBLIC + contrib-libs-linux-headers + contrib-libs-cxxsupp + yutil +) +target_sources(cpp-string_utils-base32 PRIVATE + ${CMAKE_SOURCE_DIR}/library/cpp/string_utils/base32/base32.cpp +) diff --git a/library/cpp/string_utils/base32/CMakeLists.linux-x86_64.txt b/library/cpp/string_utils/base32/CMakeLists.linux-x86_64.txt new file mode 100644 index 0000000000..fa250d193c --- /dev/null +++ b/library/cpp/string_utils/base32/CMakeLists.linux-x86_64.txt @@ -0,0 +1,18 @@ + +# This file was generated by the build system used internally in the Yandex monorepo. +# Only simple modifications are allowed (adding source-files to targets, adding simple properties +# like target_include_directories). These modifications will be ported to original +# ya.make files by maintainers. Any complex modifications which can't be ported back to the +# original buildsystem will not be accepted. + + + +add_library(cpp-string_utils-base32) +target_link_libraries(cpp-string_utils-base32 PUBLIC + contrib-libs-linux-headers + contrib-libs-cxxsupp + yutil +) +target_sources(cpp-string_utils-base32 PRIVATE + ${CMAKE_SOURCE_DIR}/library/cpp/string_utils/base32/base32.cpp +) diff --git a/library/cpp/string_utils/base32/CMakeLists.txt b/library/cpp/string_utils/base32/CMakeLists.txt new file mode 100644 index 0000000000..f8b31df0c1 --- /dev/null +++ b/library/cpp/string_utils/base32/CMakeLists.txt @@ -0,0 +1,17 @@ + +# This file was generated by the build system used internally in the Yandex monorepo. +# Only simple modifications are allowed (adding source-files to targets, adding simple properties +# like target_include_directories). These modifications will be ported to original +# ya.make files by maintainers. Any complex modifications which can't be ported back to the +# original buildsystem will not be accepted. 
+ + +if (CMAKE_SYSTEM_NAME STREQUAL "Linux" AND CMAKE_SYSTEM_PROCESSOR STREQUAL "aarch64" AND NOT HAVE_CUDA) + include(CMakeLists.linux-aarch64.txt) +elseif (CMAKE_SYSTEM_NAME STREQUAL "Darwin" AND CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64") + include(CMakeLists.darwin-x86_64.txt) +elseif (WIN32 AND CMAKE_SYSTEM_PROCESSOR STREQUAL "AMD64" AND NOT HAVE_CUDA) + include(CMakeLists.windows-x86_64.txt) +elseif (CMAKE_SYSTEM_NAME STREQUAL "Linux" AND CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64" AND NOT HAVE_CUDA) + include(CMakeLists.linux-x86_64.txt) +endif() diff --git a/library/cpp/string_utils/base32/CMakeLists.windows-x86_64.txt b/library/cpp/string_utils/base32/CMakeLists.windows-x86_64.txt new file mode 100644 index 0000000000..0d9db28b47 --- /dev/null +++ b/library/cpp/string_utils/base32/CMakeLists.windows-x86_64.txt @@ -0,0 +1,17 @@ + +# This file was generated by the build system used internally in the Yandex monorepo. +# Only simple modifications are allowed (adding source-files to targets, adding simple properties +# like target_include_directories). These modifications will be ported to original +# ya.make files by maintainers. Any complex modifications which can't be ported back to the +# original buildsystem will not be accepted. 
+ + + +add_library(cpp-string_utils-base32) +target_link_libraries(cpp-string_utils-base32 PUBLIC + contrib-libs-cxxsupp + yutil +) +target_sources(cpp-string_utils-base32 PRIVATE + ${CMAKE_SOURCE_DIR}/library/cpp/string_utils/base32/base32.cpp +) diff --git a/library/cpp/string_utils/base32/base32.cpp b/library/cpp/string_utils/base32/base32.cpp new file mode 100644 index 0000000000..64730fe716 --- /dev/null +++ b/library/cpp/string_utils/base32/base32.cpp @@ -0,0 +1,170 @@ +#include "base32.h" + +#include <util/generic/yexception.h> + +#include <algorithm> +#include <array> +#include <limits> + +namespace { + + // RFC 4648 Base32 alphabet + // + // A 9 J 18 S 27 3 + // 1 B 10 K 19 T 28 4 + // 2 C 11 L 20 U 29 5 + // 3 D 12 M 21 V 30 6 + // 4 E 13 N 22 W 31 7 + // 5 F 14 O 23 X + // 6 G 15 P 24 Y + // 7 H 16 Q 25 Z + // 8 I 17 R 26 2 pad = + + constexpr std::string_view BASE32_TABLE = "ABCDEFGHIJKLMNOPQRSTUVWXYZ" + "234567"; + + constexpr uint8_t BAD = 0xff; + + // clang-format off + constexpr std::array<uint8_t, 256> BASE32_DECODE_TABLE = {{ + BAD, BAD, BAD, BAD, BAD, BAD, BAD, BAD, BAD, BAD, + BAD, BAD, BAD, BAD, BAD, BAD, BAD, BAD, BAD, BAD, + BAD, BAD, BAD, BAD, BAD, BAD, BAD, BAD, BAD, BAD, + BAD, BAD, BAD, BAD, BAD, BAD, BAD, BAD, BAD, BAD, + BAD, BAD, BAD, BAD, BAD, BAD, BAD, BAD, BAD, BAD, + 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, BAD, BAD, BAD, BAD, + BAD, BAD, BAD, BAD, BAD, 0x0, 0x1, 0x2, 0x3, 0x4, + 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, + 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, + 0x19, BAD, BAD, BAD, BAD, BAD, BAD, BAD, BAD, BAD, + BAD, BAD, BAD, BAD, BAD, BAD, BAD, BAD, BAD, BAD, + BAD, BAD, BAD, BAD, BAD, BAD, BAD, BAD, BAD, BAD, + BAD, BAD, BAD, BAD, BAD, BAD, BAD, BAD, BAD, BAD, + BAD, BAD, BAD, BAD, BAD, BAD, BAD, BAD, BAD, BAD, + BAD, BAD, BAD, BAD, BAD, BAD, BAD, BAD, BAD, BAD, + BAD, BAD, BAD, BAD, BAD, BAD, BAD, BAD, BAD, BAD, + BAD, BAD, BAD, BAD, BAD, BAD, BAD, BAD, BAD, BAD, + BAD, BAD, BAD, BAD, BAD, BAD, BAD, 
BAD, BAD, BAD, + BAD, BAD, BAD, BAD, BAD, BAD, BAD, BAD, BAD, BAD, + BAD, BAD, BAD, BAD, BAD, BAD, BAD, BAD, BAD, BAD, + BAD, BAD, BAD, BAD, BAD, BAD, BAD, BAD, BAD, BAD, + BAD, BAD, BAD, BAD, BAD, BAD, BAD, BAD, BAD, BAD, + BAD, BAD, BAD, BAD, BAD, BAD, BAD, BAD, BAD, BAD, + BAD, BAD, BAD, BAD, BAD, BAD, BAD, BAD, BAD, BAD, + BAD, BAD, BAD, BAD, BAD, BAD, BAD, BAD, BAD, BAD, + BAD, BAD, BAD, BAD, BAD, BAD, + }}; + // clang-format on + + char encodeBits(unsigned char sz) { + static_assert(static_cast<size_t>(std::numeric_limits<decltype(sz)>::max()) < BASE32_DECODE_TABLE.size()); + return BASE32_TABLE[sz]; + } + + uint8_t decodeChar(unsigned char ch, bool isStrict) { + if (uint8_t val = BASE32_DECODE_TABLE[ch]; val != BAD) { + return val; + } + + if (isStrict) { + ythrow yexception() << "Error during decode symbol from Base32: character is not in Base32 set"; + } + return 0; + } + + void shiftBitsFrom(unsigned char& dst, const unsigned char& src, size_t i, size_t n) { + unsigned char m = ((src << i) & 0xFF) >> (8 - n); + dst = dst << n; + dst |= m; + } + + size_t Base32DecodeImpl(std::string_view src, char* dst, bool isStrict) { + if (src.empty()) { + return 0; + } + + size_t dstSize = 0; + size_t bitIndex = 0; + unsigned char byte = 0; + + for (auto c = src.cbegin(); c != src.cend(); ++c) { + if (*c == '=') { + Y_ENSURE( + !isStrict || std::all_of(c, src.cend(), [](char pad) { return pad == '='; }), + "Unexpected character after padding"); + break; + } + + uint8_t octet = decodeChar(*c, isStrict) << 3; + byte = byte | (octet >> bitIndex); + + size_t bitsWritten = std::min<size_t>(5, 8 - bitIndex); + if (bitsWritten < 5 || (bitIndex + bitsWritten) == 8) { + dst[dstSize++] = byte; + byte = (octet << bitsWritten); + } + bitIndex = (bitIndex + 5) % 8; + } + + // For example, correct encoding of \x00 is + // AA====== (\b0000'0000\b00xx'xxxx), not a + // AAA= (\b0000'0000\b0000'000x) + size_t lastOctetBitsCount = (dstSize * 8) % 5; + size_t expectedBitIndex = 
(lastOctetBitsCount == 0) ? 0 : 5 - lastOctetBitsCount; + Y_ENSURE(!isStrict || (byte == 0 && bitIndex == expectedBitIndex), "Invalid Base32 string format"); + return dstSize; + } + +} // namespace + +size_t Base32Encode(std::string_view src, char* dst) { + if (src.size() == 0) { + return 0; + } + + size_t dstSize = 0; + size_t curInd = 0; + unsigned char c = src[curInd]; + unsigned char bitInd = 0; + unsigned char n = 0; + + unsigned char ind = 0; + while (curInd < src.size()) { + ind = 0; + + unsigned char bitProcessed = 0; + while (bitProcessed < 5) { + n = std::min<unsigned char>(8 - bitInd, 5 - bitProcessed); + + shiftBitsFrom(ind, c, bitInd, n); + + bitProcessed += n; + + if (bitInd + n >= 8) { + ++curInd; + if (curInd == src.size()) { + c = 0; + } else { + c = src[curInd]; + } + } + bitInd = (bitInd + n) % 8; + } + dst[dstSize++] = encodeBits(ind); + } + + const size_t paddingSize = ((8 - (dstSize & 7)) & 7); + for (size_t i = 0; i < paddingSize; ++i) { + dst[dstSize++] = '='; + } + + return dstSize; +} + +size_t Base32Decode(std::string_view src, char* dst) { + return Base32DecodeImpl(src, dst, /*isStrict*/ false); +} + +size_t Base32StrictDecode(std::string_view src, char* dst) { + return Base32DecodeImpl(src, dst, /*isStrict*/ true); +} diff --git a/library/cpp/string_utils/base32/base32.h b/library/cpp/string_utils/base32/base32.h new file mode 100644 index 0000000000..2596bf78a2 --- /dev/null +++ b/library/cpp/string_utils/base32/base32.h @@ -0,0 +1,120 @@ +#pragma once + +#include <util/generic/string.h> + +#include <string> +#include <string_view> + +// Base32 encoding based on RFC 4648 alphabet (incompatible with Crockford and Geohash alphabet) +// https://en.wikipedia.org/wiki/Base32#RFC_4648_Base32_alphabet + +/// +/// @return Size of the buffer required to decode Base32 encoded data of size `len`. 
+/// +constexpr size_t Base32DecodeBufSize(size_t len) noexcept { + return (len * 5 + 7) / 8; +} + +/// +/// @brief Decodes only valid Base32 string, behaviour for invalid data is unspecified. +/// +/// @param src a base32 encoded string. +/// @param dst a pointer to allocated memory for writing result. +/// +/// @return Count of written bytes. +/// +size_t Base32Decode(std::string_view src, char* dst); + +/// +/// @param src a base32 encoded string. +/// @param dst a decoded string. +/// +inline void Base32Decode(std::string_view src, std::string& dst) +{ + ::ResizeUninitialized(dst, Base32DecodeBufSize(src.size())); + dst.resize(Base32Decode(src, dst.data())); +} + +/// +/// @param s a base32 encoded string. +/// +/// @returns a decoded string. +/// +inline std::string Base32Decode(std::string_view s) +{ + std::string ret; + Base32Decode(s, ret); + return ret; +} + +/// +/// @brief Decodes Base32 string with strict verification of invalid symbols, +/// and rejects any character other than '=' after the first padding symbol. +/// +/// @throws Throws exceptions on inputs which contain invalid symbols or incorrect padding. +/// +/// @param src a base32 encoded string. +/// @param dst a pointer to allocated memory for writing result. +/// +/// @return Count of written bytes. +/// +size_t Base32StrictDecode(std::string_view src, char* dst); + +/// +/// @param src a base32 encoded string. +/// @param dst a decoded string. +/// +inline void Base32StrictDecode(std::string_view src, std::string& dst) +{ + ::ResizeUninitialized(dst, Base32DecodeBufSize(src.size())); + dst.resize(Base32StrictDecode(src, dst.data())); +} + +/// +/// @param s a base32 encoded string. +/// +/// @returns a decoded string. +/// +inline std::string Base32StrictDecode(std::string_view s) +{ + std::string ret; + Base32StrictDecode(s, ret); + return ret; +} + +/// +/// @return Size of the buffer required to encode Base32 decoded data of size `len`. 
+/// +constexpr size_t Base32EncodeBufSize(size_t len) noexcept { + return ((len * 8 + 4) / 5 + 7) / 8 * 8; +} + +/// +/// @param src a base32 decoded string. +/// @param dst a pointer to allocated memory for writing result. +/// +/// @return Count of written bytes. +/// +size_t Base32Encode(std::string_view src, char* dst); + +/// +/// @param src a base32 decoded string. +/// @param dst an encoded string. +/// +inline void Base32Encode(std::string_view src, std::string& dst) +{ + ::ResizeUninitialized(dst, Base32EncodeBufSize(src.size())); + dst.resize(Base32Encode(src, dst.data())); +} + +/// +/// @param s a base32 decoded string. +/// +/// @returns an encoded string. +/// +inline std::string Base32Encode(std::string_view s) +{ + std::string ret; + Base32Encode(s, ret); + return ret; +} diff --git a/library/cpp/string_utils/base32/fuzz/main.cpp b/library/cpp/string_utils/base32/fuzz/main.cpp new file mode 100644 index 0000000000..f2255c5aa0 --- /dev/null +++ b/library/cpp/string_utils/base32/fuzz/main.cpp @@ -0,0 +1,14 @@ +#include <library/cpp/string_utils/base32/base32.h> + +#include <util/system/types.h> +#include <util/system/yassert.h> + +extern "C" int LLVMFuzzerTestOneInput(const ui8* data, size_t size) +{ + const std::string_view example{reinterpret_cast<const char*>(data), size}; + const auto converted = Base32StrictDecode(Base32Encode(example)); + + Y_VERIFY(example == converted); + + return 0; +} diff --git a/library/cpp/string_utils/base32/fuzz/ya.make b/library/cpp/string_utils/base32/fuzz/ya.make new file mode 100644 index 0000000000..4777386b99 --- /dev/null +++ b/library/cpp/string_utils/base32/fuzz/ya.make @@ -0,0 +1,11 @@ +FUZZ() + +PEERDIR( + library/cpp/string_utils/base32 +) + +SRC( + main.cpp +) + +END() diff --git a/library/cpp/string_utils/base32/ut/base32_ut.cpp b/library/cpp/string_utils/base32/ut/base32_ut.cpp new file mode 100644 index 0000000000..fdd2be41b1 --- /dev/null +++ b/library/cpp/string_utils/base32/ut/base32_ut.cpp @@ -0,0 
+1,79 @@ +#include <library/cpp/string_utils/base32/base32.h> + +#include <library/cpp/testing/gtest/gtest.h> + +#include <util/generic/yexception.h> + +#include <vector> + +namespace { + + static const std::vector<std::string> TEST_DATA = { + {}, + "\x00", + "\x01\x02", + "\x01\x02\x03", + "\x03\x02\x01", + "\x10\x20\x30\x40", + "\x10\x20\x30\x40\x50", + "\xFF\xFF\xFF", + "\xFF\xFE\xFD\xFC\xFB\xFA", + }; + +} // namespace + +TEST(base32, encode) +{ + EXPECT_EQ(Base32Encode({}), ""); + EXPECT_EQ(Base32Encode({"\x00\x40", 2}), "ABAA===="); + EXPECT_EQ(Base32Encode("apple"), "MFYHA3DF"); + EXPECT_EQ(Base32Encode("TestTest"), "KRSXG5CUMVZXI==="); + EXPECT_EQ(Base32Encode("1234567890"), "GEZDGNBVGY3TQOJQ"); +} + +TEST(base32, decode_strict) +{ + EXPECT_EQ(Base32StrictDecode(""), ""); + EXPECT_EQ(Base32StrictDecode("MFYHA3DF"), "apple"); + + EXPECT_EQ(Base32StrictDecode("KRSXG5CUMVZXI"), "TestTest"); + EXPECT_EQ(Base32StrictDecode("KRSXG5CUMVZXI==="), "TestTest"); + EXPECT_THROW(Base32StrictDecode("KRSXG5CUMVZXI=======A"), yexception); + + EXPECT_EQ(Base32StrictDecode("AA======"), std::string(1, '\x00')); + EXPECT_THROW(Base32StrictDecode("AAA="), yexception); + + EXPECT_EQ(Base32StrictDecode("AE======"), "\x01"); + EXPECT_THROW(Base32StrictDecode("AB======"), yexception); // "\x00\x40" + + EXPECT_THROW(Base32StrictDecode("invalid"), yexception); + EXPECT_THROW(Base32StrictDecode("\xFF\xFF"), yexception); + EXPECT_THROW(Base32StrictDecode(std::string_view{"A\0", 2}), yexception); +} + +TEST(base32, decode) +{ + EXPECT_EQ(Base32Decode(""), ""); + EXPECT_EQ(Base32Decode("MFYHA3DF"), "apple"); + + EXPECT_EQ(Base32Decode("KRSXG5CUMVZXI"), "TestTest"); + EXPECT_EQ(Base32Decode("KRSXG5CUMVZXI==="), "TestTest"); + EXPECT_NO_THROW(Base32Decode("KRSXG5CUMVZXI=======A")); + + EXPECT_EQ(Base32Decode("AA======"), std::string(1, '\x00')); + EXPECT_NO_THROW(Base32Decode("AAA=")); + + EXPECT_EQ(Base32Decode("AE======"), "\x01"); + EXPECT_NO_THROW(Base32Decode("AB======")); // 
"\x00\x40" + + EXPECT_NO_THROW(Base32Decode("invalid")); + EXPECT_NO_THROW(Base32Decode("\xFF\xFF")); + EXPECT_NO_THROW(Base32Decode(std::string_view{"A\0", 2})); +} + +TEST(base32, encode_decode) +{ + for (const auto& data : TEST_DATA) { + EXPECT_THAT(Base32StrictDecode(Base32Encode(data)), ::testing::ContainerEq(data)); + } +} diff --git a/library/cpp/string_utils/base32/ut/ya.make b/library/cpp/string_utils/base32/ut/ya.make new file mode 100644 index 0000000000..6d24455ab7 --- /dev/null +++ b/library/cpp/string_utils/base32/ut/ya.make @@ -0,0 +1,12 @@ +GTEST() + +PEERDIR( + library/cpp/string_utils/base32 + library/cpp/testing/gtest +) + +SRCS( + base32_ut.cpp +) + +END() diff --git a/library/cpp/string_utils/base32/ya.make b/library/cpp/string_utils/base32/ya.make new file mode 100644 index 0000000000..0185db727e --- /dev/null +++ b/library/cpp/string_utils/base32/ya.make @@ -0,0 +1,12 @@ +LIBRARY() + +SRCS( + base32.cpp +) + +END() + +RECURSE( + fuzz + ut +) |