diff options
author | ivanmorozov <ivanmorozov@ydb.tech> | 2023-12-07 17:25:56 +0300 |
---|---|---|
committer | ivanmorozov <ivanmorozov@ydb.tech> | 2023-12-07 18:11:27 +0300 |
commit | bfe35496dd4ccfe4c31566a7de9180513e800c8f (patch) | |
tree | dc1a65883698241236bc003e0bcf33cded0c555d | |
parent | 96f8591f14a18413954744a99956939fc47a6304 (diff) | |
download | ydb-bfe35496dd4ccfe4c31566a7de9180513e800c8f.tar.gz |
efficiency comparision for dict encoding in case different compressors
-rw-r--r-- | ydb/core/formats/arrow/ut/ut_dictionary.cpp | 121 |
1 files changed, 100 insertions, 21 deletions
diff --git a/ydb/core/formats/arrow/ut/ut_dictionary.cpp b/ydb/core/formats/arrow/ut/ut_dictionary.cpp index 9b186fd1ea..c723eb1957 100644 --- a/ydb/core/formats/arrow/ut/ut_dictionary.cpp +++ b/ydb/core/formats/arrow/ut/ut_dictionary.cpp @@ -25,35 +25,114 @@ Y_UNIT_TEST_SUITE(Dictionary) { return data.size(); } + class TTestConfig { + private: + const ui32 BatchSize; + const ui32 PoolSize; + const ui32 StrLen; + const arrow::Compression::type Codec; + const bool Dict; + arrow::ipc::IpcWriteOptions Options = arrow::ipc::IpcWriteOptions::Defaults(); + YDB_READONLY(ui64, Result, 0); + public: + TTestConfig(const ui32 batchSize, const ui32 poolSize, const ui32 strLen, const arrow::Compression::type codec, const bool dict) + : BatchSize(batchSize) + , PoolSize(poolSize) + , StrLen(strLen) + , Codec(codec) + , Dict(dict) + { + Options.codec = *arrow::util::Codec::Create(Codec); + NConstruction::IArrayBuilder::TPtr column; + if (Dict) { + column = std::make_shared<NConstruction::TDictionaryArrayConstructor<NConstruction::TStringPoolFiller>>( + "field", NConstruction::TStringPoolFiller(PoolSize, StrLen)); + } else { + column = std::make_shared<NConstruction::TSimpleArrayConstructor<NConstruction::TStringPoolFiller>>( + "field", NConstruction::TStringPoolFiller(PoolSize, StrLen)); + } + Result = Test(column, Options, BatchSize); + } + + TString GetRowTitle() const { + TStringBuilder sb; + sb << (Options.codec ? Options.codec->name() : "NO_CODEC") << "("; + sb << "poolsize=" << PoolSize << ";" << "keylen=" << StrLen << ")"; + return sb; + } + + TString GetColTitle() const { + TStringBuilder sb; + sb << BatchSize << (Dict ? "d" : "") << ";"; + return sb; + } + }; + + TString AlignString(const TString& base, const ui32 len) { + if (base.size() > len) { + return base.substr(0, len); + } else if (base.size() < len) { + auto result = base; + for (ui32 i = 0; i < len - base.size(); ++i) { + result += ' '; + } + return result; + } + return base; + } + Y_UNIT_TEST(Simple) { - const std::vector<arrow::Compression::type> codecs = { arrow::Compression::UNCOMPRESSED, arrow::Compression::LZ4_FRAME, }; + const std::vector<arrow::Compression::type> codecs = { arrow::Compression::UNCOMPRESSED, arrow::Compression::LZ4_FRAME, arrow::Compression::ZSTD }; + std::vector<TTestConfig> configs; + std::map<TString, std::map<TString, double>> testResults; for (auto&& codec : codecs) { - arrow::ipc::IpcWriteOptions options = arrow::ipc::IpcWriteOptions::Defaults(); - options.codec = *arrow::util::Codec::Create(codec); - Cerr << (options.codec ? options.codec->name() : "NO_CODEC") << Endl; - for (auto bSize : { 100000 }) { - Cerr << "--" << bSize << Endl; + for (auto bSize : { 10000, 100000 }) { for (auto pSize : { 1, 16, 64, 128, 512, 1024 }) { - Cerr << "----" << pSize << Endl; for (auto&& strLen : { 1, 10, 16, 32, 64 }) { - Cerr << "------" << strLen << Endl; - ui64 bytesDict; - ui64 bytesRaw; - { - NConstruction::IArrayBuilder::TPtr column = std::make_shared<NConstruction::TDictionaryArrayConstructor<NConstruction::TStringPoolFiller>>( - "field", NConstruction::TStringPoolFiller(pSize, strLen)); - bytesDict = Test(column, options, bSize); - } - { - NConstruction::IArrayBuilder::TPtr column = std::make_shared<NConstruction::TSimpleArrayConstructor<NConstruction::TStringPoolFiller>>( - "field", NConstruction::TStringPoolFiller(pSize, strLen)); - bytesRaw = Test(column, options, bSize); - } - Cerr << "--------" << bytesDict << " / " << bytesRaw << " = " << 1.0 * bytesDict / bytesRaw << Endl; + configs.emplace_back(bSize, pSize, strLen, codec, false); + const auto col1 = configs.back().GetColTitle(); + const auto val1 = configs.back().GetResult(); +// testResults[configs.back().GetRowTitle()][configs.back().GetColTitle()] = configs.back().GetResult(); + configs.emplace_back(bSize, pSize, strLen, codec, true); + const auto col2 = configs.back().GetColTitle(); + const auto val2 = configs.back().GetResult(); +// testResults[configs.back().GetRowTitle()][configs.back().GetColTitle()] = configs.back().GetResult(); + testResults[configs.back().GetRowTitle()][col2 + "/" + col1] = 1.0 * val2 / val1; } } } } + std::map<TString, ui32> colLength; + for (auto&& r : testResults) { + colLength["names"] = std::max<ui32>(colLength["names"], r.first.size()); + } + + for (auto&& r : testResults) { + for (auto&& [cName, cVal] : r.second) { + colLength[cName] = std::max<ui32>(colLength[cName], ::ToString(cVal).size()); + colLength[cName] = std::max<ui32>(colLength[cName], cName.size()); + } + } + + { + TStringBuilder sb; + sb << AlignString("", colLength["names"] + 5); + for (auto&& c : testResults.begin()->second) { + sb << AlignString(c.first, colLength[c.first] + 5); + } + Cerr << sb << Endl; + } + + { + for (auto&& r : testResults) { + TStringBuilder sb; + sb << AlignString(r.first, colLength["names"] + 5); + for (auto&& c : r.second) { + sb << AlignString(::ToString(c.second), colLength[c.first] + 5); + } + Cerr << sb << Endl; + } + } } Y_UNIT_TEST(ComparePayloadAndFull) { |