diff options
author | ivanmorozov <ivanmorozov@yandex-team.com> | 2022-12-23 17:22:10 +0300 |
---|---|---|
committer | ivanmorozov <ivanmorozov@yandex-team.com> | 2022-12-23 17:22:10 +0300 |
commit | f6fd83a9bc9c2d6bb9502d3dd77adb4f5c31ceae (patch) | |
tree | cca463108bcf236b61344602b47f474ce42b1ff4 | |
parent | af6d4963aeec3543ef895865a0a24fa2e039efa3 (diff) | |
download | ydb-f6fd83a9bc9c2d6bb9502d3dd77adb4f5c31ceae.tar.gz |
parse line on deploy by option
-rw-r--r-- | library/cpp/string_utils/CMakeLists.txt | 1 | ||||
-rw-r--r-- | library/cpp/string_utils/csv/CMakeLists.darwin.txt | 17 | ||||
-rw-r--r-- | library/cpp/string_utils/csv/CMakeLists.linux-aarch64.txt | 18 | ||||
-rw-r--r-- | library/cpp/string_utils/csv/CMakeLists.linux.txt | 18 | ||||
-rw-r--r-- | library/cpp/string_utils/csv/CMakeLists.txt | 15 | ||||
-rw-r--r-- | library/cpp/string_utils/csv/csv.cpp | 82 | ||||
-rw-r--r-- | library/cpp/string_utils/csv/csv.h | 64 | ||||
-rw-r--r-- | ydb/public/lib/ydb_cli/import/CMakeLists.darwin.txt | 1 | ||||
-rw-r--r-- | ydb/public/lib/ydb_cli/import/CMakeLists.linux-aarch64.txt | 1 | ||||
-rw-r--r-- | ydb/public/lib/ydb_cli/import/CMakeLists.linux.txt | 1 | ||||
-rw-r--r-- | ydb/public/lib/ydb_cli/import/import.cpp | 25 |
11 files changed, 229 insertions, 14 deletions
diff --git a/library/cpp/string_utils/CMakeLists.txt b/library/cpp/string_utils/CMakeLists.txt index d256782733..bbdcba85d9 100644 --- a/library/cpp/string_utils/CMakeLists.txt +++ b/library/cpp/string_utils/CMakeLists.txt @@ -7,6 +7,7 @@ add_subdirectory(base64) +add_subdirectory(csv) add_subdirectory(indent_text) add_subdirectory(levenshtein_diff) add_subdirectory(parse_size) diff --git a/library/cpp/string_utils/csv/CMakeLists.darwin.txt b/library/cpp/string_utils/csv/CMakeLists.darwin.txt new file mode 100644 index 0000000000..7dffad3566 --- /dev/null +++ b/library/cpp/string_utils/csv/CMakeLists.darwin.txt @@ -0,0 +1,17 @@ + +# This file was gererated by the build system used internally in the Yandex monorepo. +# Only simple modifications are allowed (adding source-files to targets, adding simple properties +# like target_include_directories). These modifications will be ported to original +# ya.make files by maintainers. Any complex modifications which can't be ported back to the +# original buildsystem will not be accepted. + + + +add_library(cpp-string_utils-csv) +target_link_libraries(cpp-string_utils-csv PUBLIC + contrib-libs-cxxsupp + yutil +) +target_sources(cpp-string_utils-csv PRIVATE + ${CMAKE_SOURCE_DIR}/library/cpp/string_utils/csv/csv.cpp +) diff --git a/library/cpp/string_utils/csv/CMakeLists.linux-aarch64.txt b/library/cpp/string_utils/csv/CMakeLists.linux-aarch64.txt new file mode 100644 index 0000000000..1941d5ba17 --- /dev/null +++ b/library/cpp/string_utils/csv/CMakeLists.linux-aarch64.txt @@ -0,0 +1,18 @@ + +# This file was gererated by the build system used internally in the Yandex monorepo. +# Only simple modifications are allowed (adding source-files to targets, adding simple properties +# like target_include_directories). These modifications will be ported to original +# ya.make files by maintainers. Any complex modifications which can't be ported back to the +# original buildsystem will not be accepted. + + + +add_library(cpp-string_utils-csv) +target_link_libraries(cpp-string_utils-csv PUBLIC + contrib-libs-linux-headers + contrib-libs-cxxsupp + yutil +) +target_sources(cpp-string_utils-csv PRIVATE + ${CMAKE_SOURCE_DIR}/library/cpp/string_utils/csv/csv.cpp +) diff --git a/library/cpp/string_utils/csv/CMakeLists.linux.txt b/library/cpp/string_utils/csv/CMakeLists.linux.txt new file mode 100644 index 0000000000..1941d5ba17 --- /dev/null +++ b/library/cpp/string_utils/csv/CMakeLists.linux.txt @@ -0,0 +1,18 @@ + +# This file was gererated by the build system used internally in the Yandex monorepo. +# Only simple modifications are allowed (adding source-files to targets, adding simple properties +# like target_include_directories). These modifications will be ported to original +# ya.make files by maintainers. Any complex modifications which can't be ported back to the +# original buildsystem will not be accepted. + + + +add_library(cpp-string_utils-csv) +target_link_libraries(cpp-string_utils-csv PUBLIC + contrib-libs-linux-headers + contrib-libs-cxxsupp + yutil +) +target_sources(cpp-string_utils-csv PRIVATE + ${CMAKE_SOURCE_DIR}/library/cpp/string_utils/csv/csv.cpp +) diff --git a/library/cpp/string_utils/csv/CMakeLists.txt b/library/cpp/string_utils/csv/CMakeLists.txt new file mode 100644 index 0000000000..3e0811fb22 --- /dev/null +++ b/library/cpp/string_utils/csv/CMakeLists.txt @@ -0,0 +1,15 @@ + +# This file was gererated by the build system used internally in the Yandex monorepo. +# Only simple modifications are allowed (adding source-files to targets, adding simple properties +# like target_include_directories). These modifications will be ported to original +# ya.make files by maintainers. Any complex modifications which can't be ported back to the +# original buildsystem will not be accepted. + + +if (CMAKE_SYSTEM_PROCESSOR STREQUAL "aarch64" AND UNIX AND NOT APPLE AND NOT ANDROID) + include(CMakeLists.linux-aarch64.txt) +elseif (APPLE) + include(CMakeLists.darwin.txt) +elseif (CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64" AND UNIX AND NOT APPLE AND NOT ANDROID) + include(CMakeLists.linux.txt) +endif() diff --git a/library/cpp/string_utils/csv/csv.cpp b/library/cpp/string_utils/csv/csv.cpp new file mode 100644 index 0000000000..218473c62c --- /dev/null +++ b/library/cpp/string_utils/csv/csv.cpp @@ -0,0 +1,82 @@ +#include "csv.h" + +TStringBuf NCsvFormat::CsvSplitter::Consume() { + if (Begin == End) { + return nullptr; + } + TString::iterator TokenStart = Begin; + TString::iterator TokenEnd = Begin; + if (Quote == '\0') { + while (1) { + if (TokenEnd == End || *TokenEnd == Delimeter) { + Begin = TokenEnd; + return TStringBuf(TokenStart, TokenEnd); + } + ++TokenEnd; + } + } else { + bool Escape = false; + if (*Begin == Quote) { + Escape = true; + ++TokenStart; + ++TokenEnd; + Y_ENSURE(TokenStart != End, TStringBuf("RFC4180 violation: quotation mark must be followed by something")); + } + while (1) { + if (TokenEnd == End || (!Escape && *TokenEnd == Delimeter)) { + Begin = TokenEnd; + return TStringBuf(TokenStart, TokenEnd); + } else if (*TokenEnd == Quote) { + Y_ENSURE(Escape, TStringBuf("RFC4180 violation: quotation mark must be in the escaped string only")); + if (TokenEnd + 1 == End) { + Begin = TokenEnd + 1; + } else if (*(TokenEnd + 1) == Delimeter) { + Begin = TokenEnd + 1; + } else if (*(TokenEnd + 1) == Quote) { + CustomStringBufs.push_back(TStringBuf(TokenStart, (TokenEnd + 1))); + TokenEnd += 2; + TokenStart = TokenEnd; + continue; + } else { + Y_ENSURE(false, TStringBuf("RFC4180 violation: in escaped string quotation mark must be followed by a delimiter, EOL or another quotation mark")); + } + if (CustomStringBufs.size()) { + CustomString.clear(); + for (auto CustomStringBuf : CustomStringBufs) { + CustomString += TString{ CustomStringBuf }; + } + CustomString += TString{ TStringBuf(TokenStart, TokenEnd) }; + CustomStringBufs.clear(); + return TStringBuf(CustomString); + } else { + return TStringBuf(TokenStart, TokenEnd); + } + } + ++TokenEnd; + } + } +}; + +TString NCsvFormat::TLinesSplitter::ConsumeLine() { + bool Escape = false; + TString result; + TString line; + while (Input.ReadLine(line)) { + for (auto it = line.begin(); it != line.end(); ++it) { + if (*it == Quote) { + Escape = !Escape; + } + } + if (!result) { + result = line; + } else { + result += line; + } + if (!Escape) { + break; + } else { + result += "\n"; + } + } + return result; +}; diff --git a/library/cpp/string_utils/csv/csv.h b/library/cpp/string_utils/csv/csv.h new file mode 100644 index 0000000000..8cb96e6bb9 --- /dev/null +++ b/library/cpp/string_utils/csv/csv.h @@ -0,0 +1,64 @@ +#pragma once + +#include <util/generic/yexception.h> +#include <util/generic/strbuf.h> +#include <util/generic/vector.h> +#include <util/stream/input.h> + +/* + Split string by rfc4180 +*/ + +namespace NCsvFormat { + class TLinesSplitter { + private: + IInputStream& Input; + const char Quote; + public: + TLinesSplitter(IInputStream& input, const char quote = '"') + : Input(input) + , Quote(quote) { + } + TString ConsumeLine(); + }; + + class CsvSplitter { + public: + CsvSplitter(TString& data, const char delimeter = ',', const char quote = '"') + // quote = '\0' ignores quoting in values and words like simple split + : Delimeter(delimeter) + , Quote(quote) + , Begin(data.begin()) + , End(data.end()) + { + } + + bool Step() { + if (Begin == End) { + return false; + } + ++Begin; + return true; + } + + TStringBuf Consume(); + explicit operator TVector<TString>() { + TVector<TString> ret; + + do { + TStringBuf buf = Consume(); + ret.push_back(TString{buf}); + } while (Step()); + + return ret; + } + + private: + const char Delimeter; + const char Quote; + TString::iterator Begin; + const TString::const_iterator End; + TString CustomString; + TVector<TStringBuf> CustomStringBufs; + }; +} diff --git a/ydb/public/lib/ydb_cli/import/CMakeLists.darwin.txt b/ydb/public/lib/ydb_cli/import/CMakeLists.darwin.txt index 7e7520d171..8febe5a8f0 100644 --- a/ydb/public/lib/ydb_cli/import/CMakeLists.darwin.txt +++ b/ydb/public/lib/ydb_cli/import/CMakeLists.darwin.txt @@ -16,6 +16,7 @@ target_link_libraries(lib-ydb_cli-import PUBLIC cpp-client-ydb_proto public-lib-json_value libs-apache-arrow + cpp-string_utils-csv ) target_sources(lib-ydb_cli-import PRIVATE ${CMAKE_SOURCE_DIR}/ydb/public/lib/ydb_cli/import/import.cpp diff --git a/ydb/public/lib/ydb_cli/import/CMakeLists.linux-aarch64.txt b/ydb/public/lib/ydb_cli/import/CMakeLists.linux-aarch64.txt index 3d4f15bb1f..df15e4f250 100644 --- a/ydb/public/lib/ydb_cli/import/CMakeLists.linux-aarch64.txt +++ b/ydb/public/lib/ydb_cli/import/CMakeLists.linux-aarch64.txt @@ -17,6 +17,7 @@ target_link_libraries(lib-ydb_cli-import PUBLIC cpp-client-ydb_proto public-lib-json_value libs-apache-arrow + cpp-string_utils-csv ) target_sources(lib-ydb_cli-import PRIVATE ${CMAKE_SOURCE_DIR}/ydb/public/lib/ydb_cli/import/import.cpp diff --git a/ydb/public/lib/ydb_cli/import/CMakeLists.linux.txt b/ydb/public/lib/ydb_cli/import/CMakeLists.linux.txt index 3d4f15bb1f..df15e4f250 100644 --- a/ydb/public/lib/ydb_cli/import/CMakeLists.linux.txt +++ b/ydb/public/lib/ydb_cli/import/CMakeLists.linux.txt @@ -17,6 +17,7 @@ target_link_libraries(lib-ydb_cli-import PUBLIC cpp-client-ydb_proto public-lib-json_value libs-apache-arrow + cpp-string_utils-csv ) target_sources(lib-ydb_cli-import PRIVATE ${CMAKE_SOURCE_DIR}/ydb/public/lib/ydb_cli/import/import.cpp diff --git a/ydb/public/lib/ydb_cli/import/import.cpp b/ydb/public/lib/ydb_cli/import/import.cpp index dce6e5ae5b..160ff81100 100644 --- a/ydb/public/lib/ydb_cli/import/import.cpp +++ b/ydb/public/lib/ydb_cli/import/import.cpp @@ -19,6 +19,7 @@ #include <deque> +#include <library/cpp/string_utils/csv/csv.h> #include <contrib/libs/apache/arrow/cpp/src/arrow/api.h> #include <contrib/libs/apache/arrow/cpp/src/arrow/io/api.h> #include <contrib/libs/apache/arrow/cpp/src/arrow/ipc/api.h> @@ -134,7 +135,6 @@ TAsyncStatus TImportFileClient::UpsertCsvBuffer(const TString& dbPath, const TSt TStatus TImportFileClient::UpsertCsv(IInputStream& input, const TString& dbPath, const TImportFileSettings& settings) { - TString line; TString buffer; Ydb::Formats::CsvSettings csvSettings; @@ -150,20 +150,21 @@ TStatus TImportFileClient::UpsertCsv(IInputStream& input, const TString& dbPath, special = true; } - // Do not use csvSettings.skip_rows. - for (ui32 i = 0; i < settings.SkipRows_; ++i) { - input.ReadLine(line); - } - + NCsvFormat::TLinesSplitter splitter(input); TString headerRow; if (settings.Header_) { - input.ReadLine(headerRow); + headerRow = splitter.ConsumeLine(); headerRow += '\n'; buffer = headerRow; csvSettings.set_header(true); special = true; } + // Do not use csvSettings.skip_rows. + for (ui32 i = 0; i < settings.SkipRows_; ++i) { + splitter.ConsumeLine(); + } + if (special) { TString formatSettings; Y_PROTOBUF_SUPPRESS_NODISCARD csvSettings.SerializeToString(&formatSettings); @@ -172,18 +173,14 @@ TStatus TImportFileClient::UpsertCsv(IInputStream& input, const TString& dbPath, std::deque<TAsyncStatus> inFlightRequests; - // TODO: better read - // * read serveral lines a time - // * support endlines inside quotes - // ReadLine() should count quotes for it and stop the line then counter is odd. ui32 idx = 0; ui64 readSize = 0; const ui32 mb100 = 1 << 27; ui64 nextBorder = mb100; - while (size_t sz = input.ReadLine(line)) { + while (TString line = splitter.ConsumeLine()) { buffer += line; - buffer += '\n'; // TODO: keep original endline? - readSize += sz; + buffer += '\n'; + readSize += line.size(); ++idx; if (readSize >= nextBorder && RetrySettings.Verbose_) { nextBorder += mb100; |