aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorivanmorozov <ivanmorozov@yandex-team.com>2022-12-23 17:22:10 +0300
committerivanmorozov <ivanmorozov@yandex-team.com>2022-12-23 17:22:10 +0300
commitf6fd83a9bc9c2d6bb9502d3dd77adb4f5c31ceae (patch)
treecca463108bcf236b61344602b47f474ce42b1ff4
parentaf6d4963aeec3543ef895865a0a24fa2e039efa3 (diff)
downloadydb-f6fd83a9bc9c2d6bb9502d3dd77adb4f5c31ceae.tar.gz
parse line on deploy by option
-rw-r--r--library/cpp/string_utils/CMakeLists.txt1
-rw-r--r--library/cpp/string_utils/csv/CMakeLists.darwin.txt17
-rw-r--r--library/cpp/string_utils/csv/CMakeLists.linux-aarch64.txt18
-rw-r--r--library/cpp/string_utils/csv/CMakeLists.linux.txt18
-rw-r--r--library/cpp/string_utils/csv/CMakeLists.txt15
-rw-r--r--library/cpp/string_utils/csv/csv.cpp82
-rw-r--r--library/cpp/string_utils/csv/csv.h64
-rw-r--r--ydb/public/lib/ydb_cli/import/CMakeLists.darwin.txt1
-rw-r--r--ydb/public/lib/ydb_cli/import/CMakeLists.linux-aarch64.txt1
-rw-r--r--ydb/public/lib/ydb_cli/import/CMakeLists.linux.txt1
-rw-r--r--ydb/public/lib/ydb_cli/import/import.cpp25
11 files changed, 229 insertions, 14 deletions
diff --git a/library/cpp/string_utils/CMakeLists.txt b/library/cpp/string_utils/CMakeLists.txt
index d256782733..bbdcba85d9 100644
--- a/library/cpp/string_utils/CMakeLists.txt
+++ b/library/cpp/string_utils/CMakeLists.txt
@@ -7,6 +7,7 @@
add_subdirectory(base64)
+add_subdirectory(csv)
add_subdirectory(indent_text)
add_subdirectory(levenshtein_diff)
add_subdirectory(parse_size)
diff --git a/library/cpp/string_utils/csv/CMakeLists.darwin.txt b/library/cpp/string_utils/csv/CMakeLists.darwin.txt
new file mode 100644
index 0000000000..7dffad3566
--- /dev/null
+++ b/library/cpp/string_utils/csv/CMakeLists.darwin.txt
@@ -0,0 +1,17 @@
+
+# This file was generated by the build system used internally in the Yandex monorepo.
+# Only simple modifications are allowed (adding source-files to targets, adding simple properties
+# like target_include_directories). These modifications will be ported to original
+# ya.make files by maintainers. Any complex modifications which can't be ported back to the
+# original buildsystem will not be accepted.
+
+
+
+add_library(cpp-string_utils-csv)
+target_link_libraries(cpp-string_utils-csv PUBLIC
+ contrib-libs-cxxsupp
+ yutil
+)
+target_sources(cpp-string_utils-csv PRIVATE
+ ${CMAKE_SOURCE_DIR}/library/cpp/string_utils/csv/csv.cpp
+)
diff --git a/library/cpp/string_utils/csv/CMakeLists.linux-aarch64.txt b/library/cpp/string_utils/csv/CMakeLists.linux-aarch64.txt
new file mode 100644
index 0000000000..1941d5ba17
--- /dev/null
+++ b/library/cpp/string_utils/csv/CMakeLists.linux-aarch64.txt
@@ -0,0 +1,18 @@
+
+# This file was generated by the build system used internally in the Yandex monorepo.
+# Only simple modifications are allowed (adding source-files to targets, adding simple properties
+# like target_include_directories). These modifications will be ported to original
+# ya.make files by maintainers. Any complex modifications which can't be ported back to the
+# original buildsystem will not be accepted.
+
+
+
+add_library(cpp-string_utils-csv)
+target_link_libraries(cpp-string_utils-csv PUBLIC
+ contrib-libs-linux-headers
+ contrib-libs-cxxsupp
+ yutil
+)
+target_sources(cpp-string_utils-csv PRIVATE
+ ${CMAKE_SOURCE_DIR}/library/cpp/string_utils/csv/csv.cpp
+)
diff --git a/library/cpp/string_utils/csv/CMakeLists.linux.txt b/library/cpp/string_utils/csv/CMakeLists.linux.txt
new file mode 100644
index 0000000000..1941d5ba17
--- /dev/null
+++ b/library/cpp/string_utils/csv/CMakeLists.linux.txt
@@ -0,0 +1,18 @@
+
+# This file was generated by the build system used internally in the Yandex monorepo.
+# Only simple modifications are allowed (adding source-files to targets, adding simple properties
+# like target_include_directories). These modifications will be ported to original
+# ya.make files by maintainers. Any complex modifications which can't be ported back to the
+# original buildsystem will not be accepted.
+
+
+
+add_library(cpp-string_utils-csv)
+target_link_libraries(cpp-string_utils-csv PUBLIC
+ contrib-libs-linux-headers
+ contrib-libs-cxxsupp
+ yutil
+)
+target_sources(cpp-string_utils-csv PRIVATE
+ ${CMAKE_SOURCE_DIR}/library/cpp/string_utils/csv/csv.cpp
+)
diff --git a/library/cpp/string_utils/csv/CMakeLists.txt b/library/cpp/string_utils/csv/CMakeLists.txt
new file mode 100644
index 0000000000..3e0811fb22
--- /dev/null
+++ b/library/cpp/string_utils/csv/CMakeLists.txt
@@ -0,0 +1,15 @@
+
+# This file was generated by the build system used internally in the Yandex monorepo.
+# Only simple modifications are allowed (adding source-files to targets, adding simple properties
+# like target_include_directories). These modifications will be ported to original
+# ya.make files by maintainers. Any complex modifications which can't be ported back to the
+# original buildsystem will not be accepted.
+
+
+if (CMAKE_SYSTEM_PROCESSOR STREQUAL "aarch64" AND UNIX AND NOT APPLE AND NOT ANDROID)
+ include(CMakeLists.linux-aarch64.txt)
+elseif (APPLE)
+ include(CMakeLists.darwin.txt)
+elseif (CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64" AND UNIX AND NOT APPLE AND NOT ANDROID)
+ include(CMakeLists.linux.txt)
+endif()
diff --git a/library/cpp/string_utils/csv/csv.cpp b/library/cpp/string_utils/csv/csv.cpp
new file mode 100644
index 0000000000..218473c62c
--- /dev/null
+++ b/library/cpp/string_utils/csv/csv.cpp
@@ -0,0 +1,82 @@
+#include "csv.h"
+
+TStringBuf NCsvFormat::CsvSplitter::Consume() { // Returns the field that starts at Begin and moves Begin to the delimiter (or End) that follows it.
+    if (Begin == End) { // Input exhausted: nothing left to parse.
+        return nullptr; // Presumably converts to an empty TStringBuf -- confirm against util/generic/strbuf.h.
+    }
+    TString::iterator TokenStart = Begin; // First character of the field (or of its current segment).
+    TString::iterator TokenEnd = Begin; // Scan cursor; ends up one past the last character of the field.
+    if (Quote == '\0') { // Quoting disabled: plain split on Delimeter.
+        while (1) {
+            if (TokenEnd == End || *TokenEnd == Delimeter) {
+                Begin = TokenEnd; // Begin is left ON the delimiter; Step() is what skips it.
+                return TStringBuf(TokenStart, TokenEnd);
+            }
+            ++TokenEnd;
+        }
+    } else {
+        bool Escape = false; // True while scanning a field that opened with Quote.
+        if (*Begin == Quote) { // Quoted field: its content starts after the opening quote.
+            Escape = true;
+            ++TokenStart;
+            ++TokenEnd;
+            Y_ENSURE(TokenStart != End, TStringBuf("RFC4180 violation: quotation mark must be followed by something")); // A lone quote at the very end of the input is rejected.
+        }
+        while (1) {
+            if (TokenEnd == End || (!Escape && *TokenEnd == Delimeter)) { // End of input, or a delimiter outside quotes: the field ends at TokenEnd.
+                Begin = TokenEnd;
+                return TStringBuf(TokenStart, TokenEnd); // NOTE(review): a quoted field that reaches End without a closing quote raises no error here, and segments already queued in CustomStringBufs are neither returned nor cleared.
+            } else if (*TokenEnd == Quote) {
+                Y_ENSURE(Escape, TStringBuf("RFC4180 violation: quotation mark must be in the escaped string only")); // A quote inside an unquoted field is rejected.
+                if (TokenEnd + 1 == End) { // Closing quote is the last character of the input.
+                    Begin = TokenEnd + 1;
+                } else if (*(TokenEnd + 1) == Delimeter) { // Closing quote followed by a delimiter; Begin is left on that delimiter.
+                    Begin = TokenEnd + 1;
+                } else if (*(TokenEnd + 1) == Quote) { // Doubled quote: an escaped quote character inside the field.
+                    CustomStringBufs.push_back(TStringBuf(TokenStart, (TokenEnd + 1))); // Queue the segment up to and including the first quote of the pair.
+                    TokenEnd += 2; // Skip both quotes of the pair.
+                    TokenStart = TokenEnd; // The next segment starts right after the pair.
+                    continue;
+                } else {
+                    Y_ENSURE(false, TStringBuf("RFC4180 violation: in escaped string quotation mark must be followed by a delimiter, EOL or another quotation mark")); // Any other character after a quote inside a quoted field is rejected.
+                }
+                if (CustomStringBufs.size()) { // The field contained escaped quotes: it has to be assembled from its segments.
+                    CustomString.clear();
+                    for (auto CustomStringBuf : CustomStringBufs) {
+                        CustomString += TString{ CustomStringBuf };
+                    }
+                    CustomString += TString{ TStringBuf(TokenStart, TokenEnd) }; // Tail after the last escaped quote, without the closing quote.
+                    CustomStringBufs.clear();
+                    return TStringBuf(CustomString); // View into the member CustomString: a later call that takes this path rebuilds it, so copy the result before then.
+                } else {
+                    return TStringBuf(TokenStart, TokenEnd); // No escaped quotes: view directly into the source string, quotes excluded.
+                }
+            }
+            ++TokenEnd;
+        }
+    }
+}; // The ';' after the function body is a redundant empty declaration.
+
+TString NCsvFormat::TLinesSplitter::ConsumeLine() { // Reads lines from Input and joins them with "\n" until the Quote characters seen so far are balanced; returns the joined record.
+    bool Escape = false; // True while an odd number of Quote characters has been seen, i.e. we are inside a quoted field.
+    TString result;
+    TString line;
+    while (Input.ReadLine(line)) { // Loop ends when ReadLine returns 0 -- presumably end of stream; confirm against util/stream/input.h.
+        for (auto it = line.begin(); it != line.end(); ++it) {
+            if (*it == Quote) { // Every quote toggles the state; a doubled (escaped) quote toggles twice and leaves it unchanged.
+                Escape = !Escape;
+            }
+        }
+        if (!result) {
+            result = line;
+        } else {
+            result += line;
+        }
+        if (!Escape) { // Quotes are balanced: the logical record is complete.
+            break;
+        } else {
+            result += "\n"; // Still inside quotes: the line break belongs to the field, so it is put back as "\n" before the next line is appended.
+        }
+    }
+    return result; // NOTE(review): an empty result does not tell the caller whether the input ended or a blank line was read; if the input ends inside quotes the result keeps a trailing "\n".
+}; // The ';' after the function body is a redundant empty declaration.
diff --git a/library/cpp/string_utils/csv/csv.h b/library/cpp/string_utils/csv/csv.h
new file mode 100644
index 0000000000..8cb96e6bb9
--- /dev/null
+++ b/library/cpp/string_utils/csv/csv.h
@@ -0,0 +1,64 @@
+#pragma once
+
+#include <util/generic/yexception.h>
+#include <util/generic/strbuf.h>
+#include <util/generic/vector.h>
+#include <util/stream/input.h>
+
+/*
+    Split CSV lines and fields according to RFC 4180
+*/
+
+namespace NCsvFormat {
+    class TLinesSplitter { // Reads logical CSV records from a stream; a record may span several lines when a line break sits inside quotes.
+    private:
+        IInputStream& Input; // Held by reference: the stream must outlive this splitter.
+        const char Quote; // Character whose occurrences are counted to detect an open quoted field.
+    public:
+        TLinesSplitter(IInputStream& input, const char quote = '"') // NOTE(review): callable with one argument and not marked explicit, so it allows implicit conversion from IInputStream&.
+            : Input(input)
+            , Quote(quote) {
+        }
+        TString ConsumeLine(); // Returns the next record; defined in csv.cpp.
+    };
+
+    class CsvSplitter { // Splits one CSV record into fields; alternate Consume() and Step() to walk it, or convert to TVector<TString> to get all fields at once.
+    public:
+        CsvSplitter(TString& data, const char delimeter = ',', const char quote = '"') // Stores iterators into `data`, so `data` must stay alive and unmodified while the splitter is used.
+        // quote = '\0' ignores quoting in values and works like a simple split
+            : Delimeter(delimeter)
+            , Quote(quote)
+            , Begin(data.begin())
+            , End(data.end())
+        {
+        }
+
+        bool Step() { // Moves past the delimiter that Consume() stopped on; returns false once the input is exhausted.
+            if (Begin == End) {
+                return false;
+            }
+            ++Begin;
+            return true;
+        }
+
+        TStringBuf Consume(); // Returns the field starting at the current position; defined in csv.cpp.
+        explicit operator TVector<TString>() { // Collects every remaining field as an owned string.
+            TVector<TString> ret;
+
+            do { // do/while: at least one element is always produced, and a trailing delimiter yields a final empty field.
+                TStringBuf buf = Consume();
+                ret.push_back(TString{buf}); // Copy the view: it may point into CustomString, which a later Consume() can rebuild.
+            } while (Step());
+
+            return ret;
+        }
+
+    private:
+        const char Delimeter; // Field separator.
+        const char Quote; // Quote character; '\0' disables quote handling in Consume().
+        TString::iterator Begin; // Current parse position inside the source string.
+        const TString::const_iterator End; // One past the last character of the source string.
+        TString CustomString; // Storage for a field that had escaped (doubled) quotes collapsed.
+        TVector<TStringBuf> CustomStringBufs; // Segments of such a field, queued until its closing quote is found.
+    };
+}
diff --git a/ydb/public/lib/ydb_cli/import/CMakeLists.darwin.txt b/ydb/public/lib/ydb_cli/import/CMakeLists.darwin.txt
index 7e7520d171..8febe5a8f0 100644
--- a/ydb/public/lib/ydb_cli/import/CMakeLists.darwin.txt
+++ b/ydb/public/lib/ydb_cli/import/CMakeLists.darwin.txt
@@ -16,6 +16,7 @@ target_link_libraries(lib-ydb_cli-import PUBLIC
cpp-client-ydb_proto
public-lib-json_value
libs-apache-arrow
+ cpp-string_utils-csv
)
target_sources(lib-ydb_cli-import PRIVATE
${CMAKE_SOURCE_DIR}/ydb/public/lib/ydb_cli/import/import.cpp
diff --git a/ydb/public/lib/ydb_cli/import/CMakeLists.linux-aarch64.txt b/ydb/public/lib/ydb_cli/import/CMakeLists.linux-aarch64.txt
index 3d4f15bb1f..df15e4f250 100644
--- a/ydb/public/lib/ydb_cli/import/CMakeLists.linux-aarch64.txt
+++ b/ydb/public/lib/ydb_cli/import/CMakeLists.linux-aarch64.txt
@@ -17,6 +17,7 @@ target_link_libraries(lib-ydb_cli-import PUBLIC
cpp-client-ydb_proto
public-lib-json_value
libs-apache-arrow
+ cpp-string_utils-csv
)
target_sources(lib-ydb_cli-import PRIVATE
${CMAKE_SOURCE_DIR}/ydb/public/lib/ydb_cli/import/import.cpp
diff --git a/ydb/public/lib/ydb_cli/import/CMakeLists.linux.txt b/ydb/public/lib/ydb_cli/import/CMakeLists.linux.txt
index 3d4f15bb1f..df15e4f250 100644
--- a/ydb/public/lib/ydb_cli/import/CMakeLists.linux.txt
+++ b/ydb/public/lib/ydb_cli/import/CMakeLists.linux.txt
@@ -17,6 +17,7 @@ target_link_libraries(lib-ydb_cli-import PUBLIC
cpp-client-ydb_proto
public-lib-json_value
libs-apache-arrow
+ cpp-string_utils-csv
)
target_sources(lib-ydb_cli-import PRIVATE
${CMAKE_SOURCE_DIR}/ydb/public/lib/ydb_cli/import/import.cpp
diff --git a/ydb/public/lib/ydb_cli/import/import.cpp b/ydb/public/lib/ydb_cli/import/import.cpp
index dce6e5ae5b..160ff81100 100644
--- a/ydb/public/lib/ydb_cli/import/import.cpp
+++ b/ydb/public/lib/ydb_cli/import/import.cpp
@@ -19,6 +19,7 @@
#include <deque>
+#include <library/cpp/string_utils/csv/csv.h>
#include <contrib/libs/apache/arrow/cpp/src/arrow/api.h>
#include <contrib/libs/apache/arrow/cpp/src/arrow/io/api.h>
#include <contrib/libs/apache/arrow/cpp/src/arrow/ipc/api.h>
@@ -134,7 +135,6 @@ TAsyncStatus TImportFileClient::UpsertCsvBuffer(const TString& dbPath, const TSt
TStatus TImportFileClient::UpsertCsv(IInputStream& input, const TString& dbPath,
const TImportFileSettings& settings) {
- TString line;
TString buffer;
Ydb::Formats::CsvSettings csvSettings;
@@ -150,20 +150,21 @@ TStatus TImportFileClient::UpsertCsv(IInputStream& input, const TString& dbPath,
special = true;
}
- // Do not use csvSettings.skip_rows.
- for (ui32 i = 0; i < settings.SkipRows_; ++i) {
- input.ReadLine(line);
- }
-
+ NCsvFormat::TLinesSplitter splitter(input);
TString headerRow;
if (settings.Header_) {
- input.ReadLine(headerRow);
+ headerRow = splitter.ConsumeLine();
headerRow += '\n';
buffer = headerRow;
csvSettings.set_header(true);
special = true;
}
+ // Do not use csvSettings.skip_rows.
+ for (ui32 i = 0; i < settings.SkipRows_; ++i) {
+ splitter.ConsumeLine();
+ }
+
if (special) {
TString formatSettings;
Y_PROTOBUF_SUPPRESS_NODISCARD csvSettings.SerializeToString(&formatSettings);
@@ -172,18 +173,14 @@ TStatus TImportFileClient::UpsertCsv(IInputStream& input, const TString& dbPath,
std::deque<TAsyncStatus> inFlightRequests;
- // TODO: better read
- // * read serveral lines a time
- // * support endlines inside quotes
- // ReadLine() should count quotes for it and stop the line then counter is odd.
ui32 idx = 0;
ui64 readSize = 0;
const ui32 mb100 = 1 << 27;
ui64 nextBorder = mb100;
- while (size_t sz = input.ReadLine(line)) {
+ while (TString line = splitter.ConsumeLine()) {
buffer += line;
- buffer += '\n'; // TODO: keep original endline?
- readSize += sz;
+ buffer += '\n';
+ readSize += line.size();
++idx;
if (readSize >= nextBorder && RetrySettings.Verbose_) {
nextBorder += mb100;