diff options
author | Daniil Cherednik <dan.cherednik@gmail.com> | 2023-02-09 11:44:35 +0300 |
---|---|---|
committer | Daniil Cherednik <dan.cherednik@gmail.com> | 2023-02-09 11:46:17 +0300 |
commit | b0967c30d3706b650b679fe119b6bd7b0924d328 (patch) | |
tree | 25579dfda238c2cc5b00324878303b3a05d09f45 /library/cpp/string_utils | |
parent | 9b78acb9998e4a817a21fe60443c7c5d6a06b947 (diff) | |
download | ydb-b0967c30d3706b650b679fe119b6bd7b0924d328.tar.gz |
Ydb stable 22-5-10 (refs: 22.5.10, stable-22-5)
x-stable-origin-commit: f696baac1a4b8d48eb52b52b35930eef6d0eab42
Diffstat (limited to 'library/cpp/string_utils')
-rw-r--r-- | library/cpp/string_utils/CMakeLists.txt | 1 | ||||
-rw-r--r-- | library/cpp/string_utils/csv/CMakeLists.txt | 17 | ||||
-rw-r--r-- | library/cpp/string_utils/csv/csv.cpp | 82 | ||||
-rw-r--r-- | library/cpp/string_utils/csv/csv.h | 64 |
4 files changed, 164 insertions, 0 deletions
diff --git a/library/cpp/string_utils/CMakeLists.txt b/library/cpp/string_utils/CMakeLists.txt index d256782733..bbdcba85d9 100644 --- a/library/cpp/string_utils/CMakeLists.txt +++ b/library/cpp/string_utils/CMakeLists.txt @@ -7,6 +7,7 @@ add_subdirectory(base64) +add_subdirectory(csv) add_subdirectory(indent_text) add_subdirectory(levenshtein_diff) add_subdirectory(parse_size) diff --git a/library/cpp/string_utils/csv/CMakeLists.txt b/library/cpp/string_utils/csv/CMakeLists.txt new file mode 100644 index 0000000000..7dffad3566 --- /dev/null +++ b/library/cpp/string_utils/csv/CMakeLists.txt @@ -0,0 +1,17 @@ + +# This file was generated by the build system used internally in the Yandex monorepo. +# Only simple modifications are allowed (adding source-files to targets, adding simple properties +# like target_include_directories). These modifications will be ported to original +# ya.make files by maintainers. Any complex modifications which can't be ported back to the +# original buildsystem will not be accepted. 
+ + + +add_library(cpp-string_utils-csv) +target_link_libraries(cpp-string_utils-csv PUBLIC + contrib-libs-cxxsupp + yutil +) +target_sources(cpp-string_utils-csv PRIVATE + ${CMAKE_SOURCE_DIR}/library/cpp/string_utils/csv/csv.cpp +) diff --git a/library/cpp/string_utils/csv/csv.cpp b/library/cpp/string_utils/csv/csv.cpp new file mode 100644 index 0000000000..218473c62c --- /dev/null +++ b/library/cpp/string_utils/csv/csv.cpp @@ -0,0 +1,82 @@ +#include "csv.h" + +TStringBuf NCsvFormat::CsvSplitter::Consume() { + if (Begin == End) { + return nullptr; + } + TString::iterator TokenStart = Begin; + TString::iterator TokenEnd = Begin; + if (Quote == '\0') { + while (1) { + if (TokenEnd == End || *TokenEnd == Delimeter) { + Begin = TokenEnd; + return TStringBuf(TokenStart, TokenEnd); + } + ++TokenEnd; + } + } else { + bool Escape = false; + if (*Begin == Quote) { + Escape = true; + ++TokenStart; + ++TokenEnd; + Y_ENSURE(TokenStart != End, TStringBuf("RFC4180 violation: quotation mark must be followed by something")); + } + while (1) { + if (TokenEnd == End || (!Escape && *TokenEnd == Delimeter)) { + Begin = TokenEnd; + return TStringBuf(TokenStart, TokenEnd); + } else if (*TokenEnd == Quote) { + Y_ENSURE(Escape, TStringBuf("RFC4180 violation: quotation mark must be in the escaped string only")); + if (TokenEnd + 1 == End) { + Begin = TokenEnd + 1; + } else if (*(TokenEnd + 1) == Delimeter) { + Begin = TokenEnd + 1; + } else if (*(TokenEnd + 1) == Quote) { + CustomStringBufs.push_back(TStringBuf(TokenStart, (TokenEnd + 1))); + TokenEnd += 2; + TokenStart = TokenEnd; + continue; + } else { + Y_ENSURE(false, TStringBuf("RFC4180 violation: in escaped string quotation mark must be followed by a delimiter, EOL or another quotation mark")); + } + if (CustomStringBufs.size()) { + CustomString.clear(); + for (auto CustomStringBuf : CustomStringBufs) { + CustomString += TString{ CustomStringBuf }; + } + CustomString += TString{ TStringBuf(TokenStart, TokenEnd) }; + 
CustomStringBufs.clear(); + return TStringBuf(CustomString); + } else { + return TStringBuf(TokenStart, TokenEnd); + } + } + ++TokenEnd; + } + } +}; + +TString NCsvFormat::TLinesSplitter::ConsumeLine() { + bool Escape = false; + TString result; + TString line; + while (Input.ReadLine(line)) { + for (auto it = line.begin(); it != line.end(); ++it) { + if (*it == Quote) { + Escape = !Escape; + } + } + if (!result) { + result = line; + } else { + result += line; + } + if (!Escape) { + break; + } else { + result += "\n"; + } + } + return result; +}; diff --git a/library/cpp/string_utils/csv/csv.h b/library/cpp/string_utils/csv/csv.h new file mode 100644 index 0000000000..8cb96e6bb9 --- /dev/null +++ b/library/cpp/string_utils/csv/csv.h @@ -0,0 +1,64 @@ +#pragma once + +#include <util/generic/yexception.h> +#include <util/generic/strbuf.h> +#include <util/generic/vector.h> +#include <util/stream/input.h> + +/* + Split string by rfc4180 +*/ + +namespace NCsvFormat { + class TLinesSplitter { + private: + IInputStream& Input; + const char Quote; + public: + TLinesSplitter(IInputStream& input, const char quote = '"') + : Input(input) + , Quote(quote) { + } + TString ConsumeLine(); + }; + + class CsvSplitter { + public: + CsvSplitter(TString& data, const char delimeter = ',', const char quote = '"') + // quote = '\0' ignores quoting in values and words like simple split + : Delimeter(delimeter) + , Quote(quote) + , Begin(data.begin()) + , End(data.end()) + { + } + + bool Step() { + if (Begin == End) { + return false; + } + ++Begin; + return true; + } + + TStringBuf Consume(); + explicit operator TVector<TString>() { + TVector<TString> ret; + + do { + TStringBuf buf = Consume(); + ret.push_back(TString{buf}); + } while (Step()); + + return ret; + } + + private: + const char Delimeter; + const char Quote; + TString::iterator Begin; + const TString::const_iterator End; + TString CustomString; + TVector<TStringBuf> CustomStringBufs; + }; +} |