aboutsummaryrefslogtreecommitdiffstats
path: root/library/cpp/string_utils/csv
diff options
context:
space:
mode:
authorDaniil Cherednik <dan.cherednik@gmail.com>2023-02-09 11:44:35 +0300
committerDaniil Cherednik <dan.cherednik@gmail.com>2023-02-09 11:46:17 +0300
commitb0967c30d3706b650b679fe119b6bd7b0924d328 (patch)
tree25579dfda238c2cc5b00324878303b3a05d09f45 /library/cpp/string_utils/csv
parent9b78acb9998e4a817a21fe60443c7c5d6a06b947 (diff)
downloadydb-470a39568672a2a0d7a4476a228e3ca3bfe4fd8a.tar.gz
Ydb stable 22-5-1022.5.10stable-22-5
x-stable-origin-commit: f696baac1a4b8d48eb52b52b35930eef6d0eab42
Diffstat (limited to 'library/cpp/string_utils/csv')
-rw-r--r--library/cpp/string_utils/csv/CMakeLists.txt17
-rw-r--r--library/cpp/string_utils/csv/csv.cpp82
-rw-r--r--library/cpp/string_utils/csv/csv.h64
3 files changed, 163 insertions, 0 deletions
diff --git a/library/cpp/string_utils/csv/CMakeLists.txt b/library/cpp/string_utils/csv/CMakeLists.txt
new file mode 100644
index 0000000000..7dffad3566
--- /dev/null
+++ b/library/cpp/string_utils/csv/CMakeLists.txt
@@ -0,0 +1,17 @@
+
+# This file was gererated by the build system used internally in the Yandex monorepo.
+# Only simple modifications are allowed (adding source-files to targets, adding simple properties
+# like target_include_directories). These modifications will be ported to original
+# ya.make files by maintainers. Any complex modifications which can't be ported back to the
+# original buildsystem will not be accepted.
+
+
+
+add_library(cpp-string_utils-csv)
+target_link_libraries(cpp-string_utils-csv PUBLIC
+ contrib-libs-cxxsupp
+ yutil
+)
+target_sources(cpp-string_utils-csv PRIVATE
+ ${CMAKE_SOURCE_DIR}/library/cpp/string_utils/csv/csv.cpp
+)
diff --git a/library/cpp/string_utils/csv/csv.cpp b/library/cpp/string_utils/csv/csv.cpp
new file mode 100644
index 0000000000..218473c62c
--- /dev/null
+++ b/library/cpp/string_utils/csv/csv.cpp
@@ -0,0 +1,82 @@
+#include "csv.h"
+
+TStringBuf NCsvFormat::CsvSplitter::Consume() {
+ if (Begin == End) {
+ return nullptr;
+ }
+ TString::iterator TokenStart = Begin;
+ TString::iterator TokenEnd = Begin;
+ if (Quote == '\0') {
+ while (1) {
+ if (TokenEnd == End || *TokenEnd == Delimeter) {
+ Begin = TokenEnd;
+ return TStringBuf(TokenStart, TokenEnd);
+ }
+ ++TokenEnd;
+ }
+ } else {
+ bool Escape = false;
+ if (*Begin == Quote) {
+ Escape = true;
+ ++TokenStart;
+ ++TokenEnd;
+ Y_ENSURE(TokenStart != End, TStringBuf("RFC4180 violation: quotation mark must be followed by something"));
+ }
+ while (1) {
+ if (TokenEnd == End || (!Escape && *TokenEnd == Delimeter)) {
+ Begin = TokenEnd;
+ return TStringBuf(TokenStart, TokenEnd);
+ } else if (*TokenEnd == Quote) {
+ Y_ENSURE(Escape, TStringBuf("RFC4180 violation: quotation mark must be in the escaped string only"));
+ if (TokenEnd + 1 == End) {
+ Begin = TokenEnd + 1;
+ } else if (*(TokenEnd + 1) == Delimeter) {
+ Begin = TokenEnd + 1;
+ } else if (*(TokenEnd + 1) == Quote) {
+ CustomStringBufs.push_back(TStringBuf(TokenStart, (TokenEnd + 1)));
+ TokenEnd += 2;
+ TokenStart = TokenEnd;
+ continue;
+ } else {
+ Y_ENSURE(false, TStringBuf("RFC4180 violation: in escaped string quotation mark must be followed by a delimiter, EOL or another quotation mark"));
+ }
+ if (CustomStringBufs.size()) {
+ CustomString.clear();
+ for (auto CustomStringBuf : CustomStringBufs) {
+ CustomString += TString{ CustomStringBuf };
+ }
+ CustomString += TString{ TStringBuf(TokenStart, TokenEnd) };
+ CustomStringBufs.clear();
+ return TStringBuf(CustomString);
+ } else {
+ return TStringBuf(TokenStart, TokenEnd);
+ }
+ }
+ ++TokenEnd;
+ }
+ }
+};
+
+TString NCsvFormat::TLinesSplitter::ConsumeLine() {
+ bool Escape = false;
+ TString result;
+ TString line;
+ while (Input.ReadLine(line)) {
+ for (auto it = line.begin(); it != line.end(); ++it) {
+ if (*it == Quote) {
+ Escape = !Escape;
+ }
+ }
+ if (!result) {
+ result = line;
+ } else {
+ result += line;
+ }
+ if (!Escape) {
+ break;
+ } else {
+ result += "\n";
+ }
+ }
+ return result;
+};
diff --git a/library/cpp/string_utils/csv/csv.h b/library/cpp/string_utils/csv/csv.h
new file mode 100644
index 0000000000..8cb96e6bb9
--- /dev/null
+++ b/library/cpp/string_utils/csv/csv.h
@@ -0,0 +1,64 @@
+#pragma once
+
+#include <util/generic/yexception.h>
+#include <util/generic/strbuf.h>
+#include <util/generic/vector.h>
+#include <util/stream/input.h>
+
+/*
+ Split string by rfc4180
+*/
+
+namespace NCsvFormat {
+ class TLinesSplitter {
+ private:
+ IInputStream& Input;
+ const char Quote;
+ public:
+ TLinesSplitter(IInputStream& input, const char quote = '"')
+ : Input(input)
+ , Quote(quote) {
+ }
+ TString ConsumeLine();
+ };
+
+ class CsvSplitter {
+ public:
+ CsvSplitter(TString& data, const char delimeter = ',', const char quote = '"')
+ // quote = '\0' ignores quoting in values and words like simple split
+ : Delimeter(delimeter)
+ , Quote(quote)
+ , Begin(data.begin())
+ , End(data.end())
+ {
+ }
+
+ bool Step() {
+ if (Begin == End) {
+ return false;
+ }
+ ++Begin;
+ return true;
+ }
+
+ TStringBuf Consume();
+ explicit operator TVector<TString>() {
+ TVector<TString> ret;
+
+ do {
+ TStringBuf buf = Consume();
+ ret.push_back(TString{buf});
+ } while (Step());
+
+ return ret;
+ }
+
+ private:
+ const char Delimeter;
+ const char Quote;
+ TString::iterator Begin;
+ const TString::const_iterator End;
+ TString CustomString;
+ TVector<TStringBuf> CustomStringBufs;
+ };
+}