diff options
author | bbiff <bbiff@yandex-team.com> | 2022-09-05 18:22:03 +0300 |
---|---|---|
committer | bbiff <bbiff@yandex-team.com> | 2022-09-05 18:22:03 +0300 |
commit | 22f2084eabbc0f21f87664a2ac99d4c2357892e9 (patch) | |
tree | 9c33567512df1390b633478e29d9ad74d1b4efac | |
parent | 403e692d99369c6a0feeab2d690adefcd69a13b4 (diff) | |
download | ydb-22f2084eabbc0f21f87664a2ac99d4c2357892e9.tar.gz |
csv parsing issues
6 files changed, 23 insertions, 7 deletions
diff --git a/ydb/library/yql/udfs/common/clickhouse/client/src/IO/ReadHelpers.cpp b/ydb/library/yql/udfs/common/clickhouse/client/src/IO/ReadHelpers.cpp index ba5b5542bb..016c9cb69c 100644 --- a/ydb/library/yql/udfs/common/clickhouse/client/src/IO/ReadHelpers.cpp +++ b/ydb/library/yql/udfs/common/clickhouse/client/src/IO/ReadHelpers.cpp @@ -93,10 +93,10 @@ void NO_INLINE throwAtAssertionFailed(const char * s, ReadBuffer & buf) throw ParsingException(out.str(), ErrorCodes::CANNOT_PARSE_INPUT_ASSERTION_FAILED); } -void NO_INLINE throwTypeParseFailed(int column) +void NO_INLINE throwTypeParseFailed(const String& column_name, size_t row_num) { WriteBufferFromOwnString out; - out << "Failed to parse type in column " << column << " of csv"; + out << "failed to parse data in column `" << column_name << "' from row " << row_num << ", probably data type differs from specified in schema"; throw ParsingException(out.str(), ErrorCodes::CANNOT_PARSE_INPUT_ASSERTION_FAILED); } diff --git a/ydb/library/yql/udfs/common/clickhouse/client/src/IO/ReadHelpers.h b/ydb/library/yql/udfs/common/clickhouse/client/src/IO/ReadHelpers.h index 9dff0c9419..e88d3d7cf5 100644 --- a/ydb/library/yql/udfs/common/clickhouse/client/src/IO/ReadHelpers.h +++ b/ydb/library/yql/udfs/common/clickhouse/client/src/IO/ReadHelpers.h @@ -163,7 +163,7 @@ void assertEOF(ReadBuffer & buf); [[noreturn]] void throwAtAssertionFailed(const char * s, ReadBuffer & buf); -[[noreturn]] void throwTypeParseFailed(int column); +[[noreturn]] void throwTypeParseFailed(const String & column, size_t row_num); inline void assertChar(char symbol, ReadBuffer & buf) diff --git a/ydb/library/yql/udfs/common/clickhouse/client/src/Processors/Formats/IRowInputFormat.cpp b/ydb/library/yql/udfs/common/clickhouse/client/src/Processors/Formats/IRowInputFormat.cpp index 8e5f18a3cb..c804d92970 100644 --- a/ydb/library/yql/udfs/common/clickhouse/client/src/Processors/Formats/IRowInputFormat.cpp +++ b/ydb/library/yql/udfs/common/clickhouse/client/src/Processors/Formats/IRowInputFormat.cpp @@ -73,6 +73,7 @@ Chunk IRowInputFormat::generate() { try { + info.current_row = total_rows; ++total_rows; info.read_columns.clear(); diff --git a/ydb/library/yql/udfs/common/clickhouse/client/src/Processors/Formats/IRowInputFormat.h b/ydb/library/yql/udfs/common/clickhouse/client/src/Processors/Formats/IRowInputFormat.h index 1e79e904d1..7b64c49583 100644 --- a/ydb/library/yql/udfs/common/clickhouse/client/src/Processors/Formats/IRowInputFormat.h +++ b/ydb/library/yql/udfs/common/clickhouse/client/src/Processors/Formats/IRowInputFormat.h @@ -17,6 +17,7 @@ struct RowReadExtension /// IRowInputFormat::read output. It contains non zero for columns that actually read from the source and zero otherwise. /// It's used to attach defaults for partially filled rows. std::vector<UInt8> read_columns; + size_t current_row = 0; }; /// Common parameters for generating blocks. diff --git a/ydb/library/yql/udfs/common/clickhouse/client/src/Processors/Formats/Impl/CSVRowInputFormat.cpp b/ydb/library/yql/udfs/common/clickhouse/client/src/Processors/Formats/Impl/CSVRowInputFormat.cpp index 9b86f9565d..002b1f8688 100644 --- a/ydb/library/yql/udfs/common/clickhouse/client/src/Processors/Formats/Impl/CSVRowInputFormat.cpp +++ b/ydb/library/yql/udfs/common/clickhouse/client/src/Processors/Formats/Impl/CSVRowInputFormat.cpp @@ -36,13 +36,17 @@ CSVRowInputFormat::CSVRowInputFormat(const Block & header_, ReadBuffer & in_, co data_types.resize(num_columns); column_indexes_by_names.reserve(num_columns); + names_by_column_indexes.resize(num_columns); + is_required_columns.resize(num_columns); + auto columns = sample.cloneEmptyColumns(); for (size_t i = 0; i < num_columns; ++i) { const auto & column_info = sample.getByPosition(i); - + is_required_columns[i] = !columns[i]->isNullable(); data_types[i] = column_info.type; column_indexes_by_names.emplace(column_info.name, i); + names_by_column_indexes[i] = column_info.name; } } @@ -221,6 +225,13 @@ void CSVRowInputFormat::readPrefix() } } + for (size_t i = 0; i < column_mapping->read_columns.size(); i++) + { + if (!column_mapping->read_columns[i] && is_required_columns[i]) + { + throw Exception(String("Column `") + names_by_column_indexes[i] + "` is marked as not null, but was not found in the csv file", ErrorCodes::INCORRECT_DATA); + } + } return; } else @@ -261,6 +272,10 @@ bool CSVRowInputFormat::readRow(MutableColumns & columns, RowReadExtension & ext if (!ext.read_columns[*table_column]) have_default_columns = true; skipWhitespacesAndTabs(in); + + if (!checkTypeValidness(in, delimiter, is_last_file_column)) { + throwTypeParseFailed(names_by_column_indexes[*table_column], ext.current_row); + } } else { @@ -269,9 +284,6 @@ bool CSVRowInputFormat::readRow(MutableColumns & columns, RowReadExtension & ext readCSVString(tmp, in, format_settings.csv); } - if (!checkTypeValidness(in, delimiter, is_last_file_column)) { - throwTypeParseFailed(file_column); - } skipDelimiter(in, delimiter, is_last_file_column); } diff --git a/ydb/library/yql/udfs/common/clickhouse/client/src/Processors/Formats/Impl/CSVRowInputFormat.h b/ydb/library/yql/udfs/common/clickhouse/client/src/Processors/Formats/Impl/CSVRowInputFormat.h index 8845149b56..1f357966cb 100644 --- a/ydb/library/yql/udfs/common/clickhouse/client/src/Processors/Formats/Impl/CSVRowInputFormat.h +++ b/ydb/library/yql/udfs/common/clickhouse/client/src/Processors/Formats/Impl/CSVRowInputFormat.h @@ -37,6 +37,8 @@ private: DataTypes data_types; using IndexesMap = std::unordered_map<String, size_t>; IndexesMap column_indexes_by_names; + std::vector<String> names_by_column_indexes; + std::vector<bool> is_required_columns; void addInputColumn(const String & column_name); |