about summary refs log tree commit diff stats
diff options
context:
space:
mode:
author bbiff <bbiff@yandex-team.com> 2022-09-05 18:22:03 +0300
committer bbiff <bbiff@yandex-team.com> 2022-09-05 18:22:03 +0300
commit 22f2084eabbc0f21f87664a2ac99d4c2357892e9 (patch)
tree 9c33567512df1390b633478e29d9ad74d1b4efac
parent 403e692d99369c6a0feeab2d690adefcd69a13b4 (diff)
download ydb-22f2084eabbc0f21f87664a2ac99d4c2357892e9.tar.gz
csv parsing issues
-rw-r--r-- ydb/library/yql/udfs/common/clickhouse/client/src/IO/ReadHelpers.cpp 4
-rw-r--r-- ydb/library/yql/udfs/common/clickhouse/client/src/IO/ReadHelpers.h 2
-rw-r--r-- ydb/library/yql/udfs/common/clickhouse/client/src/Processors/Formats/IRowInputFormat.cpp 1
-rw-r--r-- ydb/library/yql/udfs/common/clickhouse/client/src/Processors/Formats/IRowInputFormat.h 1
-rw-r--r-- ydb/library/yql/udfs/common/clickhouse/client/src/Processors/Formats/Impl/CSVRowInputFormat.cpp 20
-rw-r--r-- ydb/library/yql/udfs/common/clickhouse/client/src/Processors/Formats/Impl/CSVRowInputFormat.h 2
6 files changed, 23 insertions, 7 deletions
diff --git a/ydb/library/yql/udfs/common/clickhouse/client/src/IO/ReadHelpers.cpp b/ydb/library/yql/udfs/common/clickhouse/client/src/IO/ReadHelpers.cpp
index ba5b5542bb..016c9cb69c 100644
--- a/ydb/library/yql/udfs/common/clickhouse/client/src/IO/ReadHelpers.cpp
+++ b/ydb/library/yql/udfs/common/clickhouse/client/src/IO/ReadHelpers.cpp
@@ -93,10 +93,10 @@ void NO_INLINE throwAtAssertionFailed(const char * s, ReadBuffer & buf)
throw ParsingException(out.str(), ErrorCodes::CANNOT_PARSE_INPUT_ASSERTION_FAILED);
}
-void NO_INLINE throwTypeParseFailed(int column)
+void NO_INLINE throwTypeParseFailed(const String& column_name, size_t row_num)
{
WriteBufferFromOwnString out;
- out << "Failed to parse type in column " << column << " of csv";
+ out << "failed to parse data in column `" << column_name << "' from row " << row_num << ", probably data type differs from specified in schema";
throw ParsingException(out.str(), ErrorCodes::CANNOT_PARSE_INPUT_ASSERTION_FAILED);
}
diff --git a/ydb/library/yql/udfs/common/clickhouse/client/src/IO/ReadHelpers.h b/ydb/library/yql/udfs/common/clickhouse/client/src/IO/ReadHelpers.h
index 9dff0c9419..e88d3d7cf5 100644
--- a/ydb/library/yql/udfs/common/clickhouse/client/src/IO/ReadHelpers.h
+++ b/ydb/library/yql/udfs/common/clickhouse/client/src/IO/ReadHelpers.h
@@ -163,7 +163,7 @@ void assertEOF(ReadBuffer & buf);
[[noreturn]] void throwAtAssertionFailed(const char * s, ReadBuffer & buf);
-[[noreturn]] void throwTypeParseFailed(int column);
+[[noreturn]] void throwTypeParseFailed(const String & column, size_t row_num);
inline void assertChar(char symbol, ReadBuffer & buf)
diff --git a/ydb/library/yql/udfs/common/clickhouse/client/src/Processors/Formats/IRowInputFormat.cpp b/ydb/library/yql/udfs/common/clickhouse/client/src/Processors/Formats/IRowInputFormat.cpp
index 8e5f18a3cb..c804d92970 100644
--- a/ydb/library/yql/udfs/common/clickhouse/client/src/Processors/Formats/IRowInputFormat.cpp
+++ b/ydb/library/yql/udfs/common/clickhouse/client/src/Processors/Formats/IRowInputFormat.cpp
@@ -73,6 +73,7 @@ Chunk IRowInputFormat::generate()
{
try
{
+ info.current_row = total_rows;
++total_rows;
info.read_columns.clear();
diff --git a/ydb/library/yql/udfs/common/clickhouse/client/src/Processors/Formats/IRowInputFormat.h b/ydb/library/yql/udfs/common/clickhouse/client/src/Processors/Formats/IRowInputFormat.h
index 1e79e904d1..7b64c49583 100644
--- a/ydb/library/yql/udfs/common/clickhouse/client/src/Processors/Formats/IRowInputFormat.h
+++ b/ydb/library/yql/udfs/common/clickhouse/client/src/Processors/Formats/IRowInputFormat.h
@@ -17,6 +17,7 @@ struct RowReadExtension
/// IRowInputFormat::read output. It contains non zero for columns that actually read from the source and zero otherwise.
/// It's used to attach defaults for partially filled rows.
std::vector<UInt8> read_columns;
+ size_t current_row = 0;
};
/// Common parameters for generating blocks.
diff --git a/ydb/library/yql/udfs/common/clickhouse/client/src/Processors/Formats/Impl/CSVRowInputFormat.cpp b/ydb/library/yql/udfs/common/clickhouse/client/src/Processors/Formats/Impl/CSVRowInputFormat.cpp
index 9b86f9565d..002b1f8688 100644
--- a/ydb/library/yql/udfs/common/clickhouse/client/src/Processors/Formats/Impl/CSVRowInputFormat.cpp
+++ b/ydb/library/yql/udfs/common/clickhouse/client/src/Processors/Formats/Impl/CSVRowInputFormat.cpp
@@ -36,13 +36,17 @@ CSVRowInputFormat::CSVRowInputFormat(const Block & header_, ReadBuffer & in_, co
data_types.resize(num_columns);
column_indexes_by_names.reserve(num_columns);
+ names_by_column_indexes.resize(num_columns);
+ is_required_columns.resize(num_columns);
+ auto columns = sample.cloneEmptyColumns();
for (size_t i = 0; i < num_columns; ++i)
{
const auto & column_info = sample.getByPosition(i);
-
+ is_required_columns[i] = !columns[i]->isNullable();
data_types[i] = column_info.type;
column_indexes_by_names.emplace(column_info.name, i);
+ names_by_column_indexes[i] = column_info.name;
}
}
@@ -221,6 +225,13 @@ void CSVRowInputFormat::readPrefix()
}
}
+ for (size_t i = 0; i < column_mapping->read_columns.size(); i++)
+ {
+ if (!column_mapping->read_columns[i] && is_required_columns[i])
+ {
+ throw Exception(String("Column `") + names_by_column_indexes[i] + "` is marked as not null, but was not found in the csv file", ErrorCodes::INCORRECT_DATA);
+ }
+ }
return;
}
else
@@ -261,6 +272,10 @@ bool CSVRowInputFormat::readRow(MutableColumns & columns, RowReadExtension & ext
if (!ext.read_columns[*table_column])
have_default_columns = true;
skipWhitespacesAndTabs(in);
+
+ if (!checkTypeValidness(in, delimiter, is_last_file_column)) {
+ throwTypeParseFailed(names_by_column_indexes[*table_column], ext.current_row);
+ }
}
else
{
@@ -269,9 +284,6 @@ bool CSVRowInputFormat::readRow(MutableColumns & columns, RowReadExtension & ext
readCSVString(tmp, in, format_settings.csv);
}
- if (!checkTypeValidness(in, delimiter, is_last_file_column)) {
- throwTypeParseFailed(file_column);
- }
skipDelimiter(in, delimiter, is_last_file_column);
}
diff --git a/ydb/library/yql/udfs/common/clickhouse/client/src/Processors/Formats/Impl/CSVRowInputFormat.h b/ydb/library/yql/udfs/common/clickhouse/client/src/Processors/Formats/Impl/CSVRowInputFormat.h
index 8845149b56..1f357966cb 100644
--- a/ydb/library/yql/udfs/common/clickhouse/client/src/Processors/Formats/Impl/CSVRowInputFormat.h
+++ b/ydb/library/yql/udfs/common/clickhouse/client/src/Processors/Formats/Impl/CSVRowInputFormat.h
@@ -37,6 +37,8 @@ private:
DataTypes data_types;
using IndexesMap = std::unordered_map<String, size_t>;
IndexesMap column_indexes_by_names;
+ std::vector<String> names_by_column_indexes;
+ std::vector<bool> is_required_columns;
void addInputColumn(const String & column_name);