diff options
author | Oleg Doronin <dorooleg@yandex.ru> | 2024-01-29 20:51:03 +0300 |
---|---|---|
committer | GitHub <noreply@github.com> | 2024-01-29 20:51:03 +0300 |
commit | f2ccd9c18aeacf78e3b1b29df20634dfc50ce5bf (patch) | |
tree | d7b88c425f70bd4b5145f582bc2b3295bf246c88 | |
parent | 0f64a086eefa2fe6059edf4af6f0b32f5f5383f6 (diff) | |
download | ydb-f2ccd9c18aeacf78e3b1b29df20634dfc50ce5bf.tar.gz |
Improved handling of empty cells in CSV YQ-2727 (#796)
* csv null poc
* cleanup
* not null string as default
* cleanup tests
7 files changed, 135 insertions, 0 deletions
diff --git a/ydb/library/yql/providers/s3/actors/yql_s3_read_actor.cpp b/ydb/library/yql/providers/s3/actors/yql_s3_read_actor.cpp index 1c58ffe536..695588bd15 100644 --- a/ydb/library/yql/providers/s3/actors/yql_s3_read_actor.cpp +++ b/ydb/library/yql/providers/s3/actors/yql_s3_read_actor.cpp @@ -3034,6 +3034,10 @@ std::pair<NYql::NDq::IDqComputeActorAsyncInput*, IActor*> CreateS3ReadActor( readSpec->Format = params.GetFormat(); + if (readSpec->Format == "csv_with_names") { + readSpec->Settings.csv.empty_as_default = true; + } + if (const auto it = settings.find("compression"); settings.cend() != it) readSpec->Compression = it->second; diff --git a/ydb/library/yql/udfs/common/clickhouse/client/src/Processors/Formats/Impl/CSVRowInputFormat.cpp b/ydb/library/yql/udfs/common/clickhouse/client/src/Processors/Formats/Impl/CSVRowInputFormat.cpp index 125a094f42..537ebe7fd5 100644 --- a/ydb/library/yql/udfs/common/clickhouse/client/src/Processors/Formats/Impl/CSVRowInputFormat.cpp +++ b/ydb/library/yql/udfs/common/clickhouse/client/src/Processors/Formats/Impl/CSVRowInputFormat.cpp @@ -429,6 +429,9 @@ bool CSVRowInputFormat::readField(IColumn & column, const DataTypePtr & type, co if (format_settings.csv.empty_as_default && (at_delimiter || at_last_column_line_end)) { + if (!type->isNullable() && type->getName() != "String" && !type->getName().starts_with("FixedString")) { + throw ParsingException("Invalid data format", NDB::ErrorCodes::INCORRECT_DATA); + } /// Treat empty unquoted column value as default value, if /// specified in the settings. Tuple columns might seem /// problematic, because they are never quoted but still contain diff --git a/ydb/tests/fq/s3/test_format_setting.py b/ydb/tests/fq/s3/test_format_setting.py index f1267348db..b51693e442 100644 --- a/ydb/tests/fq/s3/test_format_setting.py +++ b/ydb/tests/fq/s3/test_format_setting.py @@ -15,6 +15,7 @@ import ydb.public.api.protos.draft.fq_pb2 as fq import ydb.tests.fq.s3.s3_helpers as s3_helpers from ydb.tests.tools.fq_runner.kikimr_utils import yq_all +from google.protobuf import struct_pb2 class TestS3(TestYdsBase): @@ -759,3 +760,122 @@ Pear;15;33''' assert result_set.columns[3].type.type_id == ydb.Type.INT32 assert len(result_set.rows) == 6 + + @yq_all + @pytest.mark.parametrize("filename", [ + ("date_null/as_default/test.csv"), + ("date_null/parse_error/test.csv") + ]) + def test_date_null(self, kikimr, s3, client, filename): + self.create_bucket_and_upload_file(filename, s3, kikimr) + client.create_storage_connection("hcpp", "fbucket") + + sql = ''' + SELECT + `put` + FROM + `hcpp`.`{name}` + WITH (FORMAT="csv_with_names", + csv_delimiter=",", + SCHEMA=( + `put` Date + )) + LIMIT 10; + '''.format(name="/" + filename) + + query_id = client.create_query("simple", sql, type=fq.QueryContent.QueryType.ANALYTICS).result.query_id + client.wait_query_status(query_id, fq.QueryMeta.COMPLETED) + data = client.get_result_data(query_id, limit=50) + assert data.result.result_set.rows[0].items[0].null_flag_value == struct_pb2.NULL_VALUE, str(data.result.result_set) + + @yq_all + @pytest.mark.parametrize("filename", [ + ("date_null/as_default/test.csv"), + ("date_null/parse_error/test.csv") + ]) + def test_date_null_with_not_null_type(self, kikimr, s3, client, filename): + self.create_bucket_and_upload_file(filename, s3, kikimr) + client.create_storage_connection("hcpp", "fbucket") + + sql = ''' + SELECT + `put` + FROM + `hcpp`.`{name}` + WITH (FORMAT="csv_with_names", + csv_delimiter=",", + SCHEMA=( + `put` Date NOT NULL + )) + LIMIT 10; + '''.format(name="/" + filename) + + query_id = client.create_query("simple", sql, type=fq.QueryContent.QueryType.ANALYTICS).result.query_id + client.wait_query_status(query_id, fq.QueryMeta.FAILED) + describe_result = client.describe_query(query_id).result + issues = describe_result.query.issue[0].issues + assert "Invalid data format" in str(issues), str(describe_result) + assert "name: put, type: Date, ERROR: text " in str(issues), str(describe_result) + assert "is not like Date" in str(issues), str(describe_result) + + @yq_all + @pytest.mark.parametrize("filename", [ + ("date_null/as_default/multi_null.csv"), + ("date_null/parse_error/multi_null.csv") + ]) + def test_date_null_multi(self, kikimr, s3, client, filename): + self.create_bucket_and_upload_file(filename, s3, kikimr) + client.create_storage_connection("hcpp", "fbucket") + + sql = ''' + SELECT + `put`, `a`, `t` + FROM + `hcpp`.`{name}` + WITH (FORMAT="csv_with_names", + csv_delimiter=",", + SCHEMA=( + `put` Date, + `a` Date, + `t` Date + )) + LIMIT 10; + '''.format(name="/" + filename) + + query_id = client.create_query("simple", sql, type=fq.QueryContent.QueryType.ANALYTICS).result.query_id + client.wait_query_status(query_id, fq.QueryMeta.COMPLETED) + data = client.get_result_data(query_id, limit=50) + assert data.result.result_set.rows[0].items[0].null_flag_value == struct_pb2.NULL_VALUE, str(data.result.result_set) + assert data.result.result_set.rows[0].items[1].null_flag_value == struct_pb2.NULL_VALUE, str(data.result.result_set) + assert data.result.result_set.rows[0].items[2].null_flag_value == struct_pb2.NULL_VALUE, str(data.result.result_set) + + @yq_all + @pytest.mark.parametrize("filename", [ + ("date_null/as_default/multi_null.csv"), + ("date_null/parse_error/multi_null.csv") + ]) + def test_string_not_null_multi(self, kikimr, s3, client, filename): + self.create_bucket_and_upload_file(filename, s3, kikimr) + client.create_storage_connection("hcpp", "fbucket") + + sql = ''' + SELECT + `put`, `a`, `t` + FROM + `hcpp`.`{name}` + WITH (FORMAT="csv_with_names", + csv_delimiter=",", + SCHEMA=( + `put` String NOT NULL, + `a` Utf8 NOT NULL, + `t` String NOT NULL + )) + LIMIT 10; + '''.format(name="/" + filename) + + query_id = client.create_query("simple", sql, type=fq.QueryContent.QueryType.ANALYTICS).result.query_id + client.wait_query_status(query_id, fq.QueryMeta.COMPLETED) + data = client.get_result_data(query_id, limit=50) + assert data.result.result_set.rows[0].items[0].bytes_value == b"", str(data.result.result_set) + assert data.result.result_set.rows[0].items[1].bytes_value == b"", str(data.result.result_set) + assert data.result.result_set.rows[0].items[2].bytes_value == b"", str(data.result.result_set) diff --git a/ydb/tests/fq/s3/test_format_settings/date_null/as_default/multi_null.csv b/ydb/tests/fq/s3/test_format_settings/date_null/as_default/multi_null.csv new file mode 100644 index 0000000000..6e1aec129f --- /dev/null +++ b/ydb/tests/fq/s3/test_format_settings/date_null/as_default/multi_null.csv @@ -0,0 +1,2 @@ +id,put,call,a,t +id1,,2022-12-22,,
\ No newline at end of file diff --git a/ydb/tests/fq/s3/test_format_settings/date_null/as_default/test.csv b/ydb/tests/fq/s3/test_format_settings/date_null/as_default/test.csv new file mode 100644 index 0000000000..e212764385 --- /dev/null +++ b/ydb/tests/fq/s3/test_format_settings/date_null/as_default/test.csv @@ -0,0 +1,2 @@ +id,put,call +id1,,2022-12-22
\ No newline at end of file diff --git a/ydb/tests/fq/s3/test_format_settings/date_null/parse_error/multi_null.csv b/ydb/tests/fq/s3/test_format_settings/date_null/parse_error/multi_null.csv new file mode 100644 index 0000000000..42fd11d35f --- /dev/null +++ b/ydb/tests/fq/s3/test_format_settings/date_null/parse_error/multi_null.csv @@ -0,0 +1,2 @@ +id,put,call,a,t +id1,,hello2,, diff --git a/ydb/tests/fq/s3/test_format_settings/date_null/parse_error/test.csv b/ydb/tests/fq/s3/test_format_settings/date_null/parse_error/test.csv new file mode 100644 index 0000000000..de4da74ad8 --- /dev/null +++ b/ydb/tests/fq/s3/test_format_settings/date_null/parse_error/test.csv @@ -0,0 +1,2 @@ +id,put,call +id1,,hello2 |