aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorOleg Doronin <dorooleg@yandex.ru>2024-01-29 20:51:03 +0300
committerGitHub <noreply@github.com>2024-01-29 20:51:03 +0300
commitf2ccd9c18aeacf78e3b1b29df20634dfc50ce5bf (patch)
treed7b88c425f70bd4b5145f582bc2b3295bf246c88
parent0f64a086eefa2fe6059edf4af6f0b32f5f5383f6 (diff)
downloadydb-f2ccd9c18aeacf78e3b1b29df20634dfc50ce5bf.tar.gz
Improved handling of empty cells in CSV YQ-2727 (#796)
* csv null poc * cleanup * not null string as default * cleanup tests
-rw-r--r--ydb/library/yql/providers/s3/actors/yql_s3_read_actor.cpp4
-rw-r--r--ydb/library/yql/udfs/common/clickhouse/client/src/Processors/Formats/Impl/CSVRowInputFormat.cpp3
-rw-r--r--ydb/tests/fq/s3/test_format_setting.py120
-rw-r--r--ydb/tests/fq/s3/test_format_settings/date_null/as_default/multi_null.csv2
-rw-r--r--ydb/tests/fq/s3/test_format_settings/date_null/as_default/test.csv2
-rw-r--r--ydb/tests/fq/s3/test_format_settings/date_null/parse_error/multi_null.csv2
-rw-r--r--ydb/tests/fq/s3/test_format_settings/date_null/parse_error/test.csv2
7 files changed, 135 insertions, 0 deletions
diff --git a/ydb/library/yql/providers/s3/actors/yql_s3_read_actor.cpp b/ydb/library/yql/providers/s3/actors/yql_s3_read_actor.cpp
index 1c58ffe536..695588bd15 100644
--- a/ydb/library/yql/providers/s3/actors/yql_s3_read_actor.cpp
+++ b/ydb/library/yql/providers/s3/actors/yql_s3_read_actor.cpp
@@ -3034,6 +3034,10 @@ std::pair<NYql::NDq::IDqComputeActorAsyncInput*, IActor*> CreateS3ReadActor(
readSpec->Format = params.GetFormat();
+ if (readSpec->Format == "csv_with_names") {
+ readSpec->Settings.csv.empty_as_default = true;
+ }
+
if (const auto it = settings.find("compression"); settings.cend() != it)
readSpec->Compression = it->second;
diff --git a/ydb/library/yql/udfs/common/clickhouse/client/src/Processors/Formats/Impl/CSVRowInputFormat.cpp b/ydb/library/yql/udfs/common/clickhouse/client/src/Processors/Formats/Impl/CSVRowInputFormat.cpp
index 125a094f42..537ebe7fd5 100644
--- a/ydb/library/yql/udfs/common/clickhouse/client/src/Processors/Formats/Impl/CSVRowInputFormat.cpp
+++ b/ydb/library/yql/udfs/common/clickhouse/client/src/Processors/Formats/Impl/CSVRowInputFormat.cpp
@@ -429,6 +429,9 @@ bool CSVRowInputFormat::readField(IColumn & column, const DataTypePtr & type, co
if (format_settings.csv.empty_as_default
&& (at_delimiter || at_last_column_line_end))
{
+ if (!type->isNullable() && type->getName() != "String" && !type->getName().starts_with("FixedString")) {
+ throw ParsingException("Invalid data format", NDB::ErrorCodes::INCORRECT_DATA);
+ }
/// Treat empty unquoted column value as default value, if
/// specified in the settings. Tuple columns might seem
/// problematic, because they are never quoted but still contain
diff --git a/ydb/tests/fq/s3/test_format_setting.py b/ydb/tests/fq/s3/test_format_setting.py
index f1267348db..b51693e442 100644
--- a/ydb/tests/fq/s3/test_format_setting.py
+++ b/ydb/tests/fq/s3/test_format_setting.py
@@ -15,6 +15,7 @@ import ydb.public.api.protos.draft.fq_pb2 as fq
import ydb.tests.fq.s3.s3_helpers as s3_helpers
from ydb.tests.tools.fq_runner.kikimr_utils import yq_all
+from google.protobuf import struct_pb2
class TestS3(TestYdsBase):
@@ -759,3 +760,122 @@ Pear;15;33'''
assert result_set.columns[3].type.type_id == ydb.Type.INT32
assert len(result_set.rows) == 6
+
+ @yq_all
+ @pytest.mark.parametrize("filename", [
+ ("date_null/as_default/test.csv"),
+ ("date_null/parse_error/test.csv")
+ ])
+ def test_date_null(self, kikimr, s3, client, filename):
+ self.create_bucket_and_upload_file(filename, s3, kikimr)
+ client.create_storage_connection("hcpp", "fbucket")
+
+ sql = '''
+ SELECT
+ `put`
+ FROM
+ `hcpp`.`{name}`
+ WITH (FORMAT="csv_with_names",
+ csv_delimiter=",",
+ SCHEMA=(
+ `put` Date
+ ))
+ LIMIT 10;
+ '''.format(name="/" + filename)
+
+ query_id = client.create_query("simple", sql, type=fq.QueryContent.QueryType.ANALYTICS).result.query_id
+ client.wait_query_status(query_id, fq.QueryMeta.COMPLETED)
+ data = client.get_result_data(query_id, limit=50)
+ assert data.result.result_set.rows[0].items[0].null_flag_value == struct_pb2.NULL_VALUE, str(data.result.result_set)
+
+ @yq_all
+ @pytest.mark.parametrize("filename", [
+ ("date_null/as_default/test.csv"),
+ ("date_null/parse_error/test.csv")
+ ])
+ def test_date_null_with_not_null_type(self, kikimr, s3, client, filename):
+ self.create_bucket_and_upload_file(filename, s3, kikimr)
+ client.create_storage_connection("hcpp", "fbucket")
+
+ sql = '''
+ SELECT
+ `put`
+ FROM
+ `hcpp`.`{name}`
+ WITH (FORMAT="csv_with_names",
+ csv_delimiter=",",
+ SCHEMA=(
+ `put` Date NOT NULL
+ ))
+ LIMIT 10;
+ '''.format(name="/" + filename)
+
+ query_id = client.create_query("simple", sql, type=fq.QueryContent.QueryType.ANALYTICS).result.query_id
+ client.wait_query_status(query_id, fq.QueryMeta.FAILED)
+ describe_result = client.describe_query(query_id).result
+ issues = describe_result.query.issue[0].issues
+ assert "Invalid data format" in str(issues), str(describe_result)
+ assert "name: put, type: Date, ERROR: text " in str(issues), str(describe_result)
+ assert "is not like Date" in str(issues), str(describe_result)
+
+ @yq_all
+ @pytest.mark.parametrize("filename", [
+ ("date_null/as_default/multi_null.csv"),
+ ("date_null/parse_error/multi_null.csv")
+ ])
+ def test_date_null_multi(self, kikimr, s3, client, filename):
+ self.create_bucket_and_upload_file(filename, s3, kikimr)
+ client.create_storage_connection("hcpp", "fbucket")
+
+ sql = '''
+ SELECT
+ `put`, `a`, `t`
+ FROM
+ `hcpp`.`{name}`
+ WITH (FORMAT="csv_with_names",
+ csv_delimiter=",",
+ SCHEMA=(
+ `put` Date,
+ `a` Date,
+ `t` Date
+ ))
+ LIMIT 10;
+ '''.format(name="/" + filename)
+
+ query_id = client.create_query("simple", sql, type=fq.QueryContent.QueryType.ANALYTICS).result.query_id
+ client.wait_query_status(query_id, fq.QueryMeta.COMPLETED)
+ data = client.get_result_data(query_id, limit=50)
+ assert data.result.result_set.rows[0].items[0].null_flag_value == struct_pb2.NULL_VALUE, str(data.result.result_set)
+ assert data.result.result_set.rows[0].items[1].null_flag_value == struct_pb2.NULL_VALUE, str(data.result.result_set)
+ assert data.result.result_set.rows[0].items[2].null_flag_value == struct_pb2.NULL_VALUE, str(data.result.result_set)
+
+ @yq_all
+ @pytest.mark.parametrize("filename", [
+ ("date_null/as_default/multi_null.csv"),
+ ("date_null/parse_error/multi_null.csv")
+ ])
+ def test_string_not_null_multi(self, kikimr, s3, client, filename):
+ self.create_bucket_and_upload_file(filename, s3, kikimr)
+ client.create_storage_connection("hcpp", "fbucket")
+
+ sql = '''
+ SELECT
+ `put`, `a`, `t`
+ FROM
+ `hcpp`.`{name}`
+ WITH (FORMAT="csv_with_names",
+ csv_delimiter=",",
+ SCHEMA=(
+ `put` String NOT NULL,
+ `a` Utf8 NOT NULL,
+ `t` String NOT NULL
+ ))
+ LIMIT 10;
+ '''.format(name="/" + filename)
+
+ query_id = client.create_query("simple", sql, type=fq.QueryContent.QueryType.ANALYTICS).result.query_id
+ client.wait_query_status(query_id, fq.QueryMeta.COMPLETED)
+ data = client.get_result_data(query_id, limit=50)
+ assert data.result.result_set.rows[0].items[0].bytes_value == b"", str(data.result.result_set)
+ assert data.result.result_set.rows[0].items[1].bytes_value == b"", str(data.result.result_set)
+ assert data.result.result_set.rows[0].items[2].bytes_value == b"", str(data.result.result_set)
diff --git a/ydb/tests/fq/s3/test_format_settings/date_null/as_default/multi_null.csv b/ydb/tests/fq/s3/test_format_settings/date_null/as_default/multi_null.csv
new file mode 100644
index 0000000000..6e1aec129f
--- /dev/null
+++ b/ydb/tests/fq/s3/test_format_settings/date_null/as_default/multi_null.csv
@@ -0,0 +1,2 @@
+id,put,call,a,t
+id1,,2022-12-22,, \ No newline at end of file
diff --git a/ydb/tests/fq/s3/test_format_settings/date_null/as_default/test.csv b/ydb/tests/fq/s3/test_format_settings/date_null/as_default/test.csv
new file mode 100644
index 0000000000..e212764385
--- /dev/null
+++ b/ydb/tests/fq/s3/test_format_settings/date_null/as_default/test.csv
@@ -0,0 +1,2 @@
+id,put,call
+id1,,2022-12-22 \ No newline at end of file
diff --git a/ydb/tests/fq/s3/test_format_settings/date_null/parse_error/multi_null.csv b/ydb/tests/fq/s3/test_format_settings/date_null/parse_error/multi_null.csv
new file mode 100644
index 0000000000..42fd11d35f
--- /dev/null
+++ b/ydb/tests/fq/s3/test_format_settings/date_null/parse_error/multi_null.csv
@@ -0,0 +1,2 @@
+id,put,call,a,t
+id1,,hello2,,
diff --git a/ydb/tests/fq/s3/test_format_settings/date_null/parse_error/test.csv b/ydb/tests/fq/s3/test_format_settings/date_null/parse_error/test.csv
new file mode 100644
index 0000000000..de4da74ad8
--- /dev/null
+++ b/ydb/tests/fq/s3/test_format_settings/date_null/parse_error/test.csv
@@ -0,0 +1,2 @@
+id,put,call
+id1,,hello2