aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorIvan Sukhov <evanevannnn@ydb.tech>2024-10-17 12:35:08 +0300
committerGitHub <noreply@github.com>2024-10-17 12:35:08 +0300
commita71cc84f0c2f4321af417de000484193208eec01 (patch)
tree5438084c3914f5e3954e198914867d6b8e3ef541
parentfd93fab49c54f6bfaf5e480d6d537eb7f2716f64 (diff)
downloadydb-a71cc84f0c2f4321af417de000484193208eec01.tar.gz
Additional validation for queries with "json_list" format and datetime types (#10208)
-rw-r--r--ydb/core/external_sources/object_storage.cpp69
-rw-r--r--ydb/core/external_sources/object_storage_ut.cpp25
-rw-r--r--ydb/library/yql/providers/common/provider/yql_provider.cpp24
-rw-r--r--ydb/tests/fq/s3/test_s3_0.py63
4 files changed, 181 insertions, 0 deletions
diff --git a/ydb/core/external_sources/object_storage.cpp b/ydb/core/external_sources/object_storage.cpp
index 7128417f6f..a96b3dd9b4 100644
--- a/ydb/core/external_sources/object_storage.cpp
+++ b/ydb/core/external_sources/object_storage.cpp
@@ -150,6 +150,7 @@ struct TObjectStorageExternalSource : public IExternalSource {
}
const bool hasPartitioning = objectStorage.projection_size() || objectStorage.partitioned_by_size();
issues.AddIssues(ValidateFormatSetting(objectStorage.format(), objectStorage.format_setting(), location, hasPartitioning));
+ issues.AddIssues(ValidateJsonListFormat(objectStorage.format(), schema, objectStorage.partitioned_by()));
issues.AddIssues(ValidateRawFormat(objectStorage.format(), schema, objectStorage.partitioned_by()));
if (hasPartitioning) {
if (NYql::NS3::HasWildcards(location)) {
@@ -264,6 +265,30 @@ struct TObjectStorageExternalSource : public IExternalSource {
}
template<typename TScheme>
+ static NYql::TIssues ValidateJsonListFormat(const TString& format, const TScheme& schema, const google::protobuf::RepeatedPtrField<TString>& partitionedBy) {
+ NYql::TIssues issues;
+ if (format != "json_list"sv) {
+ return issues;
+ }
+
+ TSet<TString> partitionedBySet{partitionedBy.begin(), partitionedBy.end()};
+
+ for (const auto& column: schema.column()) {
+ if (partitionedBySet.contains(column.name())) {
+ continue;
+ }
+ if (ValidateDateOrTimeType(column.type())) {
+ issues.AddIssue(MakeErrorIssue(
+ Ydb::StatusIds::BAD_REQUEST,
+ TStringBuilder{} << "Date, Timestamp and Interval types are not allowed in json_list format (you have '"
+ << column.name() << " " << NYdb::TType(column.type()).ToString() << "' field)"));
+ }
+ }
+
+ return issues;
+ }
+
+ template<typename TScheme>
static NYql::TIssues ValidateRawFormat(const TString& format, const TScheme& schema, const google::protobuf::RepeatedPtrField<TString>& partitionedBy) {
NYql::TIssues issues;
if (format != "raw"sv) {
@@ -800,6 +825,50 @@ private:
return FindIf(availableTypes, [&columnType](const auto& availableType) { return NYdb::TypesEqual(availableType, columnType); }) != availableTypes.end();
}
+ static std::vector<NYdb::TType> GetDateOrTimeTypes() {
+ NYdb::TType dateType = NYdb::TTypeBuilder{}.Primitive(NYdb::EPrimitiveType::Date).Build();
+ NYdb::TType datetimeType = NYdb::TTypeBuilder{}.Primitive(NYdb::EPrimitiveType::Datetime).Build();
+ NYdb::TType timestampType = NYdb::TTypeBuilder{}.Primitive(NYdb::EPrimitiveType::Timestamp).Build();
+ NYdb::TType intervalType = NYdb::TTypeBuilder{}.Primitive(NYdb::EPrimitiveType::Interval).Build();
+ NYdb::TType date32Type = NYdb::TTypeBuilder{}.Primitive(NYdb::EPrimitiveType::Date32).Build();
+ NYdb::TType datetime64Type = NYdb::TTypeBuilder{}.Primitive(NYdb::EPrimitiveType::Datetime64).Build();
+ NYdb::TType timestamp64Type = NYdb::TTypeBuilder{}.Primitive(NYdb::EPrimitiveType::Timestamp64).Build();
+ NYdb::TType interval64Type = NYdb::TTypeBuilder{}.Primitive(NYdb::EPrimitiveType::Interval64).Build();
+ NYdb::TType tzdateType = NYdb::TTypeBuilder{}.Primitive(NYdb::EPrimitiveType::TzDate).Build();
+ NYdb::TType tzdatetimeType = NYdb::TTypeBuilder{}.Primitive(NYdb::EPrimitiveType::TzDatetime).Build();
+ NYdb::TType tztimestampType = NYdb::TTypeBuilder{}.Primitive(NYdb::EPrimitiveType::TzTimestamp).Build();
+ const std::vector<NYdb::TType> result {
+ dateType,
+ datetimeType,
+ timestampType,
+ intervalType,
+ date32Type,
+ datetime64Type,
+ timestamp64Type,
+ interval64Type,
+ tzdateType,
+ tzdatetimeType,
+ tztimestampType,
+ NYdb::TTypeBuilder{}.Optional(dateType).Build(),
+ NYdb::TTypeBuilder{}.Optional(datetimeType).Build(),
+ NYdb::TTypeBuilder{}.Optional(timestampType).Build(),
+ NYdb::TTypeBuilder{}.Optional(intervalType).Build(),
+ NYdb::TTypeBuilder{}.Optional(date32Type).Build(),
+ NYdb::TTypeBuilder{}.Optional(datetime64Type).Build(),
+ NYdb::TTypeBuilder{}.Optional(timestamp64Type).Build(),
+ NYdb::TTypeBuilder{}.Optional(interval64Type).Build(),
+ NYdb::TTypeBuilder{}.Optional(tzdateType).Build(),
+ NYdb::TTypeBuilder{}.Optional(tzdatetimeType).Build(),
+ NYdb::TTypeBuilder{}.Optional(tztimestampType).Build()
+ };
+ return result;
+ }
+
+ static bool ValidateDateOrTimeType(const NYdb::TType& columnType) {
+ static const std::vector<NYdb::TType> availableTypes = GetDateOrTimeTypes();
+ return FindIf(availableTypes, [&columnType](const auto& availableType) { return NYdb::TypesEqual(availableType, columnType); }) != availableTypes.end();
+ }
+
private:
const std::vector<TRegExMatch> HostnamePatterns;
const size_t PathsLimit;
diff --git a/ydb/core/external_sources/object_storage_ut.cpp b/ydb/core/external_sources/object_storage_ut.cpp
index 23fcc0e214..d5bc4a655d 100644
--- a/ydb/core/external_sources/object_storage_ut.cpp
+++ b/ydb/core/external_sources/object_storage_ut.cpp
@@ -30,6 +30,31 @@ Y_UNIT_TEST_SUITE(ObjectStorageTest) {
UNIT_ASSERT_EXCEPTION_CONTAINS(source->Pack(schema, general), NExternalSource::TExternalSourceException, "Partition by must always be specified");
}
+ Y_UNIT_TEST(FailedJsonListValidation) {
+ static auto invalidTypes = {
+ Ydb::Type::DATE,
+ Ydb::Type::DATETIME,
+ Ydb::Type::TIMESTAMP,
+ Ydb::Type::INTERVAL,
+ Ydb::Type::DATE32,
+ Ydb::Type::DATETIME64,
+ Ydb::Type::TIMESTAMP64,
+ Ydb::Type::INTERVAL64,
+ Ydb::Type::TZ_DATE,
+ Ydb::Type::TZ_DATETIME,
+ Ydb::Type::TZ_TIMESTAMP,
+ };
+ auto source = NExternalSource::CreateObjectStorageExternalSource({}, nullptr, 1000, nullptr, false, false);
+ NKikimrExternalSources::TSchema schema;
+ for (const auto typeId : invalidTypes) {
+ auto newColumn = schema.add_column();
+ newColumn->mutable_type()->set_type_id(typeId);
+ }
+ NKikimrExternalSources::TGeneral general;
+ general.mutable_attributes()->insert({"format", "json_list"});
+ UNIT_ASSERT_EXCEPTION_CONTAINS(source->Pack(schema, general), NExternalSource::TExternalSourceException, "Date, Timestamp and Interval types are not allowed in json_list format");
+ }
+
Y_UNIT_TEST(WildcardsValidation) {
auto source = NExternalSource::CreateObjectStorageExternalSource({}, nullptr, 1000, nullptr, false, false);
NKikimrExternalSources::TSchema schema;
diff --git a/ydb/library/yql/providers/common/provider/yql_provider.cpp b/ydb/library/yql/providers/common/provider/yql_provider.cpp
index 551d535596..59c40e9958 100644
--- a/ydb/library/yql/providers/common/provider/yql_provider.cpp
+++ b/ydb/library/yql/providers/common/provider/yql_provider.cpp
@@ -1660,6 +1660,30 @@ bool ValidateFormatForInput(
return false;
}
}
+ else if (schemaStructRowType && format == TStringBuf("json_list")) {
+ bool failedSchemaColumns = false;
+
+ for (const TItemExprType* item : schemaStructRowType->GetItems()) {
+ if (excludeFields && excludeFields(item->GetName())) {
+ continue;
+ }
+ const TTypeAnnotationNode* rowType = item->GetItemType();
+ if (rowType->GetKind() == ETypeAnnotationKind::Optional) {
+ rowType = rowType->Cast<TOptionalExprType>()->GetItemType();
+ }
+
+ if (rowType->GetKind() == ETypeAnnotationKind::Data
+ && IsDataTypeDateOrTzDateOrInterval(rowType->Cast<TDataExprType>()->GetSlot())) {
+ ctx.AddError(TIssue(TStringBuilder() << "Date, Timestamp and Interval types are not allowed in json_list format (you have '"
+ << item->GetName() << " " << FormatType(rowType) << "' field)"));
+ failedSchemaColumns = true;
+ }
+ }
+
+ if (failedSchemaColumns) {
+ return false;
+ }
+ }
return true;
}
diff --git a/ydb/tests/fq/s3/test_s3_0.py b/ydb/tests/fq/s3/test_s3_0.py
index 892d9870da..d342d68c96 100644
--- a/ydb/tests/fq/s3/test_s3_0.py
+++ b/ydb/tests/fq/s3/test_s3_0.py
@@ -668,6 +668,69 @@ Pear,15,33'''
assert result_set.rows[2].items[0].int64_value == 30
assert sum(kikimr.control_plane.get_metering(1)) == 10
+ @yq_v2
+ @pytest.mark.parametrize("client", [{"folder_id": "my_folder"}], indirect=True)
+ def test_json_list_formats(self, kikimr, s3, client, unique_prefix):
+ resource = boto3.resource(
+ "s3", endpoint_url=s3.s3_url, aws_access_key_id="key", aws_secret_access_key="secret_key"
+ )
+
+ bucket = resource.Bucket("fbucket")
+ bucket.create(ACL='public-read')
+ bucket.objects.all().delete()
+
+ s3_client = boto3.client(
+ "s3", endpoint_url=s3.s3_url, aws_access_key_id="key", aws_secret_access_key="secret_key"
+ )
+
+ fruits = '''[
+ { "date" : "", "datetime" : "", "timestamp" : "", "interval" : "", "date32" : "", "datetime64" : "", "timestamp64" : "", "interval64" : "", "tzDate" : "", "tzDateTime" : "", "tzTimestamp" : "" },
+ { "date" : "", "datetime" : "", "timestamp" : "", "interval" : "", "date32" : "", "datetime64" : "", "timestamp64" : "", "interval64" : "", "tzDate" : "", "tzDateTime" : "", "tzTimestamp" : "" },
+ { "date" : "", "datetime" : "", "timestamp" : "", "interval" : "", "date32" : "", "datetime64" : "", "timestamp64" : "", "interval64" : "", "tzDate" : "", "tzDateTime" : "", "tzTimestamp" : "" }
+]'''
+ s3_client.put_object(Body=fruits, Bucket='fbucket', Key='timestamp.json', ContentType='text/plain')
+
+ kikimr.control_plane.wait_bootstrap(1)
+ storage_connection_name = unique_prefix + "fruitbucket"
+ client.create_storage_connection(storage_connection_name, "fbucket")
+
+ sql = f'''
+ SELECT *
+ FROM `{storage_connection_name}`.`/timestamp.json`
+ WITH (
+ format="json_list",
+ schema=(
+ `date` date,
+ `datetime` datetime,
+ `timestamp` timestamp,
+ `interval` interval,
+ `date32` date32,
+ `datetime64` datetime64,
+ `timestamp64` timestamp64,
+ `interval64` interval64,
+ `tzDate` tzDate,
+ `tzDateTime` tzDateTime,
+ `tzTimestamp` tzTimestamp
+ ));
+ '''
+
+ query_id = client.create_query("simple", sql, type=fq.QueryContent.QueryType.ANALYTICS).result.query_id
+ client.wait_query_status(query_id, fq.QueryMeta.FAILED)
+
+ error_message = str(client.describe_query(query_id).result)
+ assert "Date, Timestamp and Interval types are not allowed in json_list format" in error_message
+ assert "Date" in error_message
+ assert "Datetime" in error_message
+ assert "Timestamp" in error_message
+ assert "Interval" in error_message
+ assert "Date32" in error_message
+ assert "Datetime64" in error_message
+ assert "Timestamp64" in error_message
+ assert "Interval64" in error_message
+ assert "TzDate" in error_message
+ assert "TzDatetime" in error_message
+ assert "TzTimestamp" in error_message
+
@yq_all
@pytest.mark.parametrize("client", [{"folder_id": "my_folder"}], indirect=True)
def test_csv_with_hopping(self, kikimr, s3, client, unique_prefix):