diff options
author | art-snake <art-snake@yandex-team.ru> | 2022-02-10 16:50:34 +0300 |
---|---|---|
committer | Daniil Cherednik <dcherednik@yandex-team.ru> | 2022-02-10 16:50:34 +0300 |
commit | 1700010e2088971894d12a7a16d6004866f986fd (patch) | |
tree | ac3b38289119375037d595858db9751013220a3f | |
parent | 785bc0acdf3b0c63f971ee17e845945d7381dcb7 (diff) | |
download | ydb-1700010e2088971894d12a7a16d6004866f986fd.tar.gz |
Restoring authorship annotation for <art-snake@yandex-team.ru>. Commit 1 of 2.
-rw-r--r-- | library/cpp/protobuf/json/config.h | 24 | ||||
-rw-r--r-- | library/cpp/protobuf/json/json2proto.cpp | 18 | ||||
-rw-r--r-- | library/cpp/protobuf/json/json2proto.h | 24 | ||||
-rw-r--r-- | library/cpp/protobuf/json/proto2json_printer.cpp | 28 | ||||
-rw-r--r-- | library/cpp/protobuf/json/ut/json2proto_ut.cpp | 136 | ||||
-rw-r--r-- | library/cpp/protobuf/json/ut/proto2json_ut.cpp | 114 | ||||
-rw-r--r-- | library/cpp/protobuf/json/ut/test.proto | 32 | ||||
-rw-r--r-- | util/charset/utf8.cpp | 188 | ||||
-rw-r--r-- | util/charset/utf8.h | 16 | ||||
-rw-r--r-- | util/charset/utf8_ut.cpp | 74 |
10 files changed, 327 insertions, 327 deletions
diff --git a/library/cpp/protobuf/json/config.h b/library/cpp/protobuf/json/config.h index dc84fb4d5d..ae06b3d8ec 100644 --- a/library/cpp/protobuf/json/config.h +++ b/library/cpp/protobuf/json/config.h @@ -15,20 +15,20 @@ namespace NProtobufJson { bool FormatOutput = false; enum MissingKeyMode { - // Skip missing keys + // Skip missing keys MissingKeySkip = 0, - // Fill missing keys with json null value. + // Fill missing keys with json null value. MissingKeyNull, - // Use default value in any case. - // If default value is not explicitly defined, use default type value: - // i.e. 0 for integers, "" for strings - // For repeated keys, means [] - MissingKeyDefault, - // Use default value if it is explicitly specified for optional fields. - // Skip if no explicitly defined default value for optional fields. - // Throw exception if required field is empty. - // For repeated keys, same as MissingKeySkip - MissingKeyExplicitDefaultThrowRequired + // Use default value in any case. + // If default value is not explicitly defined, use default type value: + // i.e. 0 for integers, "" for strings + // For repeated keys, means [] + MissingKeyDefault, + // Use default value if it is explicitly specified for optional fields. + // Skip if no explicitly defined default value for optional fields. + // Throw exception if required field is empty. + // For repeated keys, same as MissingKeySkip + MissingKeyExplicitDefaultThrowRequired }; MissingKeyMode MissingSingleKeyMode = MissingKeySkip; MissingKeyMode MissingRepeatedKeyMode = MissingKeySkip; diff --git a/library/cpp/protobuf/json/json2proto.cpp b/library/cpp/protobuf/json/json2proto.cpp index 640c10f5a5..f19204ac35 100644 --- a/library/cpp/protobuf/json/json2proto.cpp +++ b/library/cpp/protobuf/json/json2proto.cpp @@ -19,10 +19,10 @@ } \ if (!json.JsonCheckType()) { \ if (config.CastFromString && json.IsString()) { \ - if (config.DoNotCastEmptyStrings && json.GetString().empty()) { \ - /* Empty string is same as "no value" for scalar types.*/ \ - break; \ - } \ + if (config.DoNotCastEmptyStrings && json.GetString().empty()) { \ + /* Empty string is same as "no value" for scalar types.*/ \ + break; \ + } \ reflection->ProtoSet(&proto, &field, FromString(json.GetString())); \ break; \ } \ @@ -335,7 +335,7 @@ Json2RepeatedField(const NJson::TJsonValue& json, } } - if (fieldJson.GetType() != NJson::JSON_ARRAY && !config.MapAsObject && !config.VectorizeScalars && !config.ValueVectorizer) { + if (fieldJson.GetType() != NJson::JSON_ARRAY && !config.MapAsObject && !config.VectorizeScalars && !config.ValueVectorizer) { ythrow yexception() << "JSON field doesn't represent an array for " << name << "(actual type is " @@ -361,10 +361,10 @@ Json2RepeatedField(const NJson::TJsonValue& json, for (const NJson::TJsonValue& jsonValue : jsonArray) { Json2RepeatedFieldValue(jsonValue, proto, field, config, reflection); } - } else if (config.ValueVectorizer) { - for (const NJson::TJsonValue& jsonValue : config.ValueVectorizer(fieldJson)) { - Json2RepeatedFieldValue(jsonValue, proto, field, config, reflection); - } + } else if (config.ValueVectorizer) { + for (const NJson::TJsonValue& jsonValue : config.ValueVectorizer(fieldJson)) { + Json2RepeatedFieldValue(jsonValue, proto, field, config, reflection); + } } else if (config.VectorizeScalars) { Json2RepeatedFieldValue(fieldJson, proto, field, config, reflection); } diff --git a/library/cpp/protobuf/json/json2proto.h b/library/cpp/protobuf/json/json2proto.h index 4c33498dfa..458e0c0909 100644 --- a/library/cpp/protobuf/json/json2proto.h +++ b/library/cpp/protobuf/json/json2proto.h @@ -19,7 +19,7 @@ namespace google { namespace NProtobufJson { struct TJson2ProtoConfig { using TSelf = TJson2ProtoConfig; - using TValueVectorizer = std::function<NJson::TJsonValue::TArray(const NJson::TJsonValue& jsonValue)>; + using TValueVectorizer = std::function<NJson::TJsonValue::TArray(const NJson::TJsonValue& jsonValue)>; enum FldNameMode { FieldNameOriginalCase = 0, // default @@ -58,11 +58,11 @@ namespace NProtobufJson { return *this; } - TSelf& SetDoNotCastEmptyStrings(bool cast) { - DoNotCastEmptyStrings = cast; - return *this; - } - + TSelf& SetDoNotCastEmptyStrings(bool cast) { + DoNotCastEmptyStrings = cast; + return *this; + } + TSelf& SetCastRobust(bool cast) { CastRobust = cast; return *this; @@ -115,9 +115,9 @@ namespace NProtobufJson { /// Cast string json values to protobuf field type bool CastFromString = false; - /// Skip empty strings, instead casting from string into scalar types. - /// I.e. empty string like default value for scalar types. - bool DoNotCastEmptyStrings = false; + /// Skip empty strings, instead casting from string into scalar types. + /// I.e. empty string like default value for scalar types. + bool DoNotCastEmptyStrings = false; /// Cast all json values to protobuf field types bool CastRobust = false; @@ -138,9 +138,9 @@ namespace NProtobufJson { /// Append scalars to repeated fields bool VectorizeScalars = false; - - /// Custom spliter non array value to repeated fields. - TValueVectorizer ValueVectorizer; + + /// Custom spliter non array value to repeated fields. + TValueVectorizer ValueVectorizer; /// Allow js-style comments (both // and /**/) bool AllowComments = false; diff --git a/library/cpp/protobuf/json/proto2json_printer.cpp b/library/cpp/protobuf/json/proto2json_printer.cpp index 6123eab0f2..69a0aa25f6 100644 --- a/library/cpp/protobuf/json/proto2json_printer.cpp +++ b/library/cpp/protobuf/json/proto2json_printer.cpp @@ -205,18 +205,18 @@ namespace NProtobufJson { const Reflection* reflection = proto.GetReflection(); - bool shouldPrintField = reflection->HasField(proto, &field); - if (!shouldPrintField && GetConfig().MissingSingleKeyMode == TProto2JsonConfig::MissingKeyExplicitDefaultThrowRequired) { - if (field.has_default_value()) { - shouldPrintField = true; - } else if (field.is_required()) { - ythrow yexception() << "Empty required protobuf field: " - << field.full_name() << "."; - } - } - shouldPrintField = shouldPrintField || GetConfig().MissingSingleKeyMode == TProto2JsonConfig::MissingKeyDefault; - - if (shouldPrintField) { + bool shouldPrintField = reflection->HasField(proto, &field); + if (!shouldPrintField && GetConfig().MissingSingleKeyMode == TProto2JsonConfig::MissingKeyExplicitDefaultThrowRequired) { + if (field.has_default_value()) { + shouldPrintField = true; + } else if (field.is_required()) { + ythrow yexception() << "Empty required protobuf field: " + << field.full_name() << "."; + } + } + shouldPrintField = shouldPrintField || GetConfig().MissingSingleKeyMode == TProto2JsonConfig::MissingKeyDefault; + + if (shouldPrintField) { switch (field.cpp_type()) { INT_FIELD_TO_JSON(CPPTYPE_INT32, GetInt32); INT_FIELD_TO_JSON(CPPTYPE_INT64, GetInt64); @@ -256,7 +256,7 @@ namespace NProtobufJson { } case TProto2JsonConfig::MissingKeySkip: - case TProto2JsonConfig::MissingKeyExplicitDefaultThrowRequired: + case TProto2JsonConfig::MissingKeyExplicitDefaultThrowRequired: default: break; } @@ -358,7 +358,7 @@ namespace NProtobufJson { } case TProto2JsonConfig::MissingKeySkip: - case TProto2JsonConfig::MissingKeyExplicitDefaultThrowRequired: + case TProto2JsonConfig::MissingKeyExplicitDefaultThrowRequired: default: break; } diff --git a/library/cpp/protobuf/json/ut/json2proto_ut.cpp b/library/cpp/protobuf/json/ut/json2proto_ut.cpp index 0dfe57bc7a..081072971c 100644 --- a/library/cpp/protobuf/json/ut/json2proto_ut.cpp +++ b/library/cpp/protobuf/json/ut/json2proto_ut.cpp @@ -690,57 +690,57 @@ Y_UNIT_TEST(TestVectorizeScalars) { #undef DEFINE_FIELD } -Y_UNIT_TEST(TestValueVectorizer) { - { - // No ValueVectorizer - NJson::TJsonValue json; - json["RepeatedString"] = "123"; - TJson2ProtoConfig config; - TSingleRepeatedString expected; - UNIT_ASSERT_EXCEPTION(Json2Proto(json, expected, config), yexception); - } - { - // ValueVectorizer replace original value by array - NJson::TJsonValue json; - json["RepeatedString"] = "123"; - TJson2ProtoConfig config; - - TSingleRepeatedString expected; - expected.AddRepeatedString("4"); - expected.AddRepeatedString("5"); - expected.AddRepeatedString("6"); - - config.ValueVectorizer = [](const NJson::TJsonValue& val) -> NJson::TJsonValue::TArray { - Y_UNUSED(val); - return {NJson::TJsonValue("4"), NJson::TJsonValue("5"), NJson::TJsonValue("6")}; - }; - TSingleRepeatedString actual; - Json2Proto(json, actual, config); - UNIT_ASSERT_PROTOS_EQUAL(expected, actual); - } - { - // ValueVectorizer replace original value by array and cast - NJson::TJsonValue json; - json["RepeatedInt"] = 123; - TJson2ProtoConfig config; - - TSingleRepeatedInt expected; - expected.AddRepeatedInt(4); - expected.AddRepeatedInt(5); - expected.AddRepeatedInt(6); - - config.ValueVectorizer = [](const NJson::TJsonValue& val) -> NJson::TJsonValue::TArray { - Y_UNUSED(val); - return {NJson::TJsonValue("4"), NJson::TJsonValue(5), NJson::TJsonValue("6")}; - }; - config.CastFromString = true; - - TSingleRepeatedInt actual; - Json2Proto(json, actual, config); - UNIT_ASSERT_PROTOS_EQUAL(expected, actual); - } -} - +Y_UNIT_TEST(TestValueVectorizer) { + { + // No ValueVectorizer + NJson::TJsonValue json; + json["RepeatedString"] = "123"; + TJson2ProtoConfig config; + TSingleRepeatedString expected; + UNIT_ASSERT_EXCEPTION(Json2Proto(json, expected, config), yexception); + } + { + // ValueVectorizer replace original value by array + NJson::TJsonValue json; + json["RepeatedString"] = "123"; + TJson2ProtoConfig config; + + TSingleRepeatedString expected; + expected.AddRepeatedString("4"); + expected.AddRepeatedString("5"); + expected.AddRepeatedString("6"); + + config.ValueVectorizer = [](const NJson::TJsonValue& val) -> NJson::TJsonValue::TArray { + Y_UNUSED(val); + return {NJson::TJsonValue("4"), NJson::TJsonValue("5"), NJson::TJsonValue("6")}; + }; + TSingleRepeatedString actual; + Json2Proto(json, actual, config); + UNIT_ASSERT_PROTOS_EQUAL(expected, actual); + } + { + // ValueVectorizer replace original value by array and cast + NJson::TJsonValue json; + json["RepeatedInt"] = 123; + TJson2ProtoConfig config; + + TSingleRepeatedInt expected; + expected.AddRepeatedInt(4); + expected.AddRepeatedInt(5); + expected.AddRepeatedInt(6); + + config.ValueVectorizer = [](const NJson::TJsonValue& val) -> NJson::TJsonValue::TArray { + Y_UNUSED(val); + return {NJson::TJsonValue("4"), NJson::TJsonValue(5), NJson::TJsonValue("6")}; + }; + config.CastFromString = true; + + TSingleRepeatedInt actual; + Json2Proto(json, actual, config); + UNIT_ASSERT_PROTOS_EQUAL(expected, actual); + } +} + Y_UNIT_TEST(TestMapAsObject) { TMapType modelProto; @@ -1103,23 +1103,23 @@ Y_UNIT_TEST(TestMergeRepeatedAppend) { UNIT_ASSERT_PROTOS_EQUAL(proto, modelProto); } // TestMergeRepeatedAppend -Y_UNIT_TEST(TestEmptyStringForCastFromString) { - NJson::TJsonValue json; - json["I32"] = ""; - json["Bool"] = ""; - json["OneString"] = ""; - - TJson2ProtoConfig config; - config.SetCastFromString(true); - config.SetDoNotCastEmptyStrings(true); - TFlatOptional proto; - UNIT_ASSERT_NO_EXCEPTION(Json2Proto(json, proto, config)); - UNIT_ASSERT(!proto.HasBool()); - UNIT_ASSERT(!proto.HasI32()); - UNIT_ASSERT(proto.HasOneString()); - UNIT_ASSERT_EQUAL("", proto.GetOneString()); -} // TestEmptyStringForCastFromString - +Y_UNIT_TEST(TestEmptyStringForCastFromString) { + NJson::TJsonValue json; + json["I32"] = ""; + json["Bool"] = ""; + json["OneString"] = ""; + + TJson2ProtoConfig config; + config.SetCastFromString(true); + config.SetDoNotCastEmptyStrings(true); + TFlatOptional proto; + UNIT_ASSERT_NO_EXCEPTION(Json2Proto(json, proto, config)); + UNIT_ASSERT(!proto.HasBool()); + UNIT_ASSERT(!proto.HasI32()); + UNIT_ASSERT(proto.HasOneString()); + UNIT_ASSERT_EQUAL("", proto.GetOneString()); +} // TestEmptyStringForCastFromString + Y_UNIT_TEST(TestAllowComments) { constexpr TStringBuf json = R"( { diff --git a/library/cpp/protobuf/json/ut/proto2json_ut.cpp b/library/cpp/protobuf/json/ut/proto2json_ut.cpp index 07e52d7f2f..6ae7960bb1 100644 --- a/library/cpp/protobuf/json/ut/proto2json_ut.cpp +++ b/library/cpp/protobuf/json/ut/proto2json_ut.cpp @@ -465,52 +465,52 @@ Y_UNIT_TEST(TestMissingSingleKeyConfig) { UNIT_ASSERT_NO_EXCEPTION(Proto2Json(proto, json, config)); UNIT_ASSERT_JSONS_EQUAL(json, modelJson); } - { - // Test MissingKeyExplicitDefaultThrowRequired for non explicit default values. - TFlatOptional proto; - NJson::TJsonValue modelJson(NJson::JSON_MAP); - NJson::TJsonValue json; - TProto2JsonConfig config; - config.MissingSingleKeyMode = TProto2JsonConfig::MissingKeyExplicitDefaultThrowRequired; - - UNIT_ASSERT_NO_EXCEPTION(Proto2Json(proto, json, config)); - UNIT_ASSERT_JSONS_EQUAL(json, modelJson); - } - { - // Test MissingKeyExplicitDefaultThrowRequired for explicit default values. - NJson::TJsonValue modelJson; - modelJson["String"] = "value"; - - TSingleDefaultString proto; - NJson::TJsonValue json; - TProto2JsonConfig config; - config.MissingSingleKeyMode = TProto2JsonConfig::MissingKeyExplicitDefaultThrowRequired; - UNIT_ASSERT_NO_EXCEPTION(Proto2Json(proto, json, config)); - UNIT_ASSERT_JSONS_EQUAL(json, modelJson); - } - { - // Test MissingKeyExplicitDefaultThrowRequired for empty required values. - TFlatRequired proto; - NJson::TJsonValue json; - TProto2JsonConfig config; - config.MissingSingleKeyMode = TProto2JsonConfig::MissingKeyExplicitDefaultThrowRequired; - UNIT_ASSERT_EXCEPTION_CONTAINS(Proto2Json(proto, json, config), yexception, "Empty required protobuf field"); - } - { - // Test MissingKeyExplicitDefaultThrowRequired for required value. - TSingleRequiredString proto; - NJson::TJsonValue json; - TProto2JsonConfig config; - config.MissingSingleKeyMode = TProto2JsonConfig::MissingKeyExplicitDefaultThrowRequired; - - UNIT_ASSERT_EXCEPTION_CONTAINS(Proto2Json(proto, json, config), yexception, "Empty required protobuf field"); - - NJson::TJsonValue modelJson; - modelJson["String"] = "value"; - proto.SetString("value"); - UNIT_ASSERT_NO_EXCEPTION(Proto2Json(proto, json, config)); - UNIT_ASSERT_JSONS_EQUAL(json, modelJson); - } + { + // Test MissingKeyExplicitDefaultThrowRequired for non explicit default values. + TFlatOptional proto; + NJson::TJsonValue modelJson(NJson::JSON_MAP); + NJson::TJsonValue json; + TProto2JsonConfig config; + config.MissingSingleKeyMode = TProto2JsonConfig::MissingKeyExplicitDefaultThrowRequired; + + UNIT_ASSERT_NO_EXCEPTION(Proto2Json(proto, json, config)); + UNIT_ASSERT_JSONS_EQUAL(json, modelJson); + } + { + // Test MissingKeyExplicitDefaultThrowRequired for explicit default values. + NJson::TJsonValue modelJson; + modelJson["String"] = "value"; + + TSingleDefaultString proto; + NJson::TJsonValue json; + TProto2JsonConfig config; + config.MissingSingleKeyMode = TProto2JsonConfig::MissingKeyExplicitDefaultThrowRequired; + UNIT_ASSERT_NO_EXCEPTION(Proto2Json(proto, json, config)); + UNIT_ASSERT_JSONS_EQUAL(json, modelJson); + } + { + // Test MissingKeyExplicitDefaultThrowRequired for empty required values. + TFlatRequired proto; + NJson::TJsonValue json; + TProto2JsonConfig config; + config.MissingSingleKeyMode = TProto2JsonConfig::MissingKeyExplicitDefaultThrowRequired; + UNIT_ASSERT_EXCEPTION_CONTAINS(Proto2Json(proto, json, config), yexception, "Empty required protobuf field"); + } + { + // Test MissingKeyExplicitDefaultThrowRequired for required value. + TSingleRequiredString proto; + NJson::TJsonValue json; + TProto2JsonConfig config; + config.MissingSingleKeyMode = TProto2JsonConfig::MissingKeyExplicitDefaultThrowRequired; + + UNIT_ASSERT_EXCEPTION_CONTAINS(Proto2Json(proto, json, config), yexception, "Empty required protobuf field"); + + NJson::TJsonValue modelJson; + modelJson["String"] = "value"; + proto.SetString("value"); + UNIT_ASSERT_NO_EXCEPTION(Proto2Json(proto, json, config)); + UNIT_ASSERT_JSONS_EQUAL(json, modelJson); + } } // TestMissingSingleKeyConfig Y_UNIT_TEST(TestMissingRepeatedKeyNoConfig) { @@ -551,17 +551,17 @@ Y_UNIT_TEST(TestMissingRepeatedKeyConfig) { UNIT_ASSERT_NO_EXCEPTION(Proto2Json(proto, json, config)); UNIT_ASSERT_JSONS_EQUAL(json, modelJson); } - { - TFlatRepeated proto; - NJson::TJsonValue modelJson(NJson::JSON_MAP); - NJson::TJsonValue json; - TProto2JsonConfig config; - config.MissingRepeatedKeyMode = TProto2JsonConfig::MissingKeyExplicitDefaultThrowRequired; - - // SHould be same as MissingKeySkip - UNIT_ASSERT_NO_EXCEPTION(Proto2Json(proto, json, config)); - UNIT_ASSERT_JSONS_EQUAL(json, modelJson); - } + { + TFlatRepeated proto; + NJson::TJsonValue modelJson(NJson::JSON_MAP); + NJson::TJsonValue json; + TProto2JsonConfig config; + config.MissingRepeatedKeyMode = TProto2JsonConfig::MissingKeyExplicitDefaultThrowRequired; + + // SHould be same as MissingKeySkip + UNIT_ASSERT_NO_EXCEPTION(Proto2Json(proto, json, config)); + UNIT_ASSERT_JSONS_EQUAL(json, modelJson); + } } // TestMissingRepeatedKeyConfig Y_UNIT_TEST(TestEscaping) { diff --git a/library/cpp/protobuf/json/ut/test.proto b/library/cpp/protobuf/json/ut/test.proto index 0fa996fd41..8cf7bf8a5d 100644 --- a/library/cpp/protobuf/json/ut/test.proto +++ b/library/cpp/protobuf/json/ut/test.proto @@ -177,22 +177,22 @@ message TWithJsonName { optional int32 Def_upper = 3; // json_name = "DefUpper" optional int32 def_lower = 4; // json_name = "defLower" } - -message TSingleRequiredString { - required string String = 1; -} - -message TSingleDefaultString { - optional string String = 1 [default = "value"]; -} - -message TSingleRepeatedString { - repeated string RepeatedString = 1; -} - -message TSingleRepeatedInt { - repeated int32 RepeatedInt = 1; -} + +message TSingleRequiredString { + required string String = 1; +} + +message TSingleDefaultString { + optional string String = 1 [default = "value"]; +} + +message TSingleRepeatedString { + repeated string RepeatedString = 1; +} + +message TSingleRepeatedInt { + repeated int32 RepeatedInt = 1; +} message TExtensionField { extensions 100 to 199; diff --git a/util/charset/utf8.cpp b/util/charset/utf8.cpp index efe3a52f61..21ed1adcc6 100644 --- a/util/charset/utf8.cpp +++ b/util/charset/utf8.cpp @@ -1,87 +1,87 @@ #include "unidata.h" #include "utf8.h" -namespace { - enum class ECaseConversion { - ToUpper, - ToLower, - }; - - wchar32 ConvertChar(ECaseConversion conversion, wchar32 ch) { - switch (conversion) { - case ECaseConversion::ToUpper: - return ToUpper(ch); - case ECaseConversion::ToLower: - return ToLower(ch); - } - Y_ASSERT(false); // NOTREACHED - return 0; - } - - bool ConvertCaseUTF8Impl(ECaseConversion conversion, const char* beg, size_t n, - TString& newString) { - const unsigned char* p = (const unsigned char*)beg; - const unsigned char* const end = p + n; - - // first loop searches for the first character, which is changed by ConvertChar - // if there is no changed character, we don't need reallocation/copy - wchar32 cNew = 0; - size_t cLen = 0; - while (p < end) { - wchar32 c; - if (RECODE_OK != SafeReadUTF8Char(c, cLen, p, end)) { - ythrow yexception() - << "failed to decode UTF-8 string at pos " << ((const char*)p - beg); - } - cNew = ConvertChar(conversion, c); - - if (cNew != c) - break; - p += cLen; - } - if (p == end) { - return false; - } - - // some character changed after ToLower. Write new string to newString. - newString.resize(n); - - size_t written = (char*)p - beg; - char* writePtr = newString.begin(); - memcpy(writePtr, beg, written); - writePtr += written; - size_t destSpace = n - written; - - // before each iteration (including the first one) variable 'cNew' contains unwritten symbol - while (true) { - size_t cNewLen; +namespace { + enum class ECaseConversion { + ToUpper, + ToLower, + }; + + wchar32 ConvertChar(ECaseConversion conversion, wchar32 ch) { + switch (conversion) { + case ECaseConversion::ToUpper: + return ToUpper(ch); + case ECaseConversion::ToLower: + return ToLower(ch); + } + Y_ASSERT(false); // NOTREACHED + return 0; + } + + bool ConvertCaseUTF8Impl(ECaseConversion conversion, const char* beg, size_t n, + TString& newString) { + const unsigned char* p = (const unsigned char*)beg; + const unsigned char* const end = p + n; + + // first loop searches for the first character, which is changed by ConvertChar + // if there is no changed character, we don't need reallocation/copy + wchar32 cNew = 0; + size_t cLen = 0; + while (p < end) { + wchar32 c; + if (RECODE_OK != SafeReadUTF8Char(c, cLen, p, end)) { + ythrow yexception() + << "failed to decode UTF-8 string at pos " << ((const char*)p - beg); + } + cNew = ConvertChar(conversion, c); + + if (cNew != c) + break; + p += cLen; + } + if (p == end) { + return false; + } + + // some character changed after ToLower. Write new string to newString. + newString.resize(n); + + size_t written = (char*)p - beg; + char* writePtr = newString.begin(); + memcpy(writePtr, beg, written); + writePtr += written; + size_t destSpace = n - written; + + // before each iteration (including the first one) variable 'cNew' contains unwritten symbol + while (true) { + size_t cNewLen; Y_ASSERT((writePtr - newString.data()) + destSpace == newString.size()); - if (RECODE_EOOUTPUT == - SafeWriteUTF8Char(cNew, cNewLen, (unsigned char*)writePtr, destSpace)) { + if (RECODE_EOOUTPUT == + SafeWriteUTF8Char(cNew, cNewLen, (unsigned char*)writePtr, destSpace)) { destSpace += newString.size(); newString.resize(newString.size() * 2); writePtr = newString.begin() + (newString.size() - destSpace); - continue; - } - destSpace -= cNewLen; - writePtr += cNewLen; - p += cLen; - if (p == end) { + continue; + } + destSpace -= cNewLen; + writePtr += cNewLen; + p += cLen; + if (p == end) { newString.resize(newString.size() - destSpace); - return true; - } - wchar32 c = 0; - if (RECODE_OK != SafeReadUTF8Char(c, cLen, p, end)) { - ythrow yexception() - << "failed to decode UTF-8 string at pos " << ((const char*)p - beg); - } - cNew = ConvertChar(conversion, c); - } - Y_ASSERT(false); - return false; - } -} // namespace - + return true; + } + wchar32 c = 0; + if (RECODE_OK != SafeReadUTF8Char(c, cLen, p, end)) { + ythrow yexception() + << "failed to decode UTF-8 string at pos " << ((const char*)p - beg); + } + cNew = ConvertChar(conversion, c); + } + Y_ASSERT(false); + return false; + } +} // namespace + extern const wchar32 BROKEN_RUNE = 0xFFFD; static const char* SkipUTF8Chars(const char* begin, const char* end, size_t numChars) { @@ -130,7 +130,7 @@ EUTF8Detect UTF8Detect(const char* s, size_t len) { } bool ToLowerUTF8Impl(const char* beg, size_t n, TString& newString) { - return ConvertCaseUTF8Impl(ECaseConversion::ToLower, beg, n, newString); + return ConvertCaseUTF8Impl(ECaseConversion::ToLower, beg, n, newString); } TString ToLowerUTF8(const TString& s) { @@ -148,23 +148,23 @@ TString ToLowerUTF8(TStringBuf s) { TString ToLowerUTF8(const char* s) { return ToLowerUTF8(TStringBuf(s)); } - -bool ToUpperUTF8Impl(const char* beg, size_t n, TString& newString) { - return ConvertCaseUTF8Impl(ECaseConversion::ToUpper, beg, n, newString); -} - -TString ToUpperUTF8(const TString& s) { - TString newString; + +bool ToUpperUTF8Impl(const char* beg, size_t n, TString& newString) { + return ConvertCaseUTF8Impl(ECaseConversion::ToUpper, beg, n, newString); +} + +TString ToUpperUTF8(const TString& s) { + TString newString; bool changed = ToUpperUTF8Impl(s.data(), s.size(), newString); - return changed ? newString : s; -} - -TString ToUpperUTF8(TStringBuf s) { - TString newString; + return changed ? newString : s; +} + +TString ToUpperUTF8(TStringBuf s) { + TString newString; bool changed = ToUpperUTF8Impl(s.data(), s.size(), newString); return changed ? newString : TString(s.data(), s.size()); -} - -TString ToUpperUTF8(const char* s) { - return ToUpperUTF8(TStringBuf(s)); -} +} + +TString ToUpperUTF8(const char* s) { + return ToUpperUTF8(TStringBuf(s)); +} diff --git a/util/charset/utf8.h b/util/charset/utf8.h index 5039b46ae9..5250bbeab2 100644 --- a/util/charset/utf8.h +++ b/util/charset/utf8.h @@ -374,15 +374,15 @@ bool ToLowerUTF8Impl(const char* beg, size_t n, TString& newString); TString ToLowerUTF8(const TString& s); TString ToLowerUTF8(TStringBuf s); TString ToLowerUTF8(const char* s); - + inline TString ToLowerUTF8(const std::string& s) { return ToLowerUTF8(TStringBuf(s)); } -//! returns true, if result is not the same as input, and put it in newString -//! returns false, if result is unmodified -bool ToUpperUTF8Impl(const char* beg, size_t n, TString& newString); - -TString ToUpperUTF8(const TString& s); -TString ToUpperUTF8(TStringBuf s); -TString ToUpperUTF8(const char* s); +//! returns true, if result is not the same as input, and put it in newString +//! returns false, if result is unmodified +bool ToUpperUTF8Impl(const char* beg, size_t n, TString& newString); + +TString ToUpperUTF8(const TString& s); +TString ToUpperUTF8(TStringBuf s); +TString ToUpperUTF8(const char* s); diff --git a/util/charset/utf8_ut.cpp b/util/charset/utf8_ut.cpp index 9e68881cca..8cbb844dc7 100644 --- a/util/charset/utf8_ut.cpp +++ b/util/charset/utf8_ut.cpp @@ -52,46 +52,46 @@ Y_UNIT_TEST_SUITE(TUtfUtilTest) { } } - Y_UNIT_TEST(TestToUpperUtfString) { - UNIT_ASSERT_VALUES_EQUAL(ToUpperUTF8("xyz XYZ привет!"), "XYZ XYZ ПРИВЕТ!"); - + Y_UNIT_TEST(TestToUpperUtfString) { + UNIT_ASSERT_VALUES_EQUAL(ToUpperUTF8("xyz XYZ привет!"), "XYZ XYZ ПРИВЕТ!"); + UNIT_ASSERT_VALUES_EQUAL(ToUpperUTF8(TStringBuf("XYZ")), "XYZ"); - - { - TString s = "ПРИВЕТ!"; - TString q = "привет!"; - TString tmp; + + { + TString s = "ПРИВЕТ!"; + TString q = "привет!"; + TString tmp; UNIT_ASSERT(ToUpperUTF8Impl(s.data(), s.size(), tmp) == false); UNIT_ASSERT(ToUpperUTF8Impl(q.data(), q.size(), tmp) == true); - } - - { - const char* weird = "\xC8\xBE"; // 'Ⱦ', U+023E. strlen(weird)==2, strlen(ToUpper_utf8(weird)) is 3 - const char* turkI = "İ"; //strlen("İ") == 2, strlen(ToUpper_utf8("İ") == 1 - TStringBuf chars[] = {"F", "f", "б", "Б", turkI, weird}; - const int N = Y_ARRAY_SIZE(chars); - //try all combinations of these letters. - int numberOfVariants = 1; - for (int len = 0; len <= 4; ++len) { - for (int i = 0; i < numberOfVariants; ++i) { - TString s; - int k = i; - for (int j = 0; j < len; ++j) { - //Treat 'i' like number in base-N system with digits from 'chars'-array - s += chars[k % N]; - k /= N; - } - - TUtf16String tmp = UTF8ToWide(s); - tmp.to_upper(); - - UNIT_ASSERT_VALUES_EQUAL(ToUpperUTF8(s), WideToUTF8(tmp)); - } - numberOfVariants *= N; - } - } - } - + } + + { + const char* weird = "\xC8\xBE"; // 'Ⱦ', U+023E. strlen(weird)==2, strlen(ToUpper_utf8(weird)) is 3 + const char* turkI = "İ"; //strlen("İ") == 2, strlen(ToUpper_utf8("İ") == 1 + TStringBuf chars[] = {"F", "f", "б", "Б", turkI, weird}; + const int N = Y_ARRAY_SIZE(chars); + //try all combinations of these letters. + int numberOfVariants = 1; + for (int len = 0; len <= 4; ++len) { + for (int i = 0; i < numberOfVariants; ++i) { + TString s; + int k = i; + for (int j = 0; j < len; ++j) { + //Treat 'i' like number in base-N system with digits from 'chars'-array + s += chars[k % N]; + k /= N; + } + + TUtf16String tmp = UTF8ToWide(s); + tmp.to_upper(); + + UNIT_ASSERT_VALUES_EQUAL(ToUpperUTF8(s), WideToUTF8(tmp)); + } + numberOfVariants *= N; + } + } + } + Y_UNIT_TEST(TestUTF8ToWide) { TFileInput in(ArcadiaSourceRoot() + TStringBuf("/util/charset/ut/utf8/test1.txt")); |