diff options
author | romankoshelev <romankoshelev@yandex-team.com> | 2024-05-13 11:00:27 +0300 |
---|---|---|
committer | romankoshelev <romankoshelev@yandex-team.com> | 2024-05-13 11:13:05 +0300 |
commit | 5b22fadb0f035a3b82c328e0ae710ad2b92f6eac (patch) | |
tree | e15dc649c79c4fb78f35cd6694dfe9af9bfcc0ad /contrib/libs/icu/include/unicode/uspoof.h | |
parent | 5946aa7d3cbca62f6bcf074e8a2b9346e7a96af4 (diff) | |
download | ydb-5b22fadb0f035a3b82c328e0ae710ad2b92f6eac.tar.gz |
Update ICU to 75.1
904da4ae1c86fc5542eac7f1cd18d97b72eb8517
Diffstat (limited to 'contrib/libs/icu/include/unicode/uspoof.h')
-rw-r--r-- | contrib/libs/icu/include/unicode/uspoof.h | 307 |
1 files changed, 297 insertions, 10 deletions
diff --git a/contrib/libs/icu/include/unicode/uspoof.h b/contrib/libs/icu/include/unicode/uspoof.h index 442655bb54..20d29d62b2 100644 --- a/contrib/libs/icu/include/unicode/uspoof.h +++ b/contrib/libs/icu/include/unicode/uspoof.h @@ -19,6 +19,7 @@ #ifndef USPOOF_H #define USPOOF_H +#include "unicode/ubidi.h" #include "unicode/utypes.h" #include "unicode/uset.h" #include "unicode/parseerr.h" @@ -83,6 +84,25 @@ * the instance should be created once (e.g., upon application startup), and the efficient * {@link uspoof_areConfusable} method can be used at runtime. * + * If the paragraph direction used to display the strings is known, the bidi function should be used instead: + * + * \code{.c} + * UErrorCode status = U_ZERO_ERROR; + * // These strings look identical when rendered in a left-to-right context. + * // They look distinct in a right-to-left context. + * UChar* str1 = (UChar*) u"A1\u05D0"; // A1א + * UChar* str2 = (UChar*) u"A\u05D01"; // Aא1 + * + * USpoofChecker* sc = uspoof_open(&status); + * uspoof_setChecks(sc, USPOOF_CONFUSABLE, &status); + * + * int32_t bitmask = uspoof_areBidiConfusable(sc, UBIDI_LTR, str1, -1, str2, -1, &status); + * UBool result = bitmask != 0; + * // areBidiConfusable: 1 (status: U_ZERO_ERROR) + * printf("areBidiConfusable: %d (status: %s)\n", result, u_errorName(status)); + * uspoof_close(sc); + * \endcode + * * <p> * The type {@link LocalUSpoofCheckerPointer} is exposed for C++ programmers. It will automatically call * {@link uspoof_close} when the object goes out of scope: @@ -339,6 +359,51 @@ * COMMON or INHERITED, such as numbers and punctuation, are ignored when computing whether a string has multiple * scripts. * + * <h2>Advanced bidirectional usage</h2> + * If the paragraph direction with which the identifiers will be displayed is not known, there are + * multiple options for confusable detection depending on the circumstances. 
+ * + * <p> + * In some circumstances, the only concern is confusion between identifiers displayed with the same + * paragraph direction. + * + * <p> + * An example is the case where identifiers are usernames prefixed with the @ symbol. + * That symbol will appear to the left in a left-to-right context, and to the right in a + * right-to-left context, so that an identifier displayed in a left-to-right context can never be + * confused with an identifier displayed in a right-to-left context: + * <ul> + * <li> + * The usernames "A1א" (A one aleph) and "Aא1" (A aleph 1) + * would be considered confusable, since they both appear as \@A1א in a left-to-right context, and the + * usernames "אA_1" (aleph A underscore one) and "א1_A" (aleph one underscore A) would be considered + * confusable, since they both appear as A_1א@ in a right-to-left context. + * </li> + * <li> + * The username "Mark_" would not be considered confusable with the username "_Mark", + * even though the latter would appear as Mark_@ in a right-to-left context, and the + * former as \@Mark_ in a left-to-right context. + * </li> + * </ul> + * <p> + * In that case, the caller should check for both LTR-confusability and RTL-confusability: + * + * \code{.cpp} + * bool confusableInEitherDirection = + * uspoof_areBidiConfusableUnicodeString(sc, UBIDI_LTR, id1, id2, &status) || + * uspoof_areBidiConfusableUnicodeString(sc, UBIDI_RTL, id1, id2, &status); + * \endcode + * + * If the bidiSkeleton is used, the LTR and RTL skeleta should be kept separately and compared, LTR + * with LTR and RTL with RTL. + * + * <p> + * In cases where confusability between the visual appearances of an identifier displayed in a + * left-to-right context with another identifier displayed in a right-to-left context is a concern, + * the LTR skeleton of one can be compared with the RTL skeleton of the other. 
However, this + * very broad definition of confusability may have unexpected results; for instance, it treats the + * ASCII identifiers "Mark_" and "_Mark" as confusable. + * * <h2>Additional Information</h2> * * A <code>USpoofChecker</code> instance may be used repeatedly to perform checks on any number of identifiers. @@ -519,7 +584,7 @@ typedef enum USpoofChecks { /** - * Constants from UAX #39 for use in {@link uspoof_setRestrictionLevel}, and + * Constants from UTS #39 for use in {@link uspoof_setRestrictionLevel}, and * for returned identifier restriction levels in check results. * * @stable ICU 51 @@ -633,8 +698,8 @@ uspoof_openFromSerialized(const void *data, int32_t length, int32_t *pActualLeng /** * Open a Spoof Checker from the source form of the spoof data. * The input corresponds to the Unicode data file confusables.txt - * as described in Unicode UAX #39. The syntax of the source data - * is as described in UAX #39 for this file, and the content of + * as described in Unicode Technical Standard #39. The syntax of the source data + * is as described in UTS #39 for this file, and the content of * this file is acceptable input. * * The character encoding of the (char *) input text is UTF-8. @@ -1111,7 +1176,7 @@ uspoof_getCheckResultNumerics(const USpoofCheckResult *checkResult, UErrorCode * /** - * Check the whether two specified strings are visually confusable. + * Check whether two specified strings are visually confusable. * * If the strings are confusable, the return value will be nonzero, as long as * {@link USPOOF_CONFUSABLE} was enabled in uspoof_setChecks(). @@ -1159,7 +1224,58 @@ uspoof_areConfusable(const USpoofChecker *sc, const UChar *id2, int32_t length2, UErrorCode *status); - +#ifndef U_HIDE_DRAFT_API +/** + * Check whether two specified strings are visually confusable when + * displayed in a context with the given paragraph direction. 
+ * + * If the strings are confusable, the return value will be nonzero, as long as + * {@link USPOOF_CONFUSABLE} was enabled in uspoof_setChecks(). + * + * The bits in the return value correspond to flags for each of the classes of + * confusables applicable to the two input strings. According to UTS 39 + * section 4, the possible flags are: + * + * <ul> + * <li>{@link USPOOF_SINGLE_SCRIPT_CONFUSABLE}</li> + * <li>{@link USPOOF_MIXED_SCRIPT_CONFUSABLE}</li> + * <li>{@link USPOOF_WHOLE_SCRIPT_CONFUSABLE}</li> + * </ul> + * + * If one or more of the above flags were not listed in uspoof_setChecks(), this + * function will never report that class of confusable. The check + * {@link USPOOF_CONFUSABLE} enables all three flags. + * + * + * @param sc The USpoofChecker + * @param direction The paragraph direction with which the identifiers are + * displayed. Must be either UBIDI_LTR or UBIDI_RTL. + * @param id1 The first of the two identifiers to be compared for + * confusability. The strings are in UTF-16 format. + * @param length1 the length of the first identifier, expressed in + * 16 bit UTF-16 code units, or -1 if the string is + * nul terminated. + * @param id2 The second of the two identifiers to be compared for + * confusability. The identifiers are in UTF-16 format. + * @param length2 The length of the second identifier, expressed in + * 16 bit UTF-16 code units, or -1 if the string is + * nul terminated. + * @param status The error code, set if an error occurred while attempting to + * perform the check. + * Confusability of the identifiers is not reported here, + * but through this function's return value. + * @return An integer value with bit(s) set corresponding to + * the type of confusability found, as defined by + * enum USpoofChecks. Zero is returned if the identifiers + * are not confusable. 
+ * + * @draft ICU 74 + */ +U_CAPI uint32_t U_EXPORT2 uspoof_areBidiConfusable(const USpoofChecker *sc, UBiDiDirection direction, + const UChar *id1, int32_t length1, + const UChar *id2, int32_t length2, + UErrorCode *status); +#endif /* U_HIDE_DRAFT_API */ /** * A version of {@link uspoof_areConfusable} accepting strings in UTF-8 format. @@ -1192,14 +1308,45 @@ uspoof_areConfusableUTF8(const USpoofChecker *sc, const char *id2, int32_t length2, UErrorCode *status); - - +#ifndef U_HIDE_DRAFT_API +/** + * A version of {@link uspoof_areBidiConfusable} accepting strings in UTF-8 format. + * + * @param sc The USpoofChecker + * @param direction The paragraph direction with which the identifiers are + * displayed. Must be either UBIDI_LTR or UBIDI_RTL. + * @param id1 The first of the two identifiers to be compared for + * confusability. The strings are in UTF-8 format. + * @param length1 the length of the first identifier, in bytes, or -1 + * if the string is nul terminated. + * @param id2 The second of the two identifiers to be compared for + * confusability. The strings are in UTF-8 format. + * @param length2 The length of the second string in bytes, or -1 + * if the string is nul terminated. + * @param status The error code, set if an error occurred while attempting to + * perform the check. + * Confusability of the strings is not reported here, + * but through this function's return value. + * @return An integer value with bit(s) set corresponding to + * the type of confusability found, as defined by + * enum USpoofChecks. Zero is returned if the strings + * are not confusable. + * + * @draft ICU 74 + * + * @see uspoof_areBidiConfusable + */ +U_CAPI uint32_t U_EXPORT2 uspoof_areBidiConfusableUTF8(const USpoofChecker *sc, UBiDiDirection direction, + const char *id1, int32_t length1, + const char *id2, int32_t length2, + UErrorCode *status); +#endif /* U_HIDE_DRAFT_API */ /** * Get the "skeleton" for an identifier. 
* Skeletons are a transformation of the input identifier; * Two identifiers are confusable if their skeletons are identical. - * See Unicode UAX #39 for additional information. + * See Unicode Technical Standard #39 for additional information. * * Using skeletons directly makes it possible to quickly check * whether an identifier is confusable with any of some large @@ -1233,11 +1380,50 @@ uspoof_getSkeleton(const USpoofChecker *sc, UChar *dest, int32_t destCapacity, UErrorCode *status); +#ifndef U_HIDE_DRAFT_API +/** + * Get the "bidiSkeleton" for an identifier and a direction. + * Skeletons are a transformation of the input identifier; + * Two identifiers are LTR-confusable if their LTR bidiSkeletons are identical; + * they are RTL-confusable if their RTL bidiSkeletons are identical. + * See Unicode Technical Standard #39 for additional information: + * https://www.unicode.org/reports/tr39/#Confusable_Detection. + * + * Using skeletons directly makes it possible to quickly check + * whether an identifier is confusable with any of some large + * set of existing identifiers, by creating an efficiently + * searchable collection of the skeletons. + * + * @param sc The USpoofChecker. + * @param direction The context direction with which the identifier will be + * displayed. Must be either UBIDI_LTR or UBIDI_RTL. + * @param id The input identifier whose skeleton will be computed. + * @param length The length of the input identifier, expressed in 16 bit + * UTF-16 code units, or -1 if the string is zero terminated. + * @param dest The output buffer, to receive the skeleton string. + * @param destCapacity The length of the output buffer, in 16 bit units. + * The destCapacity may be zero, in which case the function will + * return the actual length of the skeleton. + * @param status The error code, set if an error occurred while attempting to + * perform the check. + * @return The length of the skeleton string. 
The returned length + * is always that of the complete skeleton, even when the + * supplied buffer is too small (or of zero length) + * + * @draft ICU 74 + * @see uspoof_areBidiConfusable + */ +U_CAPI int32_t U_EXPORT2 uspoof_getBidiSkeleton(const USpoofChecker *sc, + UBiDiDirection direction, + const UChar *id, int32_t length, + UChar *dest, int32_t destCapacity, UErrorCode *status); +#endif /* U_HIDE_DRAFT_API */ + /** * Get the "skeleton" for an identifier. * Skeletons are a transformation of the input identifier; * Two identifiers are confusable if their skeletons are identical. - * See Unicode UAX #39 for additional information. + * See Unicode Technical Standard #39 for additional information. * * Using skeletons directly makes it possible to quickly check * whether an identifier is confusable with any of some large @@ -1273,6 +1459,46 @@ uspoof_getSkeletonUTF8(const USpoofChecker *sc, char *dest, int32_t destCapacity, UErrorCode *status); +#ifndef U_HIDE_DRAFT_API +/** + * Get the "bidiSkeleton" for an identifier and a direction. + * Skeletons are a transformation of the input identifier; + * Two identifiers are LTR-confusable if their LTR bidiSkeletons are identical; + * they are RTL-confusable if their RTL bidiSkeletons are identical. + * See Unicode Technical Standard #39 for additional information: + * https://www.unicode.org/reports/tr39/#Confusable_Detection. + * + * Using skeletons directly makes it possible to quickly check + * whether an identifier is confusable with any of some large + * set of existing identifiers, by creating an efficiently + * searchable collection of the skeletons. + * + * @param sc The USpoofChecker + * @param direction The context direction with which the identifier will be + * displayed. Must be either UBIDI_LTR or UBIDI_RTL. + * @param id The UTF-8 format identifier whose skeleton will be computed. + * @param length The length of the input string, in bytes, + * or -1 if the string is zero terminated. 
+ * @param dest The output buffer, to receive the skeleton string. + * @param destCapacity The length of the output buffer, in bytes. + * The destCapacity may be zero, in which case the function will + * return the actual length of the skeleton. + * @param status The error code, set if an error occurred while attempting to + * perform the check. Possible Errors include U_INVALID_CHAR_FOUND + * for invalid UTF-8 sequences, and + * U_BUFFER_OVERFLOW_ERROR if the destination buffer is too small + * to hold the complete skeleton. + * @return The length of the skeleton string, in bytes. The returned length + * is always that of the complete skeleton, even when the + * supplied buffer is too small (or of zero length) + * + * @draft ICU 74 + */ +U_CAPI int32_t U_EXPORT2 uspoof_getBidiSkeletonUTF8(const USpoofChecker *sc, UBiDiDirection direction, + const char *id, int32_t length, char *dest, + int32_t destCapacity, UErrorCode *status); +#endif /* U_HIDE_DRAFT_API */ + /** * Get the set of Candidate Characters for Inclusion in Identifiers, as defined * in http://unicode.org/Public/security/latest/xidmodifications.txt @@ -1510,11 +1736,42 @@ uspoof_areConfusableUnicodeString(const USpoofChecker *sc, const icu::UnicodeString &s2, UErrorCode *status); +#ifndef U_HIDE_DRAFT_API +/** + * A version of {@link uspoof_areBidiConfusable} accepting UnicodeStrings. + * + * @param sc The USpoofChecker + * @param direction The paragraph direction with which the identifiers are + * displayed. Must be either UBIDI_LTR or UBIDI_RTL. + * @param s1 The first of the two identifiers to be compared for + * confusability. The identifier is a UnicodeString (UTF-16). + * @param s2 The second of the two identifiers to be compared for + * confusability. The identifier is a UnicodeString (UTF-16). + * @param status The error code, set if an error occurred while attempting to + * perform the check. + * Confusability of the identifiers is not reported here, + * but through this function's return value. 
+ * @return An integer value with bit(s) set corresponding to + * the type of confusability found, as defined by + * enum USpoofChecks. Zero is returned if the identifiers + * are not confusable. + * + * @draft ICU 74 + * + * @see uspoof_areBidiConfusable + */ +U_CAPI uint32_t U_EXPORT2 uspoof_areBidiConfusableUnicodeString(const USpoofChecker *sc, + UBiDiDirection direction, + const icu::UnicodeString &s1, + const icu::UnicodeString &s2, + UErrorCode *status); +#endif /* U_HIDE_DRAFT_API */ + /** * Get the "skeleton" for an identifier. * Skeletons are a transformation of the input identifier; * Two identifiers are confusable if their skeletons are identical. - * See Unicode UAX #39 for additional information. + * See Unicode Technical Standard #39 for additional information. * * Using skeletons directly makes it possible to quickly check * whether an identifier is confusable with any of some large @@ -1540,6 +1797,36 @@ uspoof_getSkeletonUnicodeString(const USpoofChecker *sc, icu::UnicodeString &dest, UErrorCode *status); +#ifndef U_HIDE_DRAFT_API +/** + * Get the "bidiSkeleton" for an identifier and a direction. + * Skeletons are a transformation of the input identifier; + * Two identifiers are LTR-confusable if their LTR bidiSkeletons are identical; + * they are RTL-confusable if their RTL bidiSkeletons are identical. + * See Unicode Technical Standard #39 for additional information: + * https://www.unicode.org/reports/tr39/#Confusable_Detection. + * + * Using skeletons directly makes it possible to quickly check + * whether an identifier is confusable with any of some large + * set of existing identifiers, by creating an efficiently + * searchable collection of the skeletons. + * + * @param sc The USpoofChecker. + * @param direction The context direction with which the identifier will be + * displayed. Must be either UBIDI_LTR or UBIDI_RTL. + * @param id The input identifier whose bidiSkeleton will be computed. 
+ * @param dest The output identifier, to receive the skeleton string. + * @param status The error code, set if an error occurred while attempting to + * perform the check. + * @return A reference to the destination (skeleton) string. + * + * @draft ICU 74 + */ +U_I18N_API icu::UnicodeString &U_EXPORT2 uspoof_getBidiSkeletonUnicodeString( + const USpoofChecker *sc, UBiDiDirection direction, const icu::UnicodeString &id, + icu::UnicodeString &dest, UErrorCode *status); +#endif /* U_HIDE_DRAFT_API */ + /** * Get the set of Candidate Characters for Inclusion in Identifiers, as defined * in http://unicode.org/Public/security/latest/xidmodifications.txt |