aboutsummaryrefslogtreecommitdiffstats
path: root/contrib/libs/icu/include/unicode/uspoof.h
diff options
context:
space:
mode:
authorromankoshelev <romankoshelev@yandex-team.com>2024-05-13 11:00:27 +0300
committerromankoshelev <romankoshelev@yandex-team.com>2024-05-13 11:13:05 +0300
commit5b22fadb0f035a3b82c328e0ae710ad2b92f6eac (patch)
treee15dc649c79c4fb78f35cd6694dfe9af9bfcc0ad /contrib/libs/icu/include/unicode/uspoof.h
parent5946aa7d3cbca62f6bcf074e8a2b9346e7a96af4 (diff)
downloadydb-5b22fadb0f035a3b82c328e0ae710ad2b92f6eac.tar.gz
Update ICU to 75.1
904da4ae1c86fc5542eac7f1cd18d97b72eb8517
Diffstat (limited to 'contrib/libs/icu/include/unicode/uspoof.h')
-rw-r--r--contrib/libs/icu/include/unicode/uspoof.h307
1 files changed, 297 insertions, 10 deletions
diff --git a/contrib/libs/icu/include/unicode/uspoof.h b/contrib/libs/icu/include/unicode/uspoof.h
index 442655bb54..20d29d62b2 100644
--- a/contrib/libs/icu/include/unicode/uspoof.h
+++ b/contrib/libs/icu/include/unicode/uspoof.h
@@ -19,6 +19,7 @@
#ifndef USPOOF_H
#define USPOOF_H
+#include "unicode/ubidi.h"
#include "unicode/utypes.h"
#include "unicode/uset.h"
#include "unicode/parseerr.h"
@@ -83,6 +84,25 @@
* the instance should be created once (e.g., upon application startup), and the efficient
* {@link uspoof_areConfusable} method can be used at runtime.
*
+ * If the paragraph direction used to display the strings is known, the bidi function should be used instead:
+ *
+ * \code{.c}
+ * UErrorCode status = U_ZERO_ERROR;
+ * // These strings look identical when rendered in a left-to-right context.
+ * // They look distinct in a right-to-left context.
+ * UChar* str1 = (UChar*) u"A1\u05D0"; // A1א
+ * UChar* str2 = (UChar*) u"A\u05D01"; // Aא1
+ *
+ * USpoofChecker* sc = uspoof_open(&status);
+ * uspoof_setChecks(sc, USPOOF_CONFUSABLE, &status);
+ *
+ * int32_t bitmask = uspoof_areBidiConfusable(sc, UBIDI_LTR, str1, -1, str2, -1, &status);
+ * UBool result = bitmask != 0;
+ * // areBidiConfusable: 1 (status: U_ZERO_ERROR)
+ * printf("areBidiConfusable: %d (status: %s)\n", result, u_errorName(status));
+ * uspoof_close(sc);
+ * \endcode
+ *
* <p>
* The type {@link LocalUSpoofCheckerPointer} is exposed for C++ programmers. It will automatically call
* {@link uspoof_close} when the object goes out of scope:
@@ -339,6 +359,51 @@
* COMMON or INHERITED, such as numbers and punctuation, are ignored when computing whether a string has multiple
* scripts.
*
+ * <h2>Advanced bidirectional usage</h2>
+ * If the paragraph direction with which the identifiers will be displayed is not known, there are
+ * multiple options for confusable detection depending on the circumstances.
+ *
+ * <p>
+ * In some circumstances, the only concern is confusion between identifiers displayed with the same
+ * paragraph direction.
+ *
+ * <p>
+ * An example is the case where identifiers are usernames prefixed with the @ symbol.
+ * That symbol will appear to the left in a left-to-right context, and to the right in a
+ * right-to-left context, so that an identifier displayed in a left-to-right context can never be
+ * confused with an identifier displayed in a right-to-left context:
+ * <ul>
+ * <li>
+ * The usernames "A1א" (A one aleph) and "Aא1" (A aleph 1)
+ * would be considered confusable, since they both appear as \@A1א in a left-to-right context, and the
+ * usernames "אA_1" (aleph A underscore one) and "א1_A" (aleph one underscore A) would be considered
+ * confusable, since they both appear as A_1א@ in a right-to-left context.
+ * </li>
+ * <li>
+ * The username "Mark_" would not be considered confusable with the username "_Mark",
+ * even though the latter would appear as Mark_@ in a right-to-left context, and the
+ * former as \@Mark_ in a left-to-right context.
+ * </li>
+ * </ul>
+ * <p>
+ * In that case, the caller should check for both LTR-confusability and RTL-confusability:
+ *
+ * \code{.cpp}
+ * bool confusableInEitherDirection =
+ * uspoof_areBidiConfusableUnicodeString(sc, UBIDI_LTR, id1, id2, &status) ||
+ * uspoof_areBidiConfusableUnicodeString(sc, UBIDI_RTL, id1, id2, &status);
+ * \endcode
+ *
+ * If the bidiSkeleton is used, the LTR and RTL skeleta should be kept separately and compared, LTR
+ * with LTR and RTL with RTL.
+ *
+ * <p>
+ * In cases where confusability between the visual appearances of an identifier displayed in a
+ * left-to-right context with another identifier displayed in a right-to-left context is a concern,
+ * the LTR skeleton of one can be compared with the RTL skeleton of the other. However, this
+ * very broad definition of confusability may have unexpected results; for instance, it treats the
+ * ASCII identifiers "Mark_" and "_Mark" as confusable.
+ *
* <h2>Additional Information</h2>
*
* A <code>USpoofChecker</code> instance may be used repeatedly to perform checks on any number of identifiers.
@@ -519,7 +584,7 @@ typedef enum USpoofChecks {
/**
- * Constants from UAX #39 for use in {@link uspoof_setRestrictionLevel}, and
+ * Constants from UTS #39 for use in {@link uspoof_setRestrictionLevel}, and
* for returned identifier restriction levels in check results.
*
* @stable ICU 51
@@ -633,8 +698,8 @@ uspoof_openFromSerialized(const void *data, int32_t length, int32_t *pActualLeng
/**
* Open a Spoof Checker from the source form of the spoof data.
* The input corresponds to the Unicode data file confusables.txt
- * as described in Unicode UAX #39. The syntax of the source data
- * is as described in UAX #39 for this file, and the content of
+ * as described in Unicode Technical Standard #39. The syntax of the source data
+ * is as described in UTS #39 for this file, and the content of
* this file is acceptable input.
*
* The character encoding of the (char *) input text is UTF-8.
@@ -1111,7 +1176,7 @@ uspoof_getCheckResultNumerics(const USpoofCheckResult *checkResult, UErrorCode *
/**
- * Check the whether two specified strings are visually confusable.
+ * Check whether two specified strings are visually confusable.
*
* If the strings are confusable, the return value will be nonzero, as long as
* {@link USPOOF_CONFUSABLE} was enabled in uspoof_setChecks().
@@ -1159,7 +1224,58 @@ uspoof_areConfusable(const USpoofChecker *sc,
const UChar *id2, int32_t length2,
UErrorCode *status);
-
+#ifndef U_HIDE_DRAFT_API
+/**
+ * Check whether two specified strings are visually confusable when
+ * displayed in a context with the given paragraph direction.
+ *
+ * If the strings are confusable, the return value will be nonzero, as long as
+ * {@link USPOOF_CONFUSABLE} was enabled in uspoof_setChecks().
+ *
+ * The bits in the return value correspond to flags for each of the classes of
+ * confusables applicable to the two input strings. According to UTS 39
+ * section 4, the possible flags are:
+ *
+ * <ul>
+ * <li>{@link USPOOF_SINGLE_SCRIPT_CONFUSABLE}</li>
+ * <li>{@link USPOOF_MIXED_SCRIPT_CONFUSABLE}</li>
+ * <li>{@link USPOOF_WHOLE_SCRIPT_CONFUSABLE}</li>
+ * </ul>
+ *
+ * If one or more of the above flags were not listed in uspoof_setChecks(), this
+ * function will never report that class of confusable. The check
+ * {@link USPOOF_CONFUSABLE} enables all three flags.
+ *
+ *
+ * @param sc The USpoofChecker
+ * @param direction The paragraph direction with which the identifiers are
+ * displayed. Must be either UBIDI_LTR or UBIDI_RTL.
+ * @param id1 The first of the two identifiers to be compared for
+ * confusability. The strings are in UTF-16 format.
+ * @param length1 the length of the first identifier, expressed in
+ * 16 bit UTF-16 code units, or -1 if the string is
+ * nul terminated.
+ * @param id2 The second of the two identifiers to be compared for
+ * confusability. The identifiers are in UTF-16 format.
+ * @param length2 The length of the second identifiers, expressed in
+ * 16 bit UTF-16 code units, or -1 if the string is
+ * nul terminated.
+ * @param status The error code, set if an error occurred while attempting to
+ * perform the check.
+ * Confusability of the identifiers is not reported here,
+ * but through this function's return value.
+ * @return An integer value with bit(s) set corresponding to
+ * the type of confusability found, as defined by
+ * enum USpoofChecks. Zero is returned if the identifiers
+ * are not confusable.
+ *
+ * @draft ICU 74
+ */
+U_CAPI uint32_t U_EXPORT2 uspoof_areBidiConfusable(const USpoofChecker *sc, UBiDiDirection direction,
+ const UChar *id1, int32_t length1,
+ const UChar *id2, int32_t length2,
+ UErrorCode *status);
+#endif /* U_HIDE_DRAFT_API */
/**
* A version of {@link uspoof_areConfusable} accepting strings in UTF-8 format.
@@ -1192,14 +1308,45 @@ uspoof_areConfusableUTF8(const USpoofChecker *sc,
const char *id2, int32_t length2,
UErrorCode *status);
-
-
+#ifndef U_HIDE_DRAFT_API
+/**
+ * A version of {@link uspoof_areBidiConfusable} accepting strings in UTF-8 format.
+ *
+ * @param sc The USpoofChecker
+ * @param direction The paragraph direction with which the identifiers are
+ * displayed. Must be either UBIDI_LTR or UBIDI_RTL.
+ * @param id1 The first of the two identifiers to be compared for
+ * confusability. The strings are in UTF-8 format.
+ * @param length1 the length of the first identifiers, in bytes, or -1
+ * if the string is nul terminated.
+ * @param id2 The second of the two identifiers to be compared for
+ * confusability. The strings are in UTF-8 format.
+ * @param length2 The length of the second string in bytes, or -1
+ * if the string is nul terminated.
+ * @param status The error code, set if an error occurred while attempting to
+ * perform the check.
+ * Confusability of the strings is not reported here,
+ * but through this function's return value.
+ * @return An integer value with bit(s) set corresponding to
+ * the type of confusability found, as defined by
+ * enum USpoofChecks. Zero is returned if the strings
+ * are not confusable.
+ *
+ * @draft ICU 74
+ *
+ * @see uspoof_areBidiConfusable
+ */
+U_CAPI uint32_t U_EXPORT2 uspoof_areBidiConfusableUTF8(const USpoofChecker *sc, UBiDiDirection direction,
+ const char *id1, int32_t length1,
+ const char *id2, int32_t length2,
+ UErrorCode *status);
+#endif /* U_HIDE_DRAFT_API */
/**
* Get the "skeleton" for an identifier.
* Skeletons are a transformation of the input identifier;
* Two identifiers are confusable if their skeletons are identical.
- * See Unicode UAX #39 for additional information.
+ * See Unicode Technical Standard #39 for additional information.
*
* Using skeletons directly makes it possible to quickly check
* whether an identifier is confusable with any of some large
@@ -1233,11 +1380,50 @@ uspoof_getSkeleton(const USpoofChecker *sc,
UChar *dest, int32_t destCapacity,
UErrorCode *status);
+#ifndef U_HIDE_DRAFT_API
+/**
+ * Get the "bidiSkeleton" for an identifier and a direction.
+ * Skeletons are a transformation of the input identifier;
+ * Two identifiers are LTR-confusable if their LTR bidiSkeletons are identical;
+ * they are RTL-confusable if their RTL bidiSkeletons are identical.
+ * See Unicode Technical Standard #39 for additional information:
+ * https://www.unicode.org/reports/tr39/#Confusable_Detection.
+ *
+ * Using skeletons directly makes it possible to quickly check
+ * whether an identifier is confusable with any of some large
+ * set of existing identifiers, by creating an efficiently
+ * searchable collection of the skeletons.
+ *
+ * @param sc The USpoofChecker.
+ * @param direction The context direction with which the identifier will be
+ * displayed. Must be either UBIDI_LTR or UBIDI_RTL.
+ * @param id The input identifier whose skeleton will be computed.
+ * @param length The length of the input identifier, expressed in 16 bit
+ * UTF-16 code units, or -1 if the string is zero terminated.
+ * @param dest The output buffer, to receive the skeleton string.
+ * @param destCapacity The length of the output buffer, in 16 bit units.
+ * The destCapacity may be zero, in which case the function will
+ * return the actual length of the skeleton.
+ * @param status The error code, set if an error occurred while attempting to
+ * perform the check.
+ * @return The length of the skeleton string. The returned length
+ * is always that of the complete skeleton, even when the
+ * supplied buffer is too small (or of zero length)
+ *
+ * @draft ICU 74
+ * @see uspoof_areBidiConfusable
+ */
+U_CAPI int32_t U_EXPORT2 uspoof_getBidiSkeleton(const USpoofChecker *sc,
+ UBiDiDirection direction,
+ const UChar *id, int32_t length,
+ UChar *dest, int32_t destCapacity, UErrorCode *status);
+#endif /* U_HIDE_DRAFT_API */
+
/**
* Get the "skeleton" for an identifier.
* Skeletons are a transformation of the input identifier;
* Two identifiers are confusable if their skeletons are identical.
- * See Unicode UAX #39 for additional information.
+ * See Unicode Technical Standard #39 for additional information.
*
* Using skeletons directly makes it possible to quickly check
* whether an identifier is confusable with any of some large
@@ -1273,6 +1459,46 @@ uspoof_getSkeletonUTF8(const USpoofChecker *sc,
char *dest, int32_t destCapacity,
UErrorCode *status);
+#ifndef U_HIDE_DRAFT_API
+/**
+ * Get the "bidiSkeleton" for an identifier and a direction.
+ * Skeletons are a transformation of the input identifier;
+ * Two identifiers are LTR-confusable if their LTR bidiSkeletons are identical;
+ * they are RTL-confusable if their RTL bidiSkeletons are identical.
+ * See Unicode Technical Standard #39 for additional information:
+ * https://www.unicode.org/reports/tr39/#Confusable_Detection.
+ *
+ * Using skeletons directly makes it possible to quickly check
+ * whether an identifier is confusable with any of some large
+ * set of existing identifiers, by creating an efficiently
+ * searchable collection of the skeletons.
+ *
+ * @param sc The USpoofChecker
+ * @param direction The context direction with which the identifier will be
+ * displayed. Must be either UBIDI_LTR or UBIDI_RTL.
+ * @param id The UTF-8 format identifier whose skeleton will be computed.
+ * @param length The length of the input string, in bytes,
+ * or -1 if the string is zero terminated.
+ * @param dest The output buffer, to receive the skeleton string.
+ * @param destCapacity The length of the output buffer, in bytes.
+ * The destCapacity may be zero, in which case the function will
+ * return the actual length of the skeleton.
+ * @param status The error code, set if an error occurred while attempting to
+ * perform the check. Possible Errors include U_INVALID_CHAR_FOUND
+ * for invalid UTF-8 sequences, and
+ * U_BUFFER_OVERFLOW_ERROR if the destination buffer is too small
+ * to hold the complete skeleton.
+ * @return The length of the skeleton string, in bytes. The returned length
+ * is always that of the complete skeleton, even when the
+ * supplied buffer is too small (or of zero length)
+ *
+ * @draft ICU 74
+ */
+U_CAPI int32_t U_EXPORT2 uspoof_getBidiSkeletonUTF8(const USpoofChecker *sc, UBiDiDirection direction,
+ const char *id, int32_t length, char *dest,
+ int32_t destCapacity, UErrorCode *status);
+#endif /* U_HIDE_DRAFT_API */
+
/**
* Get the set of Candidate Characters for Inclusion in Identifiers, as defined
* in http://unicode.org/Public/security/latest/xidmodifications.txt
@@ -1510,11 +1736,42 @@ uspoof_areConfusableUnicodeString(const USpoofChecker *sc,
const icu::UnicodeString &s2,
UErrorCode *status);
+#ifndef U_HIDE_DRAFT_API
+/**
+ * A version of {@link uspoof_areBidiConfusable} accepting UnicodeStrings.
+ *
+ * @param sc The USpoofChecker
+ * @param direction The paragraph direction with which the identifiers are
+ * displayed. Must be either UBIDI_LTR or UBIDI_RTL.
+ * @param s1 The first of the two identifiers to be compared for
+ * confusability. The strings are in UTF-8 format.
+ * @param s2 The second of the two identifiers to be compared for
+ * confusability. The strings are in UTF-8 format.
+ * @param status The error code, set if an error occurred while attempting to
+ * perform the check.
+ * Confusability of the identifiers is not reported here,
+ * but through this function's return value.
+ * @return An integer value with bit(s) set corresponding to
+ * the type of confusability found, as defined by
+ * enum USpoofChecks. Zero is returned if the identifiers
+ * are not confusable.
+ *
+ * @draft ICU 74
+ *
+ * @see uspoof_areBidiConfusable
+ */
+U_CAPI uint32_t U_EXPORT2 uspoof_areBidiConfusableUnicodeString(const USpoofChecker *sc,
+ UBiDiDirection direction,
+ const icu::UnicodeString &s1,
+ const icu::UnicodeString &s2,
+ UErrorCode *status);
+#endif /* U_HIDE_DRAFT_API */
+
/**
* Get the "skeleton" for an identifier.
* Skeletons are a transformation of the input identifier;
* Two identifiers are confusable if their skeletons are identical.
- * See Unicode UAX #39 for additional information.
+ * See Unicode Technical Standard #39 for additional information.
*
* Using skeletons directly makes it possible to quickly check
* whether an identifier is confusable with any of some large
@@ -1540,6 +1797,36 @@ uspoof_getSkeletonUnicodeString(const USpoofChecker *sc,
icu::UnicodeString &dest,
UErrorCode *status);
+#ifndef U_HIDE_DRAFT_API
+/**
+ * Get the "bidiSkeleton" for an identifier and a direction.
+ * Skeletons are a transformation of the input identifier;
+ * Two identifiers are LTR-confusable if their LTR bidiSkeletons are identical;
+ * they are RTL-confusable if their RTL bidiSkeletons are identical.
+ * See Unicode Technical Standard #39 for additional information.
+ * https://www.unicode.org/reports/tr39/#Confusable_Detection.
+ *
+ * Using skeletons directly makes it possible to quickly check
+ * whether an identifier is confusable with any of some large
+ * set of existing identifiers, by creating an efficiently
+ * searchable collection of the skeletons.
+ *
+ * @param sc The USpoofChecker.
+ * @param direction The context direction with which the identifier will be
+ * displayed. Must be either UBIDI_LTR or UBIDI_RTL.
+ * @param id The input identifier whose bidiSkeleton will be computed.
+ * @param dest The output identifier, to receive the skeleton string.
+ * @param status The error code, set if an error occurred while attempting to
+ * perform the check.
+ * @return A reference to the destination (skeleton) string.
+ *
+ * @draft ICU 74
+ */
+U_I18N_API icu::UnicodeString &U_EXPORT2 uspoof_getBidiSkeletonUnicodeString(
+ const USpoofChecker *sc, UBiDiDirection direction, const icu::UnicodeString &id,
+ icu::UnicodeString &dest, UErrorCode *status);
+#endif /* U_HIDE_DRAFT_API */
+
/**
* Get the set of Candidate Characters for Inclusion in Identifiers, as defined
* in http://unicode.org/Public/security/latest/xidmodifications.txt