Restoring authorship annotation for <neksard@yandex-team.ru>. Commit 2 of 2.

author: neksard <neksard@yandex-team.ru> 2022-02-10 16:45:33 +0300
committer: Daniil Cherednik <dcherednik@yandex-team.ru> 2022-02-10 16:45:33 +0300
commit: 1d9c550e7c38e051d7961f576013a482003a70d9 (patch)
tree: b2cc84ee7850122e7ccf51d0ea21e4fa7e7a5685 /contrib/libs/icu/i18n/csdetect.cpp
parent: 8f7cf138264e0caa318144bf8a2c950e0b0a8593 (diff)
download: ydb-1d9c550e7c38e051d7961f576013a482003a70d9.tar.gz
1 files changed, 482 insertions, 482 deletions
diff --git a/contrib/libs/icu/i18n/csdetect.cpp b/contrib/libs/icu/i18n/csdetect.cpp
index 0221ec4043..babb308430 100644
--- a/contrib/libs/icu/i18n/csdetect.cpp
+++ b/contrib/libs/icu/i18n/csdetect.cpp
@@ -1,487 +1,487 @@
 // © 2016 and later: Unicode, Inc. and others.
-// License & terms of use: http://www.unicode.org/copyright.html 
-/* 
- ********************************************************************** 
- *   Copyright (C) 2005-2016, International Business Machines 
- *   Corporation and others.  All Rights Reserved. 
- ********************************************************************** 
- */ 
- 
-#include "unicode/utypes.h" 
- 
-#if !UCONFIG_NO_CONVERSION 
- 
-#include "unicode/ucsdet.h" 
- 
-#include "csdetect.h" 
-#include "csmatch.h" 
-#include "uenumimp.h" 
- 
-#include "cmemory.h" 
-#include "cstring.h" 
-#include "umutex.h" 
-#include "ucln_in.h" 
-#include "uarrsort.h" 
-#include "inputext.h" 
-#include "csrsbcs.h" 
-#include "csrmbcs.h" 
-#include "csrutf8.h" 
-#include "csrucode.h" 
-#include "csr2022.h" 
- 
-#define NEW_ARRAY(type,count) (type *) uprv_malloc((count) * sizeof(type)) 
-#define DELETE_ARRAY(array) uprv_free((void *) (array)) 
- 
-U_NAMESPACE_BEGIN 
- 
-struct CSRecognizerInfo : public UMemory { 
-    CSRecognizerInfo(CharsetRecognizer *recognizer, UBool isDefaultEnabled) 
+// License & terms of use: http://www.unicode.org/copyright.html
+/*
+ **********************************************************************
+ *   Copyright (C) 2005-2016, International Business Machines
+ *   Corporation and others.  All Rights Reserved.
+ **********************************************************************
+ */
+
+#include "unicode/utypes.h"
+
+#if !UCONFIG_NO_CONVERSION
+
+#include "unicode/ucsdet.h"
+
+#include "csdetect.h"
+#include "csmatch.h"
+#include "uenumimp.h"
+
+#include "cmemory.h"
+#include "cstring.h"
+#include "umutex.h"
+#include "ucln_in.h"
+#include "uarrsort.h"
+#include "inputext.h"
+#include "csrsbcs.h"
+#include "csrmbcs.h"
+#include "csrutf8.h"
+#include "csrucode.h"
+#include "csr2022.h"
+
+#define NEW_ARRAY(type,count) (type *) uprv_malloc((count) * sizeof(type))
+#define DELETE_ARRAY(array) uprv_free((void *) (array))
+
+U_NAMESPACE_BEGIN
+
+struct CSRecognizerInfo : public UMemory {
+    CSRecognizerInfo(CharsetRecognizer *recognizer, UBool isDefaultEnabled)
         : recognizer(recognizer), isDefaultEnabled(isDefaultEnabled) {}
- 
+
     ~CSRecognizerInfo() {delete recognizer;}
- 
-    CharsetRecognizer *recognizer; 
-    UBool isDefaultEnabled; 
-}; 
- 
-U_NAMESPACE_END 
- 
-static icu::CSRecognizerInfo **fCSRecognizers = NULL; 
+
+    CharsetRecognizer *recognizer;
+    UBool isDefaultEnabled;
+};
+
+U_NAMESPACE_END
+
+static icu::CSRecognizerInfo **fCSRecognizers = NULL;
 static icu::UInitOnce gCSRecognizersInitOnce = U_INITONCE_INITIALIZER;
-static int32_t fCSRecognizers_size = 0; 
- 
-U_CDECL_BEGIN 
-static UBool U_CALLCONV csdet_cleanup(void) 
-{ 
-    U_NAMESPACE_USE 
-    if (fCSRecognizers != NULL) { 
-        for(int32_t r = 0; r < fCSRecognizers_size; r += 1) { 
-            delete fCSRecognizers[r]; 
-            fCSRecognizers[r] = NULL; 
-        } 
- 
-        DELETE_ARRAY(fCSRecognizers); 
-        fCSRecognizers = NULL; 
-        fCSRecognizers_size = 0; 
+static int32_t fCSRecognizers_size = 0;
+
+U_CDECL_BEGIN
+static UBool U_CALLCONV csdet_cleanup(void)
+{
+    U_NAMESPACE_USE
+    if (fCSRecognizers != NULL) {
+        for(int32_t r = 0; r < fCSRecognizers_size; r += 1) {
+            delete fCSRecognizers[r];
+            fCSRecognizers[r] = NULL;
+        }
+
+        DELETE_ARRAY(fCSRecognizers);
+        fCSRecognizers = NULL;
+        fCSRecognizers_size = 0;
+    }
+    gCSRecognizersInitOnce.reset();
+
+    return TRUE;
+}
+
+static int32_t U_CALLCONV
+charsetMatchComparator(const void * /*context*/, const void *left, const void *right)
+{
+    U_NAMESPACE_USE
+
+    const CharsetMatch **csm_l = (const CharsetMatch **) left;
+    const CharsetMatch **csm_r = (const CharsetMatch **) right;
+
+    // NOTE: compare is backwards to sort from highest to lowest.
+    return (*csm_r)->getConfidence() - (*csm_l)->getConfidence();
+}
+
+static void U_CALLCONV initRecognizers(UErrorCode &status) {
+    U_NAMESPACE_USE
+    ucln_i18n_registerCleanup(UCLN_I18N_CSDET, csdet_cleanup);
+    CSRecognizerInfo *tempArray[] = {
+        new CSRecognizerInfo(new CharsetRecog_UTF8(), TRUE),
+
+        new CSRecognizerInfo(new CharsetRecog_UTF_16_BE(), TRUE),
+        new CSRecognizerInfo(new CharsetRecog_UTF_16_LE(), TRUE),
+        new CSRecognizerInfo(new CharsetRecog_UTF_32_BE(), TRUE),
+        new CSRecognizerInfo(new CharsetRecog_UTF_32_LE(), TRUE),
+
+        new CSRecognizerInfo(new CharsetRecog_8859_1(), TRUE),
+        new CSRecognizerInfo(new CharsetRecog_8859_2(), TRUE),
+        new CSRecognizerInfo(new CharsetRecog_8859_5_ru(), TRUE),
+        new CSRecognizerInfo(new CharsetRecog_8859_6_ar(), TRUE),
+        new CSRecognizerInfo(new CharsetRecog_8859_7_el(), TRUE),
+        new CSRecognizerInfo(new CharsetRecog_8859_8_I_he(), TRUE),
+        new CSRecognizerInfo(new CharsetRecog_8859_8_he(), TRUE),
+        new CSRecognizerInfo(new CharsetRecog_windows_1251(), TRUE),
+        new CSRecognizerInfo(new CharsetRecog_windows_1256(), TRUE),
+        new CSRecognizerInfo(new CharsetRecog_KOI8_R(), TRUE),
+        new CSRecognizerInfo(new CharsetRecog_8859_9_tr(), TRUE),
+        new CSRecognizerInfo(new CharsetRecog_sjis(), TRUE),
+        new CSRecognizerInfo(new CharsetRecog_gb_18030(), TRUE),
+        new CSRecognizerInfo(new CharsetRecog_euc_jp(), TRUE),
+        new CSRecognizerInfo(new CharsetRecog_euc_kr(), TRUE),
+        new CSRecognizerInfo(new CharsetRecog_big5(), TRUE),
+
+        new CSRecognizerInfo(new CharsetRecog_2022JP(), TRUE),
+#if !UCONFIG_ONLY_HTML_CONVERSION
+        new CSRecognizerInfo(new CharsetRecog_2022KR(), TRUE),
+        new CSRecognizerInfo(new CharsetRecog_2022CN(), TRUE),
+
+        new CSRecognizerInfo(new CharsetRecog_IBM424_he_rtl(), FALSE),
+        new CSRecognizerInfo(new CharsetRecog_IBM424_he_ltr(), FALSE),
+        new CSRecognizerInfo(new CharsetRecog_IBM420_ar_rtl(), FALSE),
+        new CSRecognizerInfo(new CharsetRecog_IBM420_ar_ltr(), FALSE)
+#endif
+    };
+    int32_t rCount = UPRV_LENGTHOF(tempArray);
+
+    fCSRecognizers = NEW_ARRAY(CSRecognizerInfo *, rCount);
+
+    if (fCSRecognizers == NULL) {
+        status = U_MEMORY_ALLOCATION_ERROR;
     } 
-    gCSRecognizersInitOnce.reset(); 
- 
-    return TRUE; 
-} 
- 
-static int32_t U_CALLCONV 
-charsetMatchComparator(const void * /*context*/, const void *left, const void *right) 
-{ 
-    U_NAMESPACE_USE 
- 
-    const CharsetMatch **csm_l = (const CharsetMatch **) left; 
-    const CharsetMatch **csm_r = (const CharsetMatch **) right; 
- 
-    // NOTE: compare is backwards to sort from highest to lowest. 
-    return (*csm_r)->getConfidence() - (*csm_l)->getConfidence(); 
-} 
- 
-static void U_CALLCONV initRecognizers(UErrorCode &status) { 
-    U_NAMESPACE_USE 
-    ucln_i18n_registerCleanup(UCLN_I18N_CSDET, csdet_cleanup); 
-    CSRecognizerInfo *tempArray[] = { 
-        new CSRecognizerInfo(new CharsetRecog_UTF8(), TRUE), 
- 
-        new CSRecognizerInfo(new CharsetRecog_UTF_16_BE(), TRUE), 
-        new CSRecognizerInfo(new CharsetRecog_UTF_16_LE(), TRUE), 
-        new CSRecognizerInfo(new CharsetRecog_UTF_32_BE(), TRUE), 
-        new CSRecognizerInfo(new CharsetRecog_UTF_32_LE(), TRUE), 
- 
-        new CSRecognizerInfo(new CharsetRecog_8859_1(), TRUE), 
-        new CSRecognizerInfo(new CharsetRecog_8859_2(), TRUE), 
-        new CSRecognizerInfo(new CharsetRecog_8859_5_ru(), TRUE), 
-        new CSRecognizerInfo(new CharsetRecog_8859_6_ar(), TRUE), 
-        new CSRecognizerInfo(new CharsetRecog_8859_7_el(), TRUE), 
-        new CSRecognizerInfo(new CharsetRecog_8859_8_I_he(), TRUE), 
-        new CSRecognizerInfo(new CharsetRecog_8859_8_he(), TRUE), 
-        new CSRecognizerInfo(new CharsetRecog_windows_1251(), TRUE), 
-        new CSRecognizerInfo(new CharsetRecog_windows_1256(), TRUE), 
-        new CSRecognizerInfo(new CharsetRecog_KOI8_R(), TRUE), 
-        new CSRecognizerInfo(new CharsetRecog_8859_9_tr(), TRUE), 
-        new CSRecognizerInfo(new CharsetRecog_sjis(), TRUE), 
-        new CSRecognizerInfo(new CharsetRecog_gb_18030(), TRUE), 
-        new CSRecognizerInfo(new CharsetRecog_euc_jp(), TRUE), 
-        new CSRecognizerInfo(new CharsetRecog_euc_kr(), TRUE), 
-        new CSRecognizerInfo(new CharsetRecog_big5(), TRUE), 
- 
-        new CSRecognizerInfo(new CharsetRecog_2022JP(), TRUE), 
-#if !UCONFIG_ONLY_HTML_CONVERSION 
-        new CSRecognizerInfo(new CharsetRecog_2022KR(), TRUE), 
-        new CSRecognizerInfo(new CharsetRecog_2022CN(), TRUE), 
- 
-        new CSRecognizerInfo(new CharsetRecog_IBM424_he_rtl(), FALSE), 
-        new CSRecognizerInfo(new CharsetRecog_IBM424_he_ltr(), FALSE), 
-        new CSRecognizerInfo(new CharsetRecog_IBM420_ar_rtl(), FALSE), 
-        new CSRecognizerInfo(new CharsetRecog_IBM420_ar_ltr(), FALSE) 
-#endif 
-    }; 
-    int32_t rCount = UPRV_LENGTHOF(tempArray); 
- 
-    fCSRecognizers = NEW_ARRAY(CSRecognizerInfo *, rCount); 
- 
-    if (fCSRecognizers == NULL) { 
-        status = U_MEMORY_ALLOCATION_ERROR; 
-    }  
-    else { 
-        fCSRecognizers_size = rCount; 
-        for (int32_t r = 0; r < rCount; r += 1) { 
-            fCSRecognizers[r] = tempArray[r]; 
-            if (fCSRecognizers[r] == NULL) { 
-                status = U_MEMORY_ALLOCATION_ERROR; 
-            } 
-        } 
-    } 
-} 
- 
-U_CDECL_END 
- 
-U_NAMESPACE_BEGIN 
- 
-void CharsetDetector::setRecognizers(UErrorCode &status) 
-{ 
-    umtx_initOnce(gCSRecognizersInitOnce, &initRecognizers, status); 
-} 
- 
-CharsetDetector::CharsetDetector(UErrorCode &status) 
-  : textIn(new InputText(status)), resultArray(NULL), 
-    resultCount(0), fStripTags(FALSE), fFreshTextSet(FALSE), 
-    fEnabledRecognizers(NULL) 
-{ 
-    if (U_FAILURE(status)) { 
-        return; 
-    } 
- 
-    setRecognizers(status); 
- 
-    if (U_FAILURE(status)) { 
-        return; 
-    } 
- 
-    resultArray = (CharsetMatch **)uprv_malloc(sizeof(CharsetMatch *)*fCSRecognizers_size); 
- 
-    if (resultArray == NULL) { 
-        status = U_MEMORY_ALLOCATION_ERROR; 
-        return; 
-    } 
- 
-    for(int32_t i = 0; i < fCSRecognizers_size; i += 1) { 
-        resultArray[i] = new CharsetMatch(); 
- 
-        if (resultArray[i] == NULL) { 
-            status = U_MEMORY_ALLOCATION_ERROR; 
-            break; 
-        } 
-    } 
-} 
- 
-CharsetDetector::~CharsetDetector() 
-{ 
-    delete textIn; 
- 
-    for(int32_t i = 0; i < fCSRecognizers_size; i += 1) { 
-        delete resultArray[i]; 
-    } 
- 
-    uprv_free(resultArray); 
- 
-    if (fEnabledRecognizers) { 
-        uprv_free(fEnabledRecognizers); 
-    } 
-} 
- 
-void CharsetDetector::setText(const char *in, int32_t len) 
-{ 
-    textIn->setText(in, len); 
-    fFreshTextSet = TRUE; 
-} 
- 
-UBool CharsetDetector::setStripTagsFlag(UBool flag) 
-{ 
-    UBool temp = fStripTags; 
-    fStripTags = flag; 
-    fFreshTextSet = TRUE; 
-    return temp; 
-} 
- 
-UBool CharsetDetector::getStripTagsFlag() const 
-{ 
-    return fStripTags; 
-} 
- 
-void CharsetDetector::setDeclaredEncoding(const char *encoding, int32_t len) const 
-{ 
-    textIn->setDeclaredEncoding(encoding,len); 
-} 
- 
-int32_t CharsetDetector::getDetectableCount() 
-{ 
-    UErrorCode status = U_ZERO_ERROR; 
- 
-    setRecognizers(status); 
- 
-    return fCSRecognizers_size;  
-} 
- 
-const CharsetMatch *CharsetDetector::detect(UErrorCode &status) 
-{ 
-    int32_t maxMatchesFound = 0; 
- 
-    detectAll(maxMatchesFound, status); 
- 
-    if(maxMatchesFound > 0) { 
-        return resultArray[0]; 
-    } else { 
-        return NULL; 
-    } 
-} 
- 
-const CharsetMatch * const *CharsetDetector::detectAll(int32_t &maxMatchesFound, UErrorCode &status) 
-{ 
-    if(!textIn->isSet()) { 
-        status = U_MISSING_RESOURCE_ERROR;// TODO:  Need to set proper status code for input text not set 
- 
-        return NULL; 
-    } else if (fFreshTextSet) { 
-        CharsetRecognizer *csr; 
-        int32_t            i; 
- 
-        textIn->MungeInput(fStripTags); 
- 
-        // Iterate over all possible charsets, remember all that 
-        // give a match quality > 0. 
-        resultCount = 0; 
-        for (i = 0; i < fCSRecognizers_size; i += 1) { 
-            csr = fCSRecognizers[i]->recognizer; 
-            if (csr->match(textIn, resultArray[resultCount])) { 
-                resultCount++; 
-            } 
-        } 
- 
-        if (resultCount > 1) { 
-            uprv_sortArray(resultArray, resultCount, sizeof resultArray[0], charsetMatchComparator, NULL, TRUE, &status); 
-        } 
-        fFreshTextSet = FALSE; 
-    } 
- 
-    maxMatchesFound = resultCount; 
- 
-    return resultArray; 
-} 
- 
-void CharsetDetector::setDetectableCharset(const char *encoding, UBool enabled, UErrorCode &status) 
-{ 
-    if (U_FAILURE(status)) { 
-        return; 
-    } 
- 
-    int32_t modIdx = -1; 
-    UBool isDefaultVal = FALSE; 
-    for (int32_t i = 0; i < fCSRecognizers_size; i++) { 
-        CSRecognizerInfo *csrinfo = fCSRecognizers[i]; 
-        if (uprv_strcmp(csrinfo->recognizer->getName(), encoding) == 0) { 
-            modIdx = i; 
-            isDefaultVal = (csrinfo->isDefaultEnabled == enabled); 
-            break; 
-        } 
-    } 
-    if (modIdx < 0) { 
-        // No matching encoding found 
-        status = U_ILLEGAL_ARGUMENT_ERROR; 
-        return; 
-    } 
- 
-    if (fEnabledRecognizers == NULL && !isDefaultVal) { 
-        // Create an array storing the non default setting 
-        fEnabledRecognizers = NEW_ARRAY(UBool, fCSRecognizers_size); 
-        if (fEnabledRecognizers == NULL) { 
-            status = U_MEMORY_ALLOCATION_ERROR; 
-            return; 
-        } 
-        // Initialize the array with default info 
-        for (int32_t i = 0; i < fCSRecognizers_size; i++) { 
-            fEnabledRecognizers[i] = fCSRecognizers[i]->isDefaultEnabled; 
-        } 
-    } 
- 
-    if (fEnabledRecognizers != NULL) { 
-        fEnabledRecognizers[modIdx] = enabled; 
-    } 
-} 
- 
-/*const char *CharsetDetector::getCharsetName(int32_t index, UErrorCode &status) const 
-{ 
-    if( index > fCSRecognizers_size-1 || index < 0) { 
-        status = U_INDEX_OUTOFBOUNDS_ERROR; 
- 
-        return 0; 
-    } else { 
-        return fCSRecognizers[index]->getName(); 
-    } 
-}*/ 
- 
-U_NAMESPACE_END 
- 
-U_CDECL_BEGIN 
-typedef struct { 
-    int32_t currIndex; 
-    UBool all; 
-    UBool *enabledRecognizers; 
-} Context; 
- 
- 
- 
-static void U_CALLCONV 
-enumClose(UEnumeration *en) { 
-    if(en->context != NULL) { 
-        DELETE_ARRAY(en->context); 
-    } 
- 
-    DELETE_ARRAY(en); 
-} 
- 
-static int32_t U_CALLCONV 
-enumCount(UEnumeration *en, UErrorCode *) { 
-    if (((Context *)en->context)->all) { 
-        // ucsdet_getAllDetectableCharsets, all charset detector names 
-        return fCSRecognizers_size; 
-    } 
- 
-    // Otherwise, ucsdet_getDetectableCharsets - only enabled ones 
-    int32_t count = 0; 
-    UBool *enabledArray = ((Context *)en->context)->enabledRecognizers; 
-    if (enabledArray != NULL) { 
-        // custom set 
-        for (int32_t i = 0; i < fCSRecognizers_size; i++) { 
-            if (enabledArray[i]) { 
-                count++; 
-            } 
-        } 
-    } else { 
-        // default set 
-        for (int32_t i = 0; i < fCSRecognizers_size; i++) { 
-            if (fCSRecognizers[i]->isDefaultEnabled) { 
-                count++; 
-            } 
-        } 
-    } 
-    return count; 
-} 
- 
-static const char* U_CALLCONV 
-enumNext(UEnumeration *en, int32_t *resultLength, UErrorCode * /*status*/) { 
-    const char *currName = NULL; 
- 
-    if (((Context *)en->context)->currIndex < fCSRecognizers_size) { 
-        if (((Context *)en->context)->all) { 
-            // ucsdet_getAllDetectableCharsets, all charset detector names 
-            currName = fCSRecognizers[((Context *)en->context)->currIndex]->recognizer->getName(); 
-            ((Context *)en->context)->currIndex++; 
-        } else { 
-            // ucsdet_getDetectableCharsets 
-            UBool *enabledArray = ((Context *)en->context)->enabledRecognizers; 
-            if (enabledArray != NULL) { 
-                // custome set 
-                while (currName == NULL && ((Context *)en->context)->currIndex < fCSRecognizers_size) { 
-                    if (enabledArray[((Context *)en->context)->currIndex]) { 
-                        currName = fCSRecognizers[((Context *)en->context)->currIndex]->recognizer->getName(); 
-                    } 
-                    ((Context *)en->context)->currIndex++; 
-                } 
-            } else { 
-                // default set 
-                while (currName == NULL && ((Context *)en->context)->currIndex < fCSRecognizers_size) { 
-                    if (fCSRecognizers[((Context *)en->context)->currIndex]->isDefaultEnabled) { 
-                        currName = fCSRecognizers[((Context *)en->context)->currIndex]->recognizer->getName(); 
-                    } 
-                    ((Context *)en->context)->currIndex++; 
-                } 
-            } 
-        } 
-    } 
- 
-    if(resultLength != NULL) { 
-        *resultLength = currName == NULL ? 0 : (int32_t)uprv_strlen(currName); 
-    } 
- 
-    return currName; 
-} 
- 
- 
-static void U_CALLCONV 
-enumReset(UEnumeration *en, UErrorCode *) { 
-    ((Context *)en->context)->currIndex = 0; 
-} 
- 
-static const UEnumeration gCSDetEnumeration = { 
-    NULL, 
-    NULL, 
-    enumClose, 
-    enumCount, 
-    uenum_unextDefault, 
-    enumNext, 
-    enumReset 
-}; 
- 
-U_CDECL_END 
- 
-U_NAMESPACE_BEGIN 
- 
-UEnumeration * CharsetDetector::getAllDetectableCharsets(UErrorCode &status) 
-{ 
- 
-    /* Initialize recognized charsets. */ 
-    setRecognizers(status); 
- 
-    if(U_FAILURE(status)) { 
-        return 0; 
-    } 
- 
-    UEnumeration *en = NEW_ARRAY(UEnumeration, 1); 
-    if (en == NULL) { 
-        status = U_MEMORY_ALLOCATION_ERROR; 
-        return 0; 
-    } 
-    memcpy(en, &gCSDetEnumeration, sizeof(UEnumeration)); 
-    en->context = (void*)NEW_ARRAY(Context, 1); 
-    if (en->context == NULL) { 
-        status = U_MEMORY_ALLOCATION_ERROR; 
-        DELETE_ARRAY(en); 
-        return 0; 
-    } 
-    uprv_memset(en->context, 0, sizeof(Context)); 
-    ((Context*)en->context)->all = TRUE; 
-    return en; 
-} 
- 
-UEnumeration * CharsetDetector::getDetectableCharsets(UErrorCode &status) const 
-{ 
-    if(U_FAILURE(status)) { 
-        return 0; 
-    } 
- 
-    UEnumeration *en = NEW_ARRAY(UEnumeration, 1); 
-    if (en == NULL) { 
-        status = U_MEMORY_ALLOCATION_ERROR; 
-        return 0; 
-    } 
-    memcpy(en, &gCSDetEnumeration, sizeof(UEnumeration)); 
-    en->context = (void*)NEW_ARRAY(Context, 1); 
-    if (en->context == NULL) { 
-        status = U_MEMORY_ALLOCATION_ERROR; 
-        DELETE_ARRAY(en); 
-        return 0; 
-    } 
-    uprv_memset(en->context, 0, sizeof(Context)); 
-    ((Context*)en->context)->all = FALSE; 
-    ((Context*)en->context)->enabledRecognizers = fEnabledRecognizers; 
-    return en; 
-} 
- 
-U_NAMESPACE_END 
- 
-#endif 
+    else {
+        fCSRecognizers_size = rCount;
+        for (int32_t r = 0; r < rCount; r += 1) {
+            fCSRecognizers[r] = tempArray[r];
+            if (fCSRecognizers[r] == NULL) {
+                status = U_MEMORY_ALLOCATION_ERROR;
+            }
+        }
+    }
+}
+
+U_CDECL_END
+
+U_NAMESPACE_BEGIN
+
+void CharsetDetector::setRecognizers(UErrorCode &status)
+{
+    umtx_initOnce(gCSRecognizersInitOnce, &initRecognizers, status);
+}
+
+CharsetDetector::CharsetDetector(UErrorCode &status)
+  : textIn(new InputText(status)), resultArray(NULL),
+    resultCount(0), fStripTags(FALSE), fFreshTextSet(FALSE),
+    fEnabledRecognizers(NULL)
+{
+    if (U_FAILURE(status)) {
+        return;
+    }
+
+    setRecognizers(status);
+
+    if (U_FAILURE(status)) {
+        return;
+    }
+
+    resultArray = (CharsetMatch **)uprv_malloc(sizeof(CharsetMatch *)*fCSRecognizers_size);
+
+    if (resultArray == NULL) {
+        status = U_MEMORY_ALLOCATION_ERROR;
+        return;
+    }
+
+    for(int32_t i = 0; i < fCSRecognizers_size; i += 1) {
+        resultArray[i] = new CharsetMatch();
+
+        if (resultArray[i] == NULL) {
+            status = U_MEMORY_ALLOCATION_ERROR;
+            break;
+        }
+    }
+}
+
+CharsetDetector::~CharsetDetector()
+{
+    delete textIn;
+
+    for(int32_t i = 0; i < fCSRecognizers_size; i += 1) {
+        delete resultArray[i];
+    }
+
+    uprv_free(resultArray);
+
+    if (fEnabledRecognizers) {
+        uprv_free(fEnabledRecognizers);
+    }
+}
+
+void CharsetDetector::setText(const char *in, int32_t len)
+{
+    textIn->setText(in, len);
+    fFreshTextSet = TRUE;
+}
+
+UBool CharsetDetector::setStripTagsFlag(UBool flag)
+{
+    UBool temp = fStripTags;
+    fStripTags = flag;
+    fFreshTextSet = TRUE;
+    return temp;
+}
+
+UBool CharsetDetector::getStripTagsFlag() const
+{
+    return fStripTags;
+}
+
+void CharsetDetector::setDeclaredEncoding(const char *encoding, int32_t len) const
+{
+    textIn->setDeclaredEncoding(encoding,len);
+}
+
+int32_t CharsetDetector::getDetectableCount()
+{
+    UErrorCode status = U_ZERO_ERROR;
+
+    setRecognizers(status);
+
+    return fCSRecognizers_size; 
+}
+
+const CharsetMatch *CharsetDetector::detect(UErrorCode &status)
+{
+    int32_t maxMatchesFound = 0;
+
+    detectAll(maxMatchesFound, status);
+
+    if(maxMatchesFound > 0) {
+        return resultArray[0];
+    } else {
+        return NULL;
+    }
+}
+
+const CharsetMatch * const *CharsetDetector::detectAll(int32_t &maxMatchesFound, UErrorCode &status)
+{
+    if(!textIn->isSet()) {
+        status = U_MISSING_RESOURCE_ERROR;// TODO:  Need to set proper status code for input text not set
+
+        return NULL;
+    } else if (fFreshTextSet) {
+        CharsetRecognizer *csr;
+        int32_t            i;
+
+        textIn->MungeInput(fStripTags);
+
+        // Iterate over all possible charsets, remember all that
+        // give a match quality > 0.
+        resultCount = 0;
+        for (i = 0; i < fCSRecognizers_size; i += 1) {
+            csr = fCSRecognizers[i]->recognizer;
+            if (csr->match(textIn, resultArray[resultCount])) {
+                resultCount++;
+            }
+        }
+
+        if (resultCount > 1) {
+            uprv_sortArray(resultArray, resultCount, sizeof resultArray[0], charsetMatchComparator, NULL, TRUE, &status);
+        }
+        fFreshTextSet = FALSE;
+    }
+
+    maxMatchesFound = resultCount;
+
+    return resultArray;
+}
+
+void CharsetDetector::setDetectableCharset(const char *encoding, UBool enabled, UErrorCode &status)
+{
+    if (U_FAILURE(status)) {
+        return;
+    }
+
+    int32_t modIdx = -1;
+    UBool isDefaultVal = FALSE;
+    for (int32_t i = 0; i < fCSRecognizers_size; i++) {
+        CSRecognizerInfo *csrinfo = fCSRecognizers[i];
+        if (uprv_strcmp(csrinfo->recognizer->getName(), encoding) == 0) {
+            modIdx = i;
+            isDefaultVal = (csrinfo->isDefaultEnabled == enabled);
+            break;
+        }
+    }
+    if (modIdx < 0) {
+        // No matching encoding found
+        status = U_ILLEGAL_ARGUMENT_ERROR;
+        return;
+    }
+
+    if (fEnabledRecognizers == NULL && !isDefaultVal) {
+        // Create an array storing the non default setting
+        fEnabledRecognizers = NEW_ARRAY(UBool, fCSRecognizers_size);
+        if (fEnabledRecognizers == NULL) {
+            status = U_MEMORY_ALLOCATION_ERROR;
+            return;
+        }
+        // Initialize the array with default info
+        for (int32_t i = 0; i < fCSRecognizers_size; i++) {
+            fEnabledRecognizers[i] = fCSRecognizers[i]->isDefaultEnabled;
+        }
+    }
+
+    if (fEnabledRecognizers != NULL) {
+        fEnabledRecognizers[modIdx] = enabled;
+    }
+}
+
+/*const char *CharsetDetector::getCharsetName(int32_t index, UErrorCode &status) const
+{
+    if( index > fCSRecognizers_size-1 || index < 0) {
+        status = U_INDEX_OUTOFBOUNDS_ERROR;
+
+        return 0;
+    } else {
+        return fCSRecognizers[index]->getName();
+    }
+}*/
+
+U_NAMESPACE_END
+
+U_CDECL_BEGIN
+typedef struct {
+    int32_t currIndex;
+    UBool all;
+    UBool *enabledRecognizers;
+} Context;
+
+
+
+static void U_CALLCONV
+enumClose(UEnumeration *en) {
+    if(en->context != NULL) {
+        DELETE_ARRAY(en->context);
+    }
+
+    DELETE_ARRAY(en);
+}
+
+static int32_t U_CALLCONV
+enumCount(UEnumeration *en, UErrorCode *) {
+    if (((Context *)en->context)->all) {
+        // ucsdet_getAllDetectableCharsets, all charset detector names
+        return fCSRecognizers_size;
+    }
+
+    // Otherwise, ucsdet_getDetectableCharsets - only enabled ones
+    int32_t count = 0;
+    UBool *enabledArray = ((Context *)en->context)->enabledRecognizers;
+    if (enabledArray != NULL) {
+        // custom set
+        for (int32_t i = 0; i < fCSRecognizers_size; i++) {
+            if (enabledArray[i]) {
+                count++;
+            }
+        }
+    } else {
+        // default set
+        for (int32_t i = 0; i < fCSRecognizers_size; i++) {
+            if (fCSRecognizers[i]->isDefaultEnabled) {
+                count++;
+            }
+        }
+    }
+    return count;
+}
+
+static const char* U_CALLCONV
+enumNext(UEnumeration *en, int32_t *resultLength, UErrorCode * /*status*/) {
+    const char *currName = NULL;
+
+    if (((Context *)en->context)->currIndex < fCSRecognizers_size) {
+        if (((Context *)en->context)->all) {
+            // ucsdet_getAllDetectableCharsets, all charset detector names
+            currName = fCSRecognizers[((Context *)en->context)->currIndex]->recognizer->getName();
+            ((Context *)en->context)->currIndex++;
+        } else {
+            // ucsdet_getDetectableCharsets
+            UBool *enabledArray = ((Context *)en->context)->enabledRecognizers;
+            if (enabledArray != NULL) {
+                // custome set
+                while (currName == NULL && ((Context *)en->context)->currIndex < fCSRecognizers_size) {
+                    if (enabledArray[((Context *)en->context)->currIndex]) {
+                        currName = fCSRecognizers[((Context *)en->context)->currIndex]->recognizer->getName();
+                    }
+                    ((Context *)en->context)->currIndex++;
+                }
+            } else {
+                // default set
+                while (currName == NULL && ((Context *)en->context)->currIndex < fCSRecognizers_size) {
+                    if (fCSRecognizers[((Context *)en->context)->currIndex]->isDefaultEnabled) {
+                        currName = fCSRecognizers[((Context *)en->context)->currIndex]->recognizer->getName();
+                    }
+                    ((Context *)en->context)->currIndex++;
+                }
+            }
+        }
+    }
+
+    if(resultLength != NULL) {
+        *resultLength = currName == NULL ? 0 : (int32_t)uprv_strlen(currName);
+    }
+
+    return currName;
+}
+
+
+static void U_CALLCONV
+enumReset(UEnumeration *en, UErrorCode *) {
+    ((Context *)en->context)->currIndex = 0;
+}
+
+static const UEnumeration gCSDetEnumeration = {
+    NULL,
+    NULL,
+    enumClose,
+    enumCount,
+    uenum_unextDefault,
+    enumNext,
+    enumReset
+};
+
+U_CDECL_END
+
+U_NAMESPACE_BEGIN
+
+UEnumeration * CharsetDetector::getAllDetectableCharsets(UErrorCode &status)
+{
+
+    /* Initialize recognized charsets. */
+    setRecognizers(status);
+
+    if(U_FAILURE(status)) {
+        return 0;
+    }
+
+    UEnumeration *en = NEW_ARRAY(UEnumeration, 1);
+    if (en == NULL) {
+        status = U_MEMORY_ALLOCATION_ERROR;
+        return 0;
+    }
+    memcpy(en, &gCSDetEnumeration, sizeof(UEnumeration));
+    en->context = (void*)NEW_ARRAY(Context, 1);
+    if (en->context == NULL) {
+        status = U_MEMORY_ALLOCATION_ERROR;
+        DELETE_ARRAY(en);
+        return 0;
+    }
+    uprv_memset(en->context, 0, sizeof(Context));
+    ((Context*)en->context)->all = TRUE;
+    return en;
+}
+
+UEnumeration * CharsetDetector::getDetectableCharsets(UErrorCode &status) const
+{
+    if(U_FAILURE(status)) {
+        return 0;
+    }
+
+    UEnumeration *en = NEW_ARRAY(UEnumeration, 1);
+    if (en == NULL) {
+        status = U_MEMORY_ALLOCATION_ERROR;
+        return 0;
+    }
+    memcpy(en, &gCSDetEnumeration, sizeof(UEnumeration));
+    en->context = (void*)NEW_ARRAY(Context, 1);
+    if (en->context == NULL) {
+        status = U_MEMORY_ALLOCATION_ERROR;
+        DELETE_ARRAY(en);
+        return 0;
+    }
+    uprv_memset(en->context, 0, sizeof(Context));
+    ((Context*)en->context)->all = FALSE;
+    ((Context*)en->context)->enabledRecognizers = fEnabledRecognizers;
+    return en;
+}
+
+U_NAMESPACE_END
+
+#endif
author	neksard <neksard@yandex-team.ru>	2022-02-10 16:45:33 +0300
committer	Daniil Cherednik <dcherednik@yandex-team.ru>	2022-02-10 16:45:33 +0300
commit	1d9c550e7c38e051d7961f576013a482003a70d9 (patch)
tree	b2cc84ee7850122e7ccf51d0ea21e4fa7e7a5685 /contrib/libs/icu/i18n/csdetect.cpp
parent	8f7cf138264e0caa318144bf8a2c950e0b0a8593 (diff)
download	ydb-1d9c550e7c38e051d7961f576013a482003a70d9.tar.gz