diff options
author | neksard <neksard@yandex-team.ru> | 2022-02-10 16:45:33 +0300 |
---|---|---|
committer | Daniil Cherednik <dcherednik@yandex-team.ru> | 2022-02-10 16:45:33 +0300 |
commit | 1d9c550e7c38e051d7961f576013a482003a70d9 (patch) | |
tree | b2cc84ee7850122e7ccf51d0ea21e4fa7e7a5685 /contrib/libs/icu/i18n/csdetect.cpp | |
parent | 8f7cf138264e0caa318144bf8a2c950e0b0a8593 (diff) | |
download | ydb-1d9c550e7c38e051d7961f576013a482003a70d9.tar.gz |
Restoring authorship annotation for <neksard@yandex-team.ru>. Commit 2 of 2.
Diffstat (limited to 'contrib/libs/icu/i18n/csdetect.cpp')
-rw-r--r-- | contrib/libs/icu/i18n/csdetect.cpp | 964 |
1 files changed, 482 insertions, 482 deletions
diff --git a/contrib/libs/icu/i18n/csdetect.cpp b/contrib/libs/icu/i18n/csdetect.cpp index 0221ec4043..babb308430 100644 --- a/contrib/libs/icu/i18n/csdetect.cpp +++ b/contrib/libs/icu/i18n/csdetect.cpp @@ -1,487 +1,487 @@ // © 2016 and later: Unicode, Inc. and others. -// License & terms of use: http://www.unicode.org/copyright.html -/* - ********************************************************************** - * Copyright (C) 2005-2016, International Business Machines - * Corporation and others. All Rights Reserved. - ********************************************************************** - */ - -#include "unicode/utypes.h" - -#if !UCONFIG_NO_CONVERSION - -#include "unicode/ucsdet.h" - -#include "csdetect.h" -#include "csmatch.h" -#include "uenumimp.h" - -#include "cmemory.h" -#include "cstring.h" -#include "umutex.h" -#include "ucln_in.h" -#include "uarrsort.h" -#include "inputext.h" -#include "csrsbcs.h" -#include "csrmbcs.h" -#include "csrutf8.h" -#include "csrucode.h" -#include "csr2022.h" - -#define NEW_ARRAY(type,count) (type *) uprv_malloc((count) * sizeof(type)) -#define DELETE_ARRAY(array) uprv_free((void *) (array)) - -U_NAMESPACE_BEGIN - -struct CSRecognizerInfo : public UMemory { - CSRecognizerInfo(CharsetRecognizer *recognizer, UBool isDefaultEnabled) +// License & terms of use: http://www.unicode.org/copyright.html +/* + ********************************************************************** + * Copyright (C) 2005-2016, International Business Machines + * Corporation and others. All Rights Reserved. + ********************************************************************** + */ + +#include "unicode/utypes.h" + +#if !UCONFIG_NO_CONVERSION + +#include "unicode/ucsdet.h" + +#include "csdetect.h" +#include "csmatch.h" +#include "uenumimp.h" + +#include "cmemory.h" +#include "cstring.h" +#include "umutex.h" +#include "ucln_in.h" +#include "uarrsort.h" +#include "inputext.h" +#include "csrsbcs.h" +#include "csrmbcs.h" +#include "csrutf8.h" +#include "csrucode.h" +#include "csr2022.h" + +#define NEW_ARRAY(type,count) (type *) uprv_malloc((count) * sizeof(type)) +#define DELETE_ARRAY(array) uprv_free((void *) (array)) + +U_NAMESPACE_BEGIN + +struct CSRecognizerInfo : public UMemory { + CSRecognizerInfo(CharsetRecognizer *recognizer, UBool isDefaultEnabled) : recognizer(recognizer), isDefaultEnabled(isDefaultEnabled) {} - + ~CSRecognizerInfo() {delete recognizer;} - - CharsetRecognizer *recognizer; - UBool isDefaultEnabled; -}; - -U_NAMESPACE_END - -static icu::CSRecognizerInfo **fCSRecognizers = NULL; + + CharsetRecognizer *recognizer; + UBool isDefaultEnabled; +}; + +U_NAMESPACE_END + +static icu::CSRecognizerInfo **fCSRecognizers = NULL; static icu::UInitOnce gCSRecognizersInitOnce = U_INITONCE_INITIALIZER; -static int32_t fCSRecognizers_size = 0; - -U_CDECL_BEGIN -static UBool U_CALLCONV csdet_cleanup(void) -{ - U_NAMESPACE_USE - if (fCSRecognizers != NULL) { - for(int32_t r = 0; r < fCSRecognizers_size; r += 1) { - delete fCSRecognizers[r]; - fCSRecognizers[r] = NULL; - } - - DELETE_ARRAY(fCSRecognizers); - fCSRecognizers = NULL; - fCSRecognizers_size = 0; +static int32_t fCSRecognizers_size = 0; + +U_CDECL_BEGIN +static UBool U_CALLCONV csdet_cleanup(void) +{ + U_NAMESPACE_USE + if (fCSRecognizers != NULL) { + for(int32_t r = 0; r < fCSRecognizers_size; r += 1) { + delete fCSRecognizers[r]; + fCSRecognizers[r] = NULL; + } + + DELETE_ARRAY(fCSRecognizers); + fCSRecognizers = NULL; + fCSRecognizers_size = 0; + } + gCSRecognizersInitOnce.reset(); + + return TRUE; +} + +static int32_t U_CALLCONV +charsetMatchComparator(const void * /*context*/, const void *left, const void *right) +{ + U_NAMESPACE_USE + + const CharsetMatch **csm_l = (const CharsetMatch **) left; + const CharsetMatch **csm_r = (const CharsetMatch **) right; + + // NOTE: compare is backwards to sort from highest to lowest. + return (*csm_r)->getConfidence() - (*csm_l)->getConfidence(); +} + +static void U_CALLCONV initRecognizers(UErrorCode &status) { + U_NAMESPACE_USE + ucln_i18n_registerCleanup(UCLN_I18N_CSDET, csdet_cleanup); + CSRecognizerInfo *tempArray[] = { + new CSRecognizerInfo(new CharsetRecog_UTF8(), TRUE), + + new CSRecognizerInfo(new CharsetRecog_UTF_16_BE(), TRUE), + new CSRecognizerInfo(new CharsetRecog_UTF_16_LE(), TRUE), + new CSRecognizerInfo(new CharsetRecog_UTF_32_BE(), TRUE), + new CSRecognizerInfo(new CharsetRecog_UTF_32_LE(), TRUE), + + new CSRecognizerInfo(new CharsetRecog_8859_1(), TRUE), + new CSRecognizerInfo(new CharsetRecog_8859_2(), TRUE), + new CSRecognizerInfo(new CharsetRecog_8859_5_ru(), TRUE), + new CSRecognizerInfo(new CharsetRecog_8859_6_ar(), TRUE), + new CSRecognizerInfo(new CharsetRecog_8859_7_el(), TRUE), + new CSRecognizerInfo(new CharsetRecog_8859_8_I_he(), TRUE), + new CSRecognizerInfo(new CharsetRecog_8859_8_he(), TRUE), + new CSRecognizerInfo(new CharsetRecog_windows_1251(), TRUE), + new CSRecognizerInfo(new CharsetRecog_windows_1256(), TRUE), + new CSRecognizerInfo(new CharsetRecog_KOI8_R(), TRUE), + new CSRecognizerInfo(new CharsetRecog_8859_9_tr(), TRUE), + new CSRecognizerInfo(new CharsetRecog_sjis(), TRUE), + new CSRecognizerInfo(new CharsetRecog_gb_18030(), TRUE), + new CSRecognizerInfo(new CharsetRecog_euc_jp(), TRUE), + new CSRecognizerInfo(new CharsetRecog_euc_kr(), TRUE), + new CSRecognizerInfo(new CharsetRecog_big5(), TRUE), + + new CSRecognizerInfo(new CharsetRecog_2022JP(), TRUE), +#if !UCONFIG_ONLY_HTML_CONVERSION + new CSRecognizerInfo(new CharsetRecog_2022KR(), TRUE), + new CSRecognizerInfo(new CharsetRecog_2022CN(), TRUE), + + new CSRecognizerInfo(new CharsetRecog_IBM424_he_rtl(), FALSE), + new CSRecognizerInfo(new CharsetRecog_IBM424_he_ltr(), FALSE), + new CSRecognizerInfo(new CharsetRecog_IBM420_ar_rtl(), FALSE), + new CSRecognizerInfo(new CharsetRecog_IBM420_ar_ltr(), FALSE) +#endif + }; + int32_t rCount = UPRV_LENGTHOF(tempArray); + + fCSRecognizers = NEW_ARRAY(CSRecognizerInfo *, rCount); + + if (fCSRecognizers == NULL) { + status = U_MEMORY_ALLOCATION_ERROR; } - gCSRecognizersInitOnce.reset(); - - return TRUE; -} - -static int32_t U_CALLCONV -charsetMatchComparator(const void * /*context*/, const void *left, const void *right) -{ - U_NAMESPACE_USE - - const CharsetMatch **csm_l = (const CharsetMatch **) left; - const CharsetMatch **csm_r = (const CharsetMatch **) right; - - // NOTE: compare is backwards to sort from highest to lowest. - return (*csm_r)->getConfidence() - (*csm_l)->getConfidence(); -} - -static void U_CALLCONV initRecognizers(UErrorCode &status) { - U_NAMESPACE_USE - ucln_i18n_registerCleanup(UCLN_I18N_CSDET, csdet_cleanup); - CSRecognizerInfo *tempArray[] = { - new CSRecognizerInfo(new CharsetRecog_UTF8(), TRUE), - - new CSRecognizerInfo(new CharsetRecog_UTF_16_BE(), TRUE), - new CSRecognizerInfo(new CharsetRecog_UTF_16_LE(), TRUE), - new CSRecognizerInfo(new CharsetRecog_UTF_32_BE(), TRUE), - new CSRecognizerInfo(new CharsetRecog_UTF_32_LE(), TRUE), - - new CSRecognizerInfo(new CharsetRecog_8859_1(), TRUE), - new CSRecognizerInfo(new CharsetRecog_8859_2(), TRUE), - new CSRecognizerInfo(new CharsetRecog_8859_5_ru(), TRUE), - new CSRecognizerInfo(new CharsetRecog_8859_6_ar(), TRUE), - new CSRecognizerInfo(new CharsetRecog_8859_7_el(), TRUE), - new CSRecognizerInfo(new CharsetRecog_8859_8_I_he(), TRUE), - new CSRecognizerInfo(new CharsetRecog_8859_8_he(), TRUE), - new CSRecognizerInfo(new CharsetRecog_windows_1251(), TRUE), - new CSRecognizerInfo(new CharsetRecog_windows_1256(), TRUE), - new CSRecognizerInfo(new CharsetRecog_KOI8_R(), TRUE), - new CSRecognizerInfo(new CharsetRecog_8859_9_tr(), TRUE), - new CSRecognizerInfo(new CharsetRecog_sjis(), TRUE), - new CSRecognizerInfo(new CharsetRecog_gb_18030(), TRUE), - new CSRecognizerInfo(new CharsetRecog_euc_jp(), TRUE), - new CSRecognizerInfo(new CharsetRecog_euc_kr(), TRUE), - new CSRecognizerInfo(new CharsetRecog_big5(), TRUE), - - new CSRecognizerInfo(new CharsetRecog_2022JP(), TRUE), -#if !UCONFIG_ONLY_HTML_CONVERSION - new CSRecognizerInfo(new CharsetRecog_2022KR(), TRUE), - new CSRecognizerInfo(new CharsetRecog_2022CN(), TRUE), - - new CSRecognizerInfo(new CharsetRecog_IBM424_he_rtl(), FALSE), - new CSRecognizerInfo(new CharsetRecog_IBM424_he_ltr(), FALSE), - new CSRecognizerInfo(new CharsetRecog_IBM420_ar_rtl(), FALSE), - new CSRecognizerInfo(new CharsetRecog_IBM420_ar_ltr(), FALSE) -#endif - }; - int32_t rCount = UPRV_LENGTHOF(tempArray); - - fCSRecognizers = NEW_ARRAY(CSRecognizerInfo *, rCount); - - if (fCSRecognizers == NULL) { - status = U_MEMORY_ALLOCATION_ERROR; - } - else { - fCSRecognizers_size = rCount; - for (int32_t r = 0; r < rCount; r += 1) { - fCSRecognizers[r] = tempArray[r]; - if (fCSRecognizers[r] == NULL) { - status = U_MEMORY_ALLOCATION_ERROR; - } - } - } -} - -U_CDECL_END - -U_NAMESPACE_BEGIN - -void CharsetDetector::setRecognizers(UErrorCode &status) -{ - umtx_initOnce(gCSRecognizersInitOnce, &initRecognizers, status); -} - -CharsetDetector::CharsetDetector(UErrorCode &status) - : textIn(new InputText(status)), resultArray(NULL), - resultCount(0), fStripTags(FALSE), fFreshTextSet(FALSE), - fEnabledRecognizers(NULL) -{ - if (U_FAILURE(status)) { - return; - } - - setRecognizers(status); - - if (U_FAILURE(status)) { - return; - } - - resultArray = (CharsetMatch **)uprv_malloc(sizeof(CharsetMatch *)*fCSRecognizers_size); - - if (resultArray == NULL) { - status = U_MEMORY_ALLOCATION_ERROR; - return; - } - - for(int32_t i = 0; i < fCSRecognizers_size; i += 1) { - resultArray[i] = new CharsetMatch(); - - if (resultArray[i] == NULL) { - status = U_MEMORY_ALLOCATION_ERROR; - break; - } - } -} - -CharsetDetector::~CharsetDetector() -{ - delete textIn; - - for(int32_t i = 0; i < fCSRecognizers_size; i += 1) { - delete resultArray[i]; - } - - uprv_free(resultArray); - - if (fEnabledRecognizers) { - uprv_free(fEnabledRecognizers); - } -} - -void CharsetDetector::setText(const char *in, int32_t len) -{ - textIn->setText(in, len); - fFreshTextSet = TRUE; -} - -UBool CharsetDetector::setStripTagsFlag(UBool flag) -{ - UBool temp = fStripTags; - fStripTags = flag; - fFreshTextSet = TRUE; - return temp; -} - -UBool CharsetDetector::getStripTagsFlag() const -{ - return fStripTags; -} - -void CharsetDetector::setDeclaredEncoding(const char *encoding, int32_t len) const -{ - textIn->setDeclaredEncoding(encoding,len); -} - -int32_t CharsetDetector::getDetectableCount() -{ - UErrorCode status = U_ZERO_ERROR; - - setRecognizers(status); - - return fCSRecognizers_size; -} - -const CharsetMatch *CharsetDetector::detect(UErrorCode &status) -{ - int32_t maxMatchesFound = 0; - - detectAll(maxMatchesFound, status); - - if(maxMatchesFound > 0) { - return resultArray[0]; - } else { - return NULL; - } -} - -const CharsetMatch * const *CharsetDetector::detectAll(int32_t &maxMatchesFound, UErrorCode &status) -{ - if(!textIn->isSet()) { - status = U_MISSING_RESOURCE_ERROR;// TODO: Need to set proper status code for input text not set - - return NULL; - } else if (fFreshTextSet) { - CharsetRecognizer *csr; - int32_t i; - - textIn->MungeInput(fStripTags); - - // Iterate over all possible charsets, remember all that - // give a match quality > 0. - resultCount = 0; - for (i = 0; i < fCSRecognizers_size; i += 1) { - csr = fCSRecognizers[i]->recognizer; - if (csr->match(textIn, resultArray[resultCount])) { - resultCount++; - } - } - - if (resultCount > 1) { - uprv_sortArray(resultArray, resultCount, sizeof resultArray[0], charsetMatchComparator, NULL, TRUE, &status); - } - fFreshTextSet = FALSE; - } - - maxMatchesFound = resultCount; - - return resultArray; -} - -void CharsetDetector::setDetectableCharset(const char *encoding, UBool enabled, UErrorCode &status) -{ - if (U_FAILURE(status)) { - return; - } - - int32_t modIdx = -1; - UBool isDefaultVal = FALSE; - for (int32_t i = 0; i < fCSRecognizers_size; i++) { - CSRecognizerInfo *csrinfo = fCSRecognizers[i]; - if (uprv_strcmp(csrinfo->recognizer->getName(), encoding) == 0) { - modIdx = i; - isDefaultVal = (csrinfo->isDefaultEnabled == enabled); - break; - } - } - if (modIdx < 0) { - // No matching encoding found - status = U_ILLEGAL_ARGUMENT_ERROR; - return; - } - - if (fEnabledRecognizers == NULL && !isDefaultVal) { - // Create an array storing the non default setting - fEnabledRecognizers = NEW_ARRAY(UBool, fCSRecognizers_size); - if (fEnabledRecognizers == NULL) { - status = U_MEMORY_ALLOCATION_ERROR; - return; - } - // Initialize the array with default info - for (int32_t i = 0; i < fCSRecognizers_size; i++) { - fEnabledRecognizers[i] = fCSRecognizers[i]->isDefaultEnabled; - } - } - - if (fEnabledRecognizers != NULL) { - fEnabledRecognizers[modIdx] = enabled; - } -} - -/*const char *CharsetDetector::getCharsetName(int32_t index, UErrorCode &status) const -{ - if( index > fCSRecognizers_size-1 || index < 0) { - status = U_INDEX_OUTOFBOUNDS_ERROR; - - return 0; - } else { - return fCSRecognizers[index]->getName(); - } -}*/ - -U_NAMESPACE_END - -U_CDECL_BEGIN -typedef struct { - int32_t currIndex; - UBool all; - UBool *enabledRecognizers; -} Context; - - - -static void U_CALLCONV -enumClose(UEnumeration *en) { - if(en->context != NULL) { - DELETE_ARRAY(en->context); - } - - DELETE_ARRAY(en); -} - -static int32_t U_CALLCONV -enumCount(UEnumeration *en, UErrorCode *) { - if (((Context *)en->context)->all) { - // ucsdet_getAllDetectableCharsets, all charset detector names - return fCSRecognizers_size; - } - - // Otherwise, ucsdet_getDetectableCharsets - only enabled ones - int32_t count = 0; - UBool *enabledArray = ((Context *)en->context)->enabledRecognizers; - if (enabledArray != NULL) { - // custom set - for (int32_t i = 0; i < fCSRecognizers_size; i++) { - if (enabledArray[i]) { - count++; - } - } - } else { - // default set - for (int32_t i = 0; i < fCSRecognizers_size; i++) { - if (fCSRecognizers[i]->isDefaultEnabled) { - count++; - } - } - } - return count; -} - -static const char* U_CALLCONV -enumNext(UEnumeration *en, int32_t *resultLength, UErrorCode * /*status*/) { - const char *currName = NULL; - - if (((Context *)en->context)->currIndex < fCSRecognizers_size) { - if (((Context *)en->context)->all) { - // ucsdet_getAllDetectableCharsets, all charset detector names - currName = fCSRecognizers[((Context *)en->context)->currIndex]->recognizer->getName(); - ((Context *)en->context)->currIndex++; - } else { - // ucsdet_getDetectableCharsets - UBool *enabledArray = ((Context *)en->context)->enabledRecognizers; - if (enabledArray != NULL) { - // custome set - while (currName == NULL && ((Context *)en->context)->currIndex < fCSRecognizers_size) { - if (enabledArray[((Context *)en->context)->currIndex]) { - currName = fCSRecognizers[((Context *)en->context)->currIndex]->recognizer->getName(); - } - ((Context *)en->context)->currIndex++; - } - } else { - // default set - while (currName == NULL && ((Context *)en->context)->currIndex < fCSRecognizers_size) { - if (fCSRecognizers[((Context *)en->context)->currIndex]->isDefaultEnabled) { - currName = fCSRecognizers[((Context *)en->context)->currIndex]->recognizer->getName(); - } - ((Context *)en->context)->currIndex++; - } - } - } - } - - if(resultLength != NULL) { - *resultLength = currName == NULL ? 0 : (int32_t)uprv_strlen(currName); - } - - return currName; -} - - -static void U_CALLCONV -enumReset(UEnumeration *en, UErrorCode *) { - ((Context *)en->context)->currIndex = 0; -} - -static const UEnumeration gCSDetEnumeration = { - NULL, - NULL, - enumClose, - enumCount, - uenum_unextDefault, - enumNext, - enumReset -}; - -U_CDECL_END - -U_NAMESPACE_BEGIN - -UEnumeration * CharsetDetector::getAllDetectableCharsets(UErrorCode &status) -{ - - /* Initialize recognized charsets. */ - setRecognizers(status); - - if(U_FAILURE(status)) { - return 0; - } - - UEnumeration *en = NEW_ARRAY(UEnumeration, 1); - if (en == NULL) { - status = U_MEMORY_ALLOCATION_ERROR; - return 0; - } - memcpy(en, &gCSDetEnumeration, sizeof(UEnumeration)); - en->context = (void*)NEW_ARRAY(Context, 1); - if (en->context == NULL) { - status = U_MEMORY_ALLOCATION_ERROR; - DELETE_ARRAY(en); - return 0; - } - uprv_memset(en->context, 0, sizeof(Context)); - ((Context*)en->context)->all = TRUE; - return en; -} - -UEnumeration * CharsetDetector::getDetectableCharsets(UErrorCode &status) const -{ - if(U_FAILURE(status)) { - return 0; - } - - UEnumeration *en = NEW_ARRAY(UEnumeration, 1); - if (en == NULL) { - status = U_MEMORY_ALLOCATION_ERROR; - return 0; - } - memcpy(en, &gCSDetEnumeration, sizeof(UEnumeration)); - en->context = (void*)NEW_ARRAY(Context, 1); - if (en->context == NULL) { - status = U_MEMORY_ALLOCATION_ERROR; - DELETE_ARRAY(en); - return 0; - } - uprv_memset(en->context, 0, sizeof(Context)); - ((Context*)en->context)->all = FALSE; - ((Context*)en->context)->enabledRecognizers = fEnabledRecognizers; - return en; -} - -U_NAMESPACE_END - -#endif + else { + fCSRecognizers_size = rCount; + for (int32_t r = 0; r < rCount; r += 1) { + fCSRecognizers[r] = tempArray[r]; + if (fCSRecognizers[r] == NULL) { + status = U_MEMORY_ALLOCATION_ERROR; + } + } + } +} + +U_CDECL_END + +U_NAMESPACE_BEGIN + +void CharsetDetector::setRecognizers(UErrorCode &status) +{ + umtx_initOnce(gCSRecognizersInitOnce, &initRecognizers, status); +} + +CharsetDetector::CharsetDetector(UErrorCode &status) + : textIn(new InputText(status)), resultArray(NULL), + resultCount(0), fStripTags(FALSE), fFreshTextSet(FALSE), + fEnabledRecognizers(NULL) +{ + if (U_FAILURE(status)) { + return; + } + + setRecognizers(status); + + if (U_FAILURE(status)) { + return; + } + + resultArray = (CharsetMatch **)uprv_malloc(sizeof(CharsetMatch *)*fCSRecognizers_size); + + if (resultArray == NULL) { + status = U_MEMORY_ALLOCATION_ERROR; + return; + } + + for(int32_t i = 0; i < fCSRecognizers_size; i += 1) { + resultArray[i] = new CharsetMatch(); + + if (resultArray[i] == NULL) { + status = U_MEMORY_ALLOCATION_ERROR; + break; + } + } +} + +CharsetDetector::~CharsetDetector() +{ + delete textIn; + + for(int32_t i = 0; i < fCSRecognizers_size; i += 1) { + delete resultArray[i]; + } + + uprv_free(resultArray); + + if (fEnabledRecognizers) { + uprv_free(fEnabledRecognizers); + } +} + +void CharsetDetector::setText(const char *in, int32_t len) +{ + textIn->setText(in, len); + fFreshTextSet = TRUE; +} + +UBool CharsetDetector::setStripTagsFlag(UBool flag) +{ + UBool temp = fStripTags; + fStripTags = flag; + fFreshTextSet = TRUE; + return temp; +} + +UBool CharsetDetector::getStripTagsFlag() const +{ + return fStripTags; +} + +void CharsetDetector::setDeclaredEncoding(const char *encoding, int32_t len) const +{ + textIn->setDeclaredEncoding(encoding,len); +} + +int32_t CharsetDetector::getDetectableCount() +{ + UErrorCode status = U_ZERO_ERROR; + + setRecognizers(status); + + return fCSRecognizers_size; +} + +const CharsetMatch *CharsetDetector::detect(UErrorCode &status) +{ + int32_t maxMatchesFound = 0; + + detectAll(maxMatchesFound, status); + + if(maxMatchesFound > 0) { + return resultArray[0]; + } else { + return NULL; + } +} + +const CharsetMatch * const *CharsetDetector::detectAll(int32_t &maxMatchesFound, UErrorCode &status) +{ + if(!textIn->isSet()) { + status = U_MISSING_RESOURCE_ERROR;// TODO: Need to set proper status code for input text not set + + return NULL; + } else if (fFreshTextSet) { + CharsetRecognizer *csr; + int32_t i; + + textIn->MungeInput(fStripTags); + + // Iterate over all possible charsets, remember all that + // give a match quality > 0. + resultCount = 0; + for (i = 0; i < fCSRecognizers_size; i += 1) { + csr = fCSRecognizers[i]->recognizer; + if (csr->match(textIn, resultArray[resultCount])) { + resultCount++; + } + } + + if (resultCount > 1) { + uprv_sortArray(resultArray, resultCount, sizeof resultArray[0], charsetMatchComparator, NULL, TRUE, &status); + } + fFreshTextSet = FALSE; + } + + maxMatchesFound = resultCount; + + return resultArray; +} + +void CharsetDetector::setDetectableCharset(const char *encoding, UBool enabled, UErrorCode &status) +{ + if (U_FAILURE(status)) { + return; + } + + int32_t modIdx = -1; + UBool isDefaultVal = FALSE; + for (int32_t i = 0; i < fCSRecognizers_size; i++) { + CSRecognizerInfo *csrinfo = fCSRecognizers[i]; + if (uprv_strcmp(csrinfo->recognizer->getName(), encoding) == 0) { + modIdx = i; + isDefaultVal = (csrinfo->isDefaultEnabled == enabled); + break; + } + } + if (modIdx < 0) { + // No matching encoding found + status = U_ILLEGAL_ARGUMENT_ERROR; + return; + } + + if (fEnabledRecognizers == NULL && !isDefaultVal) { + // Create an array storing the non default setting + fEnabledRecognizers = NEW_ARRAY(UBool, fCSRecognizers_size); + if (fEnabledRecognizers == NULL) { + status = U_MEMORY_ALLOCATION_ERROR; + return; + } + // Initialize the array with default info + for (int32_t i = 0; i < fCSRecognizers_size; i++) { + fEnabledRecognizers[i] = fCSRecognizers[i]->isDefaultEnabled; + } + } + + if (fEnabledRecognizers != NULL) { + fEnabledRecognizers[modIdx] = enabled; + } +} + +/*const char *CharsetDetector::getCharsetName(int32_t index, UErrorCode &status) const +{ + if( index > fCSRecognizers_size-1 || index < 0) { + status = U_INDEX_OUTOFBOUNDS_ERROR; + + return 0; + } else { + return fCSRecognizers[index]->getName(); + } +}*/ + +U_NAMESPACE_END + +U_CDECL_BEGIN +typedef struct { + int32_t currIndex; + UBool all; + UBool *enabledRecognizers; +} Context; + + + +static void U_CALLCONV +enumClose(UEnumeration *en) { + if(en->context != NULL) { + DELETE_ARRAY(en->context); + } + + DELETE_ARRAY(en); +} + +static int32_t U_CALLCONV +enumCount(UEnumeration *en, UErrorCode *) { + if (((Context *)en->context)->all) { + // ucsdet_getAllDetectableCharsets, all charset detector names + return fCSRecognizers_size; + } + + // Otherwise, ucsdet_getDetectableCharsets - only enabled ones + int32_t count = 0; + UBool *enabledArray = ((Context *)en->context)->enabledRecognizers; + if (enabledArray != NULL) { + // custom set + for (int32_t i = 0; i < fCSRecognizers_size; i++) { + if (enabledArray[i]) { + count++; + } + } + } else { + // default set + for (int32_t i = 0; i < fCSRecognizers_size; i++) { + if (fCSRecognizers[i]->isDefaultEnabled) { + count++; + } + } + } + return count; +} + +static const char* U_CALLCONV +enumNext(UEnumeration *en, int32_t *resultLength, UErrorCode * /*status*/) { + const char *currName = NULL; + + if (((Context *)en->context)->currIndex < fCSRecognizers_size) { + if (((Context *)en->context)->all) { + // ucsdet_getAllDetectableCharsets, all charset detector names + currName = fCSRecognizers[((Context *)en->context)->currIndex]->recognizer->getName(); + ((Context *)en->context)->currIndex++; + } else { + // ucsdet_getDetectableCharsets + UBool *enabledArray = ((Context *)en->context)->enabledRecognizers; + if (enabledArray != NULL) { + // custome set + while (currName == NULL && ((Context *)en->context)->currIndex < fCSRecognizers_size) { + if (enabledArray[((Context *)en->context)->currIndex]) { + currName = fCSRecognizers[((Context *)en->context)->currIndex]->recognizer->getName(); + } + ((Context *)en->context)->currIndex++; + } + } else { + // default set + while (currName == NULL && ((Context *)en->context)->currIndex < fCSRecognizers_size) { + if (fCSRecognizers[((Context *)en->context)->currIndex]->isDefaultEnabled) { + currName = fCSRecognizers[((Context *)en->context)->currIndex]->recognizer->getName(); + } + ((Context *)en->context)->currIndex++; + } + } + } + } + + if(resultLength != NULL) { + *resultLength = currName == NULL ? 0 : (int32_t)uprv_strlen(currName); + } + + return currName; +} + + +static void U_CALLCONV +enumReset(UEnumeration *en, UErrorCode *) { + ((Context *)en->context)->currIndex = 0; +} + +static const UEnumeration gCSDetEnumeration = { + NULL, + NULL, + enumClose, + enumCount, + uenum_unextDefault, + enumNext, + enumReset +}; + +U_CDECL_END + +U_NAMESPACE_BEGIN + +UEnumeration * CharsetDetector::getAllDetectableCharsets(UErrorCode &status) +{ + + /* Initialize recognized charsets. */ + setRecognizers(status); + + if(U_FAILURE(status)) { + return 0; + } + + UEnumeration *en = NEW_ARRAY(UEnumeration, 1); + if (en == NULL) { + status = U_MEMORY_ALLOCATION_ERROR; + return 0; + } + memcpy(en, &gCSDetEnumeration, sizeof(UEnumeration)); + en->context = (void*)NEW_ARRAY(Context, 1); + if (en->context == NULL) { + status = U_MEMORY_ALLOCATION_ERROR; + DELETE_ARRAY(en); + return 0; + } + uprv_memset(en->context, 0, sizeof(Context)); + ((Context*)en->context)->all = TRUE; + return en; +} + +UEnumeration * CharsetDetector::getDetectableCharsets(UErrorCode &status) const +{ + if(U_FAILURE(status)) { + return 0; + } + + UEnumeration *en = NEW_ARRAY(UEnumeration, 1); + if (en == NULL) { + status = U_MEMORY_ALLOCATION_ERROR; + return 0; + } + memcpy(en, &gCSDetEnumeration, sizeof(UEnumeration)); + en->context = (void*)NEW_ARRAY(Context, 1); + if (en->context == NULL) { + status = U_MEMORY_ALLOCATION_ERROR; + DELETE_ARRAY(en); + return 0; + } + uprv_memset(en->context, 0, sizeof(Context)); + ((Context*)en->context)->all = FALSE; + ((Context*)en->context)->enabledRecognizers = fEnabledRecognizers; + return en; +} + +U_NAMESPACE_END + +#endif |