diff options
author | Devtools Arcadia <arcadia-devtools@yandex-team.ru> | 2022-02-07 18:08:42 +0300 |
---|---|---|
committer | Devtools Arcadia <arcadia-devtools@mous.vla.yp-c.yandex.net> | 2022-02-07 18:08:42 +0300 |
commit | 1110808a9d39d4b808aef724c861a2e1a38d2a69 (patch) | |
tree | e26c9fed0de5d9873cce7e00bc214573dc2195b7 /contrib/libs/libidn/tld.c | |
download | ydb-1110808a9d39d4b808aef724c861a2e1a38d2a69.tar.gz |
intermediate changes
ref:cde9a383711a11544ce7e107a78147fb96cc4029
Diffstat (limited to 'contrib/libs/libidn/tld.c')
-rw-r--r-- | contrib/libs/libidn/tld.c | 514 |
1 files changed, 514 insertions, 0 deletions
diff --git a/contrib/libs/libidn/tld.c b/contrib/libs/libidn/tld.c new file mode 100644 index 0000000000..4e894663fb --- /dev/null +++ b/contrib/libs/libidn/tld.c @@ -0,0 +1,514 @@ +/* tld.c --- Handle TLD restriction checking. + * Copyright (C) 2004, 2005, 2006, 2007 Simon Josefsson. + * Copyright (C) 2003, 2004 Free Software Foundation, Inc. + * + * Author: Thomas Jacob, Internet24.de + * + * This file is part of GNU Libidn. + * + * GNU Libidn is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * GNU Libidn is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with GNU Libidn; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA + * + */ + +/* Get stringprep_utf8_to_ucs4, stringprep_locale_to_utf8. */ +#include <stringprep.h> + +/* Get strcmp(). */ +#include <string.h> + +/* Get specifications. */ +#include <tld.h> + +/* Array of built-in domain restriction structures. See tlds.c. */ +extern const Tld_table *_tld_tables[]; + +/** + * tld_get_table - get table for a TLD name in table + * @tld: TLD name (e.g. "com") as zero terminated ASCII byte string. + * @tables: Zero terminated array of #Tld_table info-structures for + * TLDs. + * + * Get the TLD table for a named TLD by searching through the given + * TLD table array. + * + * Return value: Return structure corresponding to TLD @tld by going + * thru @tables, or return %NULL if no such structure is found. + */ +const Tld_table * +tld_get_table (const char *tld, const Tld_table ** tables) +{ + const Tld_table **tldtable = NULL; + + if (!tld || !tables) + return NULL; + + for (tldtable = tables; *tldtable; tldtable++) + if (!strcmp ((*tldtable)->name, tld)) + return *tldtable; + + return NULL; +} + +/** + * tld_default_table - get table for a TLD name + * @tld: TLD name (e.g. "com") as zero terminated ASCII byte string. + * @overrides: Additional zero terminated array of #Tld_table + * info-structures for TLDs, or %NULL to only use library deault + * tables. + * + * Get the TLD table for a named TLD, using the internal defaults, + * possibly overrided by the (optional) supplied tables. + * + * Return value: Return structure corresponding to TLD @tld_str, first + * looking through @overrides then thru built-in list, or %NULL if + * no such structure found. + */ +const Tld_table * +tld_default_table (const char *tld, const Tld_table ** overrides) +{ + const Tld_table *tldtable = NULL; + + if (!tld) + return NULL; + + if (overrides) + tldtable = tld_get_table (tld, overrides); + + if (!tldtable) + tldtable = tld_get_table (tld, _tld_tables); + + return tldtable; +} + +#define DOTP(c) ((c) == 0x002E || (c) == 0x3002 || \ + (c) == 0xFF0E || (c) == 0xFF61) + +/** + * tld_get_4 - extract top level domain part in input Unicode string + * @in: Array of unicode code points to process. Does not need to be + * zero terminated. + * @inlen: Number of unicode code points. + * @out: Zero terminated ascii result string pointer. + * + * Isolate the top-level domain of @in and return it as an ASCII + * string in @out. + * + * Return value: Return %TLD_SUCCESS on success, or the corresponding + * #Tld_rc error code otherwise. + */ +int +tld_get_4 (const uint32_t * in, size_t inlen, char **out) +{ + const uint32_t *ipos; + size_t olen; + + *out = NULL; + if (!in || inlen == 0) + return TLD_NODATA; + + ipos = &in[inlen - 1]; + olen = 0; + /* Scan backwards for non(latin)letters. */ + while (ipos >= in && ((*ipos >= 0x41 && *ipos <= 0x5A) || + (*ipos >= 0x61 && *ipos <= 0x7A))) + ipos--, olen++; + + if (olen > 0 && DOTP (*ipos)) /* Found something that appears a TLD. */ + { + char *out_s = malloc (sizeof (char) * (olen + 1)); + char *opos = out_s; + + if (!opos) + return TLD_MALLOC_ERROR; + + ipos++; + /* Transcribe to lowercase ascii string. */ + for (; ipos < &in[inlen]; ipos++, opos++) + *opos = *ipos > 0x5A ? *ipos : *ipos + 0x20; + *opos = 0; + *out = out_s; + return TLD_SUCCESS; + } + + return TLD_NO_TLD; +} + +/** + * tld_get_4z - extract top level domain part in input Unicode string + * @in: Zero terminated array of unicode code points to process. + * @out: Zero terminated ascii result string pointer. + * + * Isolate the top-level domain of @in and return it as an ASCII + * string in @out. + * + * Return value: Return %TLD_SUCCESS on success, or the corresponding + * #Tld_rc error code otherwise. + */ +int +tld_get_4z (const uint32_t * in, char **out) +{ + const uint32_t *ipos = in; + + if (!in) + return TLD_NODATA; + + while (*ipos) + ipos++; + + return tld_get_4 (in, ipos - in, out); +} + +/** + * tld_get_z - extract top level domain part in input string + * @in: Zero terminated character array to process. + * @out: Zero terminated ascii result string pointer. + * + * Isolate the top-level domain of @in and return it as an ASCII + * string in @out. The input string @in may be UTF-8, ISO-8859-1 or + * any ASCII compatible character encoding. + * + * Return value: Return %TLD_SUCCESS on success, or the corresponding + * #Tld_rc error code otherwise. + */ +int +tld_get_z (const char *in, char **out) +{ + uint32_t *iucs; + size_t i, ilen; + int rc; + + ilen = strlen (in); + iucs = calloc (ilen, sizeof (*iucs)); + + if (!iucs) + return TLD_MALLOC_ERROR; + + for (i = 0; i < ilen; i++) + iucs[i] = in[i]; + + rc = tld_get_4 (iucs, ilen, out); + + free (iucs); + + return rc; +} + +/* + * tld_checkchar - verify that character is permitted + * @ch: 32 bit unicode character to check. + * @tld: A #Tld_table data structure to check @ch against. + * + * Verify if @ch is either in [a-z0-9-.] or mentioned as a valid + * character in @tld. + * + * Return value: Return the #Tld_rc value %TLD_SUCCESS if @ch is a + * valid character for the TLD @tld or if @tld is %NULL, + * %TLD_INVALID if @ch is invalid as defined by @tld. + */ +static int +_tld_checkchar (uint32_t ch, const Tld_table * tld) +{ + const Tld_table_element *s, *e, *m; + + if (!tld) + return TLD_SUCCESS; + + /* Check for [-a-z0-9.]. */ + if ((ch >= 0x61 && ch <= 0x7A) || + (ch >= 0x30 && ch <= 0x39) || ch == 0x2D || DOTP (ch)) + return TLD_SUCCESS; + + s = tld->valid; + e = s + tld->nvalid; + while (s < e) + { + m = s + ((e - s) >> 1); + if (ch < m->start) + e = m; + else if (ch > m->end) + s = m + 1; + else + return TLD_SUCCESS; + } + + return TLD_INVALID; +} + +/** + * tld_check_4t - verify that characters are permitted + * @in: Array of unicode code points to process. Does not need to be + * zero terminated. + * @inlen: Number of unicode code points. + * @errpos: Position of offending character is returned here. + * @tld: A #Tld_table data structure representing the restrictions for + * which the input should be tested. + * + * Test each of the code points in @in for whether or not + * they are allowed by the data structure in @tld, return + * the position of the first character for which this is not + * the case in @errpos. + * + * Return value: Returns the #Tld_rc value %TLD_SUCCESS if all code + * points are valid or when @tld is null, %TLD_INVALID if a + * character is not allowed, or additional error codes on general + * failure conditions. + */ +int +tld_check_4t (const uint32_t * in, size_t inlen, size_t * errpos, + const Tld_table * tld) +{ + const uint32_t *ipos; + int rc; + + if (!tld) /* No data for TLD so everything is valid. */ + return TLD_SUCCESS; + + ipos = in; + while (ipos < &in[inlen]) + { + rc = _tld_checkchar (*ipos, tld); + if (rc != TLD_SUCCESS) + { + if (errpos) + *errpos = ipos - in; + return rc; + } + ipos++; + } + return TLD_SUCCESS; +} + +/** + * tld_check_4tz - verify that characters are permitted + * @in: Zero terminated array of unicode code points to process. + * @errpos: Position of offending character is returned here. + * @tld: A #Tld_table data structure representing the restrictions for + * which the input should be tested. + * + * Test each of the code points in @in for whether or not + * they are allowed by the data structure in @tld, return + * the position of the first character for which this is not + * the case in @errpos. + * + * Return value: Returns the #Tld_rc value %TLD_SUCCESS if all code + * points are valid or when @tld is null, %TLD_INVALID if a + * character is not allowed, or additional error codes on general + * failure conditions. + */ +int +tld_check_4tz (const uint32_t * in, size_t * errpos, const Tld_table * tld) +{ + const uint32_t *ipos = in; + + if (!ipos) + return TLD_NODATA; + + while (*ipos) + ipos++; + + return tld_check_4t (in, ipos - in, errpos, tld); +} + +/** + * tld_check_4 - verify that characters are permitted + * @in: Array of unicode code points to process. Does not need to be + * zero terminated. + * @inlen: Number of unicode code points. + * @errpos: Position of offending character is returned here. + * @overrides: A #Tld_table array of additional domain restriction + * structures that complement and supersede the built-in information. + * + * Test each of the code points in @in for whether or not they are + * allowed by the information in @overrides or by the built-in TLD + * restriction data. When data for the same TLD is available both + * internally and in @overrides, the information in @overrides takes + * precedence. If several entries for a specific TLD are found, the + * first one is used. If @overrides is %NULL, only the built-in + * information is used. The position of the first offending character + * is returned in @errpos. + * + * Return value: Returns the #Tld_rc value %TLD_SUCCESS if all code + * points are valid or when @tld is null, %TLD_INVALID if a + * character is not allowed, or additional error codes on general + * failure conditions. + */ +int +tld_check_4 (const uint32_t * in, size_t inlen, size_t * errpos, + const Tld_table ** overrides) +{ + const Tld_table *tld; + char *domain; + int rc; + + if (errpos) + *errpos = 0; + + /* Get TLD name. */ + rc = tld_get_4 (in, inlen, &domain); + + if (rc != TLD_SUCCESS) + { + if (rc == TLD_NO_TLD) /* No TLD, say OK */ + return TLD_SUCCESS; + else + return rc; + } + + /* Retrieve appropriate data structure. */ + tld = tld_default_table (domain, overrides); + free (domain); + + return tld_check_4t (in, inlen, errpos, tld); +} + +/** + * tld_check_4z - verify that characters are permitted + * @in: Zero-terminated array of unicode code points to process. + * @errpos: Position of offending character is returned here. + * @overrides: A #Tld_table array of additional domain restriction + * structures that complement and supersede the built-in information. + * + * Test each of the code points in @in for whether or not they are + * allowed by the information in @overrides or by the built-in TLD + * restriction data. When data for the same TLD is available both + * internally and in @overrides, the information in @overrides takes + * precedence. If several entries for a specific TLD are found, the + * first one is used. If @overrides is %NULL, only the built-in + * information is used. The position of the first offending character + * is returned in @errpos. + * + * Return value: Returns the #Tld_rc value %TLD_SUCCESS if all code + * points are valid or when @tld is null, %TLD_INVALID if a + * character is not allowed, or additional error codes on general + * failure conditions. + */ +int +tld_check_4z (const uint32_t * in, size_t * errpos, + const Tld_table ** overrides) +{ + const uint32_t *ipos = in; + + if (!ipos) + return TLD_NODATA; + + while (*ipos) + ipos++; + + return tld_check_4 (in, ipos - in, errpos, overrides); +} + +/** + * tld_check_8z - verify that characters are permitted + * @in: Zero-terminated UTF8 string to process. + * @errpos: Position of offending character is returned here. + * @overrides: A #Tld_table array of additional domain restriction + * structures that complement and supersede the built-in information. + * + * Test each of the characters in @in for whether or not they are + * allowed by the information in @overrides or by the built-in TLD + * restriction data. When data for the same TLD is available both + * internally and in @overrides, the information in @overrides takes + * precedence. If several entries for a specific TLD are found, the + * first one is used. If @overrides is %NULL, only the built-in + * information is used. The position of the first offending character + * is returned in @errpos. Note that the error position refers to the + * decoded character offset rather than the byte position in the + * string. + * + * Return value: Returns the #Tld_rc value %TLD_SUCCESS if all + * characters are valid or when @tld is null, %TLD_INVALID if a + * character is not allowed, or additional error codes on general + * failure conditions. + */ +int +tld_check_8z (const char *in, size_t * errpos, const Tld_table ** overrides) +{ + uint32_t *iucs; + size_t ilen; + int rc; + + if (!in) + return TLD_NODATA; + + iucs = stringprep_utf8_to_ucs4 (in, -1, &ilen); + + if (!iucs) + return TLD_MALLOC_ERROR; + + rc = tld_check_4 (iucs, ilen, errpos, overrides); + + free (iucs); + + return rc; +} + +/** + * tld_check_lz - verify that characters are permitted + * @in: Zero-terminated string in the current locales encoding to process. + * @errpos: Position of offending character is returned here. + * @overrides: A #Tld_table array of additional domain restriction + * structures that complement and supersede the built-in information. + * + * Test each of the characters in @in for whether or not they are + * allowed by the information in @overrides or by the built-in TLD + * restriction data. When data for the same TLD is available both + * internally and in @overrides, the information in @overrides takes + * precedence. If several entries for a specific TLD are found, the + * first one is used. If @overrides is %NULL, only the built-in + * information is used. The position of the first offending character + * is returned in @errpos. Note that the error position refers to the + * decoded character offset rather than the byte position in the + * string. + * + * Return value: Returns the #Tld_rc value %TLD_SUCCESS if all + * characters are valid or when @tld is null, %TLD_INVALID if a + * character is not allowed, or additional error codes on general + * failure conditions. + */ +int +tld_check_lz (const char *in, size_t * errpos, const Tld_table ** overrides) +{ + char *utf8; + int rc; + + if (!in) + return TLD_NODATA; + + utf8 = stringprep_locale_to_utf8 (in); + if (!utf8) + return TLD_ICONV_ERROR; + + + rc = tld_check_8z (utf8, errpos, overrides); + + free (utf8); + + return rc; +} + +/** + * Tld_rc: + * @TLD_SUCCESS: Successful operation. This value is guaranteed to + * always be zero, the remaining ones are only guaranteed to hold + * non-zero values, for logical comparison purposes. + * @TLD_INVALID: Invalid character found. + * @TLD_NODATA: No input data was provided. + * @TLD_MALLOC_ERROR: Error during memory allocation. + * @TLD_ICONV_ERROR: Error during iconv string conversion. + * @TLD_NO_TLD: No top-level domain found in domain string. + * + * Enumerated return codes of the TLD checking functions. + * The value 0 is guaranteed to always correspond to success. + */ |