aboutsummaryrefslogtreecommitdiffstats
path: root/contrib/libs/libidn/tld.c
diff options
context:
space:
mode:
authorDevtools Arcadia <arcadia-devtools@yandex-team.ru>2022-02-07 18:08:42 +0300
committerDevtools Arcadia <arcadia-devtools@mous.vla.yp-c.yandex.net>2022-02-07 18:08:42 +0300
commit1110808a9d39d4b808aef724c861a2e1a38d2a69 (patch)
treee26c9fed0de5d9873cce7e00bc214573dc2195b7 /contrib/libs/libidn/tld.c
downloadydb-1110808a9d39d4b808aef724c861a2e1a38d2a69.tar.gz
intermediate changes
ref:cde9a383711a11544ce7e107a78147fb96cc4029
Diffstat (limited to 'contrib/libs/libidn/tld.c')
-rw-r--r--contrib/libs/libidn/tld.c514
1 files changed, 514 insertions, 0 deletions
diff --git a/contrib/libs/libidn/tld.c b/contrib/libs/libidn/tld.c
new file mode 100644
index 0000000000..4e894663fb
--- /dev/null
+++ b/contrib/libs/libidn/tld.c
@@ -0,0 +1,514 @@
+/* tld.c --- Handle TLD restriction checking.
+ * Copyright (C) 2004, 2005, 2006, 2007 Simon Josefsson.
+ * Copyright (C) 2003, 2004 Free Software Foundation, Inc.
+ *
+ * Author: Thomas Jacob, Internet24.de
+ *
+ * This file is part of GNU Libidn.
+ *
+ * GNU Libidn is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * GNU Libidn is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GNU Libidn; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
+ *
+ */
+
+/* Get stringprep_utf8_to_ucs4, stringprep_locale_to_utf8. */
+#include <stringprep.h>
+
+/* Get strcmp(). */
+#include <string.h>
+
+/* Get specifications. */
+#include <tld.h>
+
+/* Array of built-in domain restriction structures. See tlds.c.
+ * tld_default_table() passes this array to tld_get_table(), which
+ * walks it until it reaches a NULL entry, so the array is expected
+ * to be NULL terminated. */
+extern const Tld_table *_tld_tables[];
+
+/**
+ * tld_get_table - look up a TLD by name in an array of tables
+ * @tld: TLD name (e.g. "com") as zero terminated ASCII byte string.
+ * @tables: Zero terminated array of #Tld_table info-structures for
+ *   TLDs.
+ *
+ * Search @tables from the front for an entry whose name is equal to
+ * @tld.  Names are compared with strcmp(), i.e. exactly and case
+ * sensitively.
+ *
+ * Return value: Return the first structure in @tables whose name
+ *   matches @tld, or %NULL if no such structure is found or if either
+ *   argument is %NULL.
+ */
+const Tld_table *
+tld_get_table (const char *tld, const Tld_table ** tables)
+{
+  size_t idx;
+
+  if (tld == NULL || tables == NULL)
+    return NULL;
+
+  /* The array ends at its first NULL entry. */
+  for (idx = 0; tables[idx] != NULL; idx++)
+    {
+      const Tld_table *candidate = tables[idx];
+
+      if (strcmp (candidate->name, tld) == 0)
+        return candidate;
+    }
+
+  return NULL;
+}
+
+/**
+ * tld_default_table - get table for a TLD name
+ * @tld: TLD name (e.g. "com") as zero terminated ASCII byte string.
+ * @overrides: Additional zero terminated array of #Tld_table
+ *   info-structures for TLDs, or %NULL to only use library default
+ *   tables.
+ *
+ * Get the TLD table for a named TLD, using the internal defaults,
+ * possibly overridden by the (optional) supplied tables.
+ *
+ * Return value: Return structure corresponding to TLD @tld, first
+ *   looking through @overrides then through the built-in list, or
+ *   %NULL if no such structure is found or @tld is %NULL.
+ */
+const Tld_table *
+tld_default_table (const char *tld, const Tld_table ** overrides)
+{
+  const Tld_table *match;
+
+  if (tld == NULL)
+    return NULL;
+
+  /* Caller supplied tables take precedence over the built-in ones. */
+  if (overrides != NULL)
+    {
+      match = tld_get_table (tld, overrides);
+      if (match != NULL)
+        return match;
+    }
+
+  return tld_get_table (tld, _tld_tables);
+}
+
+/* True if code point C is one of the dot characters accepted as a
+ * label separator here: U+002E, U+3002, U+FF0E or U+FF61.  C is
+ * evaluated more than once, so it must not have side effects. */
+#define DOTP(c) ((c) == 0x002E || (c) == 0x3002 || \
+                 (c) == 0xFF0E || (c) == 0xFF61)
+
+/**
+ * tld_get_4 - extract top level domain part in input Unicode string
+ * @in: Array of unicode code points to process. Does not need to be
+ *   zero terminated.
+ * @inlen: Number of unicode code points.
+ * @out: Zero terminated ascii result string pointer.  Must not be
+ *   %NULL; it is always written.
+ *
+ * Isolate the top-level domain of @in and return it as a lowercase
+ * ASCII string in @out.  The TLD is the trailing run of ASCII letters
+ * (A-Z, a-z) of @in, and it is only accepted when it is directly
+ * preceded by a dot character.  Input that consists of letters only,
+ * with no dot in front of them, has no TLD.
+ *
+ * On success *@out is allocated with malloc() and has to be released
+ * by the caller with free().  In every other case *@out is set to
+ * %NULL.
+ *
+ * Return value: Return %TLD_SUCCESS on success, %TLD_NODATA if @in is
+ *   %NULL or @inlen is zero, %TLD_NO_TLD if no TLD could be isolated,
+ *   or %TLD_MALLOC_ERROR if memory allocation failed.
+ */
+int
+tld_get_4 (const uint32_t * in, size_t inlen, char **out)
+{
+  size_t start, olen, i;
+  char *out_s;
+
+  *out = NULL;
+  if (!in || inlen == 0)
+    return TLD_NODATA;
+
+  /* Scan backwards over the trailing run of (latin) letters.  START
+     ends up as the index of the first letter of that run.  Using an
+     index instead of a pointer means we never step before IN. */
+  start = inlen;
+  while (start > 0 && ((in[start - 1] >= 0x41 && in[start - 1] <= 0x5A) ||
+                       (in[start - 1] >= 0x61 && in[start - 1] <= 0x7A)))
+    start--;
+  olen = inlen - start;
+
+  /* A TLD needs at least one letter and a dot right in front of it.
+     If the letters reach the very start of the input there is no
+     preceding code point, so it must not be read. */
+  if (olen == 0 || start == 0 || !DOTP (in[start - 1]))
+    return TLD_NO_TLD;
+
+  /* Found something that appears a TLD. */
+  out_s = malloc (olen + 1);
+  if (!out_s)
+    return TLD_MALLOC_ERROR;
+
+  /* Transcribe to lowercase ascii string.  Everything copied here is
+     an ASCII letter, so only the A-Z range needs to be shifted. */
+  for (i = 0; i < olen; i++)
+    {
+      uint32_t c = in[start + i];
+
+      out_s[i] = (char) (c > 0x5A ? c : c + 0x20);
+    }
+  out_s[olen] = '\0';
+
+  *out = out_s;
+  return TLD_SUCCESS;
+}
+
+/**
+ * tld_get_4z - extract top level domain part in input Unicode string
+ * @in: Zero terminated array of unicode code points to process.
+ * @out: Zero terminated ascii result string pointer.
+ *
+ * Isolate the top-level domain of @in and return it as an ASCII
+ * string in @out.  The length of @in is determined by scanning for
+ * the terminating zero code point, then the work is done by
+ * tld_get_4().
+ *
+ * When @in is %NULL, *@out is set to %NULL (if @out itself is not
+ * %NULL) so that the caller never sees an uninitialized result
+ * pointer.
+ *
+ * Return value: Return %TLD_SUCCESS on success, %TLD_NODATA if @in is
+ *   %NULL, or the corresponding #Tld_rc error code from tld_get_4()
+ *   otherwise.
+ */
+int
+tld_get_4z (const uint32_t * in, char **out)
+{
+  const uint32_t *ipos = in;
+
+  if (!in)
+    {
+      /* tld_get_4 clears *out on every path; do the same here since
+         it is not reached.  */
+      if (out)
+        *out = NULL;
+      return TLD_NODATA;
+    }
+
+  /* Find the terminating zero to learn the length. */
+  while (*ipos)
+    ipos++;
+
+  return tld_get_4 (in, ipos - in, out);
+}
+
+/**
+ * tld_get_z - extract top level domain part in input string
+ * @in: Zero terminated character array to process.
+ * @out: Zero terminated ascii result string pointer.
+ *
+ * Isolate the top-level domain of @in and return it as an ASCII
+ * string in @out. The input string @in may be UTF-8, ISO-8859-1 or
+ * any ASCII compatible character encoding.
+ *
+ * Each byte of @in is widened to one code point and the result is
+ * handed to tld_get_4().  Bytes are widened as unsigned values; only
+ * ASCII letters and the ASCII dot can take part in the TLD either
+ * way.
+ *
+ * *@out is set to %NULL unless the function succeeds.
+ *
+ * Return value: Return %TLD_SUCCESS on success, %TLD_NODATA if @in is
+ *   %NULL or empty, %TLD_MALLOC_ERROR if memory allocation failed, or
+ *   the corresponding #Tld_rc error code from tld_get_4() otherwise.
+ */
+int
+tld_get_z (const char *in, char **out)
+{
+  uint32_t *iucs;
+  size_t i, ilen;
+  int rc;
+
+  /* Give the caller a defined result pointer on the early exits
+     below, which never reach tld_get_4.  */
+  if (out)
+    *out = NULL;
+
+  if (!in)
+    return TLD_NODATA;
+
+  ilen = strlen (in);
+
+  /* Handle the empty string here: calloc (0, ...) is allowed to
+     return NULL, which must not be mistaken for an allocation
+     failure.  */
+  if (ilen == 0)
+    return TLD_NODATA;
+
+  iucs = calloc (ilen, sizeof (*iucs));
+
+  if (!iucs)
+    return TLD_MALLOC_ERROR;
+
+  /* Go through unsigned char so that bytes above 0x7F do not get
+     sign extended where plain char is signed.  */
+  for (i = 0; i < ilen; i++)
+    iucs[i] = (unsigned char) in[i];
+
+  rc = tld_get_4 (iucs, ilen, out);
+
+  free (iucs);
+
+  return rc;
+}
+
+/*
+ * _tld_checkchar - verify that character is permitted
+ * @ch: 32 bit unicode character to check.
+ * @tld: A #Tld_table data structure to check @ch against.
+ *
+ * Verify if @ch is either a lowercase ASCII letter, an ASCII digit, a
+ * hyphen, one of the dot characters accepted by DOTP(), or covered by
+ * one of the valid intervals listed in @tld.  The intervals are
+ * searched with a binary search, so they are expected to be sorted in
+ * ascending order and not to overlap.
+ *
+ * Return value: Return the #Tld_rc value %TLD_SUCCESS if @ch is a
+ *   valid character for the TLD @tld or if @tld is %NULL,
+ *   %TLD_INVALID if @ch is invalid as defined by @tld.
+ */
+static int
+_tld_checkchar (uint32_t ch, const Tld_table * tld)
+{
+  size_t lo, hi;
+
+  if (tld == NULL)
+    return TLD_SUCCESS;
+
+  /* Check for [-a-z0-9.]: these are valid in every TLD. */
+  if (ch == 0x2D || DOTP (ch))
+    return TLD_SUCCESS;
+  if (ch >= 0x30 && ch <= 0x39)
+    return TLD_SUCCESS;
+  if (ch >= 0x61 && ch <= 0x7A)
+    return TLD_SUCCESS;
+
+  /* Binary search over the half-open index range [lo, hi). */
+  lo = 0;
+  hi = tld->nvalid;
+  while (lo < hi)
+    {
+      size_t mid = lo + (hi - lo) / 2;
+      const Tld_table_element *range = &tld->valid[mid];
+
+      if (ch < range->start)
+        hi = mid;
+      else if (ch > range->end)
+        lo = mid + 1;
+      else
+        return TLD_SUCCESS;
+    }
+
+  return TLD_INVALID;
+}
+
+/**
+ * tld_check_4t - verify that characters are permitted
+ * @in: Array of unicode code points to process. Does not need to be
+ *   zero terminated.
+ * @inlen: Number of unicode code points.
+ * @errpos: Position of offending character is returned here.  May be
+ *   %NULL if the position is not wanted.
+ * @tld: A #Tld_table data structure representing the restrictions for
+ *   which the input should be tested.
+ *
+ * Test each of the code points in @in, in order, for whether or not
+ * it is allowed by the data structure in @tld.  Checking stops at the
+ * first code point that is not allowed, and its index is stored in
+ * @errpos.  @errpos is not written when all code points pass.
+ *
+ * Return value: Returns the #Tld_rc value %TLD_SUCCESS if all code
+ *   points are valid or when @tld is null, %TLD_INVALID if a
+ *   character is not allowed, or additional error codes on general
+ *   failure conditions.
+ */
+int
+tld_check_4t (const uint32_t * in, size_t inlen, size_t * errpos,
+              const Tld_table * tld)
+{
+  size_t pos;
+
+  /* No data for TLD so everything is valid. */
+  if (tld == NULL)
+    return TLD_SUCCESS;
+
+  for (pos = 0; pos < inlen; pos++)
+    {
+      int rc = _tld_checkchar (in[pos], tld);
+
+      if (rc == TLD_SUCCESS)
+        continue;
+
+      /* First offender found: report where it is and give up. */
+      if (errpos != NULL)
+        *errpos = pos;
+      return rc;
+    }
+
+  return TLD_SUCCESS;
+}
+
+/**
+ * tld_check_4tz - verify that characters are permitted
+ * @in: Zero terminated array of unicode code points to process.
+ * @errpos: Position of offending character is returned here.
+ * @tld: A #Tld_table data structure representing the restrictions for
+ *   which the input should be tested.
+ *
+ * Zero terminated variant of tld_check_4t(): the number of code
+ * points in @in is counted up to the terminating zero, and the check
+ * itself is delegated to tld_check_4t().
+ *
+ * Return value: Returns the #Tld_rc value %TLD_SUCCESS if all code
+ *   points are valid or when @tld is null, %TLD_NODATA if @in is
+ *   %NULL, %TLD_INVALID if a character is not allowed, or additional
+ *   error codes on general failure conditions.
+ */
+int
+tld_check_4tz (const uint32_t * in, size_t * errpos, const Tld_table * tld)
+{
+  size_t len = 0;
+
+  if (in == NULL)
+    return TLD_NODATA;
+
+  /* Count code points up to, not including, the terminator. */
+  while (in[len] != 0)
+    len++;
+
+  return tld_check_4t (in, len, errpos, tld);
+}
+
+/**
+ * tld_check_4 - verify that characters are permitted
+ * @in: Array of unicode code points to process. Does not need to be
+ *   zero terminated.
+ * @inlen: Number of unicode code points.
+ * @errpos: Position of offending character is returned here.  May be
+ *   %NULL; otherwise it is reset to zero before anything else is
+ *   done.
+ * @overrides: A #Tld_table array of additional domain restriction
+ *   structures that complement and supersede the built-in information.
+ *
+ * Test each of the code points in @in for whether or not they are
+ * allowed by the information in @overrides or by the built-in TLD
+ * restriction data. When data for the same TLD is available both
+ * internally and in @overrides, the information in @overrides takes
+ * precedence. If several entries for a specific TLD are found, the
+ * first one is used. If @overrides is %NULL, only the built-in
+ * information is used. The position of the first offending character
+ * is returned in @errpos.
+ *
+ * Return value: Returns the #Tld_rc value %TLD_SUCCESS if all code
+ *   points are valid, if @in has no recognizable TLD, or if no
+ *   restriction data is known for its TLD; %TLD_INVALID if a
+ *   character is not allowed, or additional error codes on general
+ *   failure conditions.
+ */
+int
+tld_check_4 (const uint32_t * in, size_t inlen, size_t * errpos,
+             const Tld_table ** overrides)
+{
+  const Tld_table *table;
+  char *tldname;
+  int rc;
+
+  if (errpos != NULL)
+    *errpos = 0;
+
+  /* Get TLD name. */
+  rc = tld_get_4 (in, inlen, &tldname);
+
+  /* No TLD, say OK. */
+  if (rc == TLD_NO_TLD)
+    return TLD_SUCCESS;
+
+  if (rc != TLD_SUCCESS)
+    return rc;
+
+  /* Retrieve appropriate data structure.  The name is only needed
+     for the lookup, so it can be released right away. */
+  table = tld_default_table (tldname, overrides);
+  free (tldname);
+
+  return tld_check_4t (in, inlen, errpos, table);
+}
+
+/**
+ * tld_check_4z - verify that characters are permitted
+ * @in: Zero-terminated array of unicode code points to process.
+ * @errpos: Position of offending character is returned here.
+ * @overrides: A #Tld_table array of additional domain restriction
+ *   structures that complement and supersede the built-in information.
+ *
+ * Zero terminated variant of tld_check_4(): the number of code points
+ * in @in is counted up to the terminating zero, and the check itself
+ * is delegated to tld_check_4().  When data for the same TLD is
+ * available both internally and in @overrides, the information in
+ * @overrides takes precedence.  If @overrides is %NULL, only the
+ * built-in information is used.  The position of the first offending
+ * character is returned in @errpos.
+ *
+ * Return value: Returns the #Tld_rc value %TLD_SUCCESS if all code
+ *   points are valid, %TLD_NODATA if @in is %NULL, %TLD_INVALID if a
+ *   character is not allowed, or additional error codes on general
+ *   failure conditions.
+ */
+int
+tld_check_4z (const uint32_t * in, size_t * errpos,
+              const Tld_table ** overrides)
+{
+  size_t len = 0;
+
+  if (in == NULL)
+    return TLD_NODATA;
+
+  /* Count code points up to, not including, the terminator. */
+  while (in[len] != 0)
+    len++;
+
+  return tld_check_4 (in, len, errpos, overrides);
+}
+
+/**
+ * tld_check_8z - verify that characters are permitted
+ * @in: Zero-terminated UTF8 string to process.
+ * @errpos: Position of offending character is returned here.
+ * @overrides: A #Tld_table array of additional domain restriction
+ *   structures that complement and supersede the built-in information.
+ *
+ * Convert @in to an array of unicode code points with
+ * stringprep_utf8_to_ucs4() and check the result with tld_check_4().
+ * When data for the same TLD is available both internally and in
+ * @overrides, the information in @overrides takes precedence.  If
+ * @overrides is %NULL, only the built-in information is used.  The
+ * position of the first offending character is returned in @errpos.
+ * Note that the error position refers to the decoded character offset
+ * rather than the byte position in the string.
+ *
+ * Return value: Returns the #Tld_rc value %TLD_SUCCESS if all
+ *   characters are valid, %TLD_NODATA if @in is %NULL,
+ *   %TLD_MALLOC_ERROR if the conversion did not return a string,
+ *   %TLD_INVALID if a character is not allowed, or additional error
+ *   codes on general failure conditions.
+ */
+int
+tld_check_8z (const char *in, size_t * errpos, const Tld_table ** overrides)
+{
+  uint32_t *ucs4;
+  size_t ucs4len;
+  int rc;
+
+  if (in == NULL)
+    return TLD_NODATA;
+
+  /* A length of -1 asks the converter to stop at the terminating
+     zero byte of IN. */
+  ucs4 = stringprep_utf8_to_ucs4 (in, -1, &ucs4len);
+  if (ucs4 == NULL)
+    return TLD_MALLOC_ERROR;
+
+  rc = tld_check_4 (ucs4, ucs4len, errpos, overrides);
+
+  /* The decoded copy is ours to release. */
+  free (ucs4);
+
+  return rc;
+}
+
+/**
+ * tld_check_lz - verify that characters are permitted
+ * @in: Zero-terminated string in the current locales encoding to process.
+ * @errpos: Position of offending character is returned here.
+ * @overrides: A #Tld_table array of additional domain restriction
+ *   structures that complement and supersede the built-in information.
+ *
+ * Convert @in to UTF-8 with stringprep_locale_to_utf8() and check the
+ * result with tld_check_8z().  When data for the same TLD is
+ * available both internally and in @overrides, the information in
+ * @overrides takes precedence.  If @overrides is %NULL, only the
+ * built-in information is used.  The position of the first offending
+ * character is returned in @errpos.  Note that the error position
+ * refers to the decoded character offset rather than the byte
+ * position in the string.
+ *
+ * Return value: Returns the #Tld_rc value %TLD_SUCCESS if all
+ *   characters are valid, %TLD_NODATA if @in is %NULL,
+ *   %TLD_ICONV_ERROR if the conversion to UTF-8 failed, %TLD_INVALID
+ *   if a character is not allowed, or additional error codes on
+ *   general failure conditions.
+ */
+int
+tld_check_lz (const char *in, size_t * errpos, const Tld_table ** overrides)
+{
+  char *converted;
+  int rc;
+
+  if (in == NULL)
+    return TLD_NODATA;
+
+  converted = stringprep_locale_to_utf8 (in);
+  if (converted == NULL)
+    return TLD_ICONV_ERROR;
+
+  rc = tld_check_8z (converted, errpos, overrides);
+
+  /* The UTF-8 copy is ours to release. */
+  free (converted);
+
+  return rc;
+}
+
+/**
+ * Tld_rc:
+ * @TLD_SUCCESS: Successful operation. This value is guaranteed to
+ * always be zero, the remaining ones are only guaranteed to hold
+ * non-zero values, for logical comparison purposes.
+ * @TLD_INVALID: Invalid character found.
+ * @TLD_NODATA: No input data was provided.
+ * @TLD_MALLOC_ERROR: Error during memory allocation.
+ * @TLD_ICONV_ERROR: Error during iconv string conversion.
+ * @TLD_NO_TLD: No top-level domain found in domain string.
+ *
+ * Enumerated return codes of the TLD checking functions.
+ * The value 0 is guaranteed to always correspond to success.
+ */