YQ Connector: move tests from yql to ydb (OSS)

Перенос папки с тестами на Коннектор из папки yql в папку ydb (синхронизируется с github).
author: vitalyisaev <[email protected]> 2023-11-14 09:58:56 +0300
committer: vitalyisaev <[email protected]> 2023-11-14 10:20:20 +0300
commit: c2b2dfd9827a400a8495e172a56343462e3ceb82 (patch)
tree: cd4e4f597d01bede4c82dffeb2d780d0a9046bd0 /contrib/clickhouse/src/Functions/extractTextFromHTML.cpp
parent: d4ae8f119e67808cb0cf776ba6e0cf95296f2df7 (diff)
1 files changed, 359 insertions, 0 deletions
diff --git a/contrib/clickhouse/src/Functions/extractTextFromHTML.cpp b/contrib/clickhouse/src/Functions/extractTextFromHTML.cpp
new file mode 100644
index 00000000000..4eefeaa9f86
--- /dev/null
+++ b/contrib/clickhouse/src/Functions/extractTextFromHTML.cpp
@@ -0,0 +1,359 @@
+#include <Columns/ColumnString.h>
+#include <Functions/FunctionFactory.h>
+#include <Functions/FunctionHelpers.h>
+#include <Functions/IFunction.h>
+#include <base/find_symbols.h>
+#include <Common/StringUtils/StringUtils.h>
+
+
+/** A function to extract text from HTML or XHTML.
+  * It does not necessarily 100% conforms to any of the HTML, XML or XHTML standards,
+  * but the implementation is reasonably accurate and it is fast.
+  *
+  * The rules are the following:
+  *
+  * 1. Comments are skipped. Example: <!-- test -->
+  * Comment must end with -->. Nested comments are not possible.
+  * Note: constructions like <!--> <!---> are not valid comments in HTML but will be skipped by other rules.
+  *
+  * 2. CDATA is pasted verbatim.
+  * Note: CDATA is XML/XHTML specific. But we still process it for "best-effort" approach.
+  *
+  * 3. 'script' and 'style' elements are removed with all their content.
+  * Note: it's assumed that closing tag cannot appear inside content.
+  * For example, in JS string literal is has to be escaped as "<\/script>".
+  * Note: comments and CDATA is possible inside script or style - then closing tags are not searched inside CDATA.
+  * Example: <script><![CDATA[</script>]]></script>
+  * But still searched inside comments. Sometimes it becomes complicated:
+  * <script>var x = "<!--"; </script> var y = "-->"; alert(x + y);</script>
+  * Note: script and style can be the names of XML namespaces - then they are not treat like usual script or style.
+  * Example: <script:a>Hello</script:a>.
+  * Note: whitespaces are possible after closing tag name: </script > but not before: < / script>.
+  *
+  * 4. Other tags or tag-like elements are skipped without inner content.
+  * Example: <a>.</a>
+  * Note: it's expected that this HTML is illegal: <a test=">"></a>
+  * Note: it will also skip something like tags: <>, <!>, etc.
+  * Note: tag without end will be skipped to the end of input: <hello
+  * >
+  * 5. HTML and XML entities are not decoded.
+  * It should be processed by separate function.
+  *
+  * 6. Whitespaces in text are collapsed or inserted by specific rules.
+  * Whitespaces at beginning and at the end are removed.
+  * Consecutive whitespaces are collapsed.
+  * But if text is separated by other elements and there is no whitespace, it is inserted.
+  * It may be unnatural, examples: Hello<b>world</b>, Hello<!-- -->world
+  * - in HTML there will be no whitespace, but the function will insert it.
+  * But also consider: Hello<p>world</p>, Hello<br>world.
+  * This behaviour is reasonable for data analysis, e.g. convert HTML to a bag of words.
+  *
+  * 7. Also note that correct handling of whitespaces would require
+  * support of <pre></pre> and CSS display and white-space properties.
+  *
+  * Usage example:
+  *
+  * SELECT extractTextFromHTML(html) FROM url('https://github.com/ClickHouse/ClickHouse', RawBLOB, 'html String')
+  *
+  * - ClickHouse has embedded web browser.
+  */
+
+namespace DB
+{
+
+namespace ErrorCodes
+{
+    extern const int ILLEGAL_COLUMN;
+    extern const int ILLEGAL_TYPE_OF_ARGUMENT;
+}
+
+namespace
+{
+
+inline bool startsWith(const char * s, const char * end, const std::string_view prefix)
+{
+    return s + prefix.length() < end && 0 == memcmp(s, prefix.data(), prefix.length());
+}
+
+inline bool checkAndSkip(const char * __restrict & s, const char * end, const std::string_view prefix)
+{
+    if (startsWith(s, end, prefix))
+    {
+        s += prefix.length();
+        return true;
+    }
+    return false;
+}
+
+bool processComment(const char * __restrict & src, const char * end)
+{
+    if (!checkAndSkip(src, end, "<!--"))
+        return false;
+
+    while (true)
+    {
+        const char * gt = find_first_symbols<'>'>(src, end);
+        if (gt >= end)
+            break;
+
+        if (gt > src + strlen("--") && gt[-1] == '-' && gt[-2] == '-')
+        {
+            src = gt + 1;
+            break;
+        }
+
+        src = gt + 1;
+    }
+
+    return true;
+}
+
+bool processCDATA(const char * __restrict & src, const char * end, char * __restrict & dst)
+{
+    if (!checkAndSkip(src, end, "<![CDATA["))
+        return false;
+
+    const char * gt = src;
+    while (true)
+    {
+        gt = find_first_symbols<'>'>(gt, end);
+        if (gt >= end)
+            break;
+
+        if (gt[-1] == ']' && gt[-2] == ']')
+        {
+            if (dst)
+            {
+                size_t bytes_to_copy = gt - src - strlen("]]");
+                memcpy(dst, src, bytes_to_copy);
+                dst += bytes_to_copy;
+            }
+            src = gt + 1;
+            break;
+        }
+
+        ++gt;
+    }
+
+    return true;
+}
+
+bool processElementAndSkipContent(const char * __restrict & src, const char * end, const std::string_view tag_name)
+{
+    const auto * old_src = src;
+
+    if (!(src < end && *src == '<'))
+        return false;
+    ++src;
+
+    if (!checkAndSkip(src, end, tag_name))
+    {
+        src = old_src;
+        return false;
+    }
+
+    if (src >= end)
+        return false;
+
+    if (!(isWhitespaceASCII(*src) || *src == '>'))
+    {
+        src = old_src;
+        return false;
+    }
+
+    const char * gt = find_first_symbols<'>'>(src, end);
+    if (gt >= end)
+        return false;
+
+    src = gt + 1;
+
+    while (true)
+    {
+        const char * lt = find_first_symbols<'<'>(src, end);
+        src = lt;
+        if (src + 1 >= end)
+            break;
+
+        ++src;
+
+        /// Skip CDATA
+        if (*src == '!')
+        {
+            --src;
+            char * dst = nullptr;
+            if (processCDATA(src, end, dst))
+                continue;
+            ++src;
+        }
+
+        if (*src != '/')
+            continue;
+        ++src;
+
+        if (checkAndSkip(src, end, tag_name))
+        {
+            while (src < end && isWhitespaceASCII(*src))
+                ++src;
+
+            if (src >= end)
+                break;
+
+            if (*src == '>')
+            {
+                ++src;
+                break;
+            }
+        }
+    }
+
+    return true;
+}
+
+bool skipTag(const char * __restrict & src, const char * end)
+{
+    if (src < end && *src == '<')
+    {
+        src = find_first_symbols<'>'>(src, end);
+        if (src < end)
+            ++src;
+
+        return true;
+    }
+
+    return false;
+}
+
+void copyText(const char * __restrict & src, const char * end, char * __restrict & dst, bool needs_whitespace)
+{
+    while (src < end && isWhitespaceASCII(*src))
+        ++src;
+
+    const char * lt = find_first_symbols<'<'>(src, end);
+
+    if (needs_whitespace && src < lt)
+    {
+        *dst = ' ';
+        ++dst;
+    }
+
+    while (true)
+    {
+        const char * ws = find_first_symbols<' ', '\t', '\n', '\r', '\f', '\v'>(src, lt);
+        size_t bytes_to_copy = ws - src;
+        memcpy(dst, src, bytes_to_copy);
+        dst += bytes_to_copy;
+
+        src = ws;
+        while (src < lt && isWhitespaceASCII(*src))
+            ++src;
+
+        if (src < lt)
+        {
+            *dst = ' ';
+            ++dst;
+        }
+        else
+        {
+            break;
+        }
+    }
+
+    src = lt;
+}
+
+size_t extract(const char * __restrict src, size_t size, char * __restrict dst)
+{
+    /** There are the following rules:
+      * - comments are removed with all their content;
+      * - elements 'script' and 'style' are removed with all their content;
+      * - for other elements tags are removed but content is processed as text;
+      * - CDATA should be copied verbatim;
+      */
+
+    const char * end = src + size;
+    char * dst_begin = dst;
+
+    while (src < end)
+    {
+        bool needs_whitespace = dst != dst_begin && dst[-1] != ' ';
+        copyText(src, end, dst, needs_whitespace);
+
+        processComment(src, end)
+            || processCDATA(src, end, dst)
+            || processElementAndSkipContent(src, end, "script")
+            || processElementAndSkipContent(src, end, "style")
+            || skipTag(src, end);
+    }
+
+    return dst - dst_begin;
+}
+
+}
+
+
+class FunctionExtractTextFromHTML : public IFunction
+{
+public:
+    static constexpr auto name = "extractTextFromHTML";
+
+    static FunctionPtr create(ContextPtr) { return std::make_shared<FunctionExtractTextFromHTML>(); }
+    String getName() const override { return name; }
+    size_t getNumberOfArguments() const override { return 1; }
+    bool useDefaultImplementationForConstants() const override { return true; }
+    bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return true; }
+
+    DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override
+    {
+        if (!isString(arguments[0]))
+            throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Illegal type {} of argument of function {}",
+                arguments[0]->getName(), getName());
+        return arguments[0];
+    }
+
+    ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t rows) const override
+    {
+        const ColumnString * src = checkAndGetColumn<ColumnString>(arguments[0].column.get());
+        if (!src)
+             throw Exception(ErrorCodes::ILLEGAL_COLUMN, "First argument for function {} must be string.", getName());
+
+        const ColumnString::Chars & src_chars = src->getChars();
+        const ColumnString::Offsets & src_offsets = src->getOffsets();
+
+        auto res = ColumnString::create();
+
+        ColumnString::Chars & res_chars = res->getChars();
+        ColumnString::Offsets & res_offsets = res->getOffsets();
+
+        res_chars.resize(src_chars.size());
+        res_offsets.resize(src_offsets.size());
+
+        ColumnString::Offset src_offset = 0;
+        ColumnString::Offset res_offset = 0;
+
+        for (size_t i = 0; i < rows; ++i)
+        {
+            auto next_src_offset = src_offsets[i];
+
+            res_offset += extract(
+                reinterpret_cast<const char *>(&src_chars[src_offset]),
+                next_src_offset - src_offset - 1,
+                reinterpret_cast<char *>(&res_chars[res_offset]));
+
+            res_chars[res_offset] = 0;
+            ++res_offset;
+            res_offsets[i] = res_offset;
+
+            src_offset = next_src_offset;
+        }
+
+        res_chars.resize(res_offset);
+        return res;
+    }
+};
+
+REGISTER_FUNCTION(ExtractTextFromHTML)
+{
+    factory.registerFunction<FunctionExtractTextFromHTML>();
+}
+
+}
author	vitalyisaev <[email protected]>	2023-11-14 09:58:56 +0300
committer	vitalyisaev <[email protected]>	2023-11-14 10:20:20 +0300
commit	c2b2dfd9827a400a8495e172a56343462e3ceb82 (patch)
tree	cd4e4f597d01bede4c82dffeb2d780d0a9046bd0 /contrib/clickhouse/src/Functions/extractTextFromHTML.cpp
parent	d4ae8f119e67808cb0cf776ba6e0cf95296f2df7 (diff)