summaryrefslogtreecommitdiffstats
path: root/contrib/clickhouse/src/Functions/extractTextFromHTML.cpp
diff options
context:
space:
mode:
authorvitalyisaev <[email protected]>2023-11-14 09:58:56 +0300
committervitalyisaev <[email protected]>2023-11-14 10:20:20 +0300
commitc2b2dfd9827a400a8495e172a56343462e3ceb82 (patch)
treecd4e4f597d01bede4c82dffeb2d780d0a9046bd0 /contrib/clickhouse/src/Functions/extractTextFromHTML.cpp
parentd4ae8f119e67808cb0cf776ba6e0cf95296f2df7 (diff)
YQ Connector: move tests from yql to ydb (OSS)
Перенос папки с тестами на Коннектор из папки yql в папку ydb (синхронизируется с github).
Diffstat (limited to 'contrib/clickhouse/src/Functions/extractTextFromHTML.cpp')
-rw-r--r--contrib/clickhouse/src/Functions/extractTextFromHTML.cpp359
1 files changed, 359 insertions, 0 deletions
diff --git a/contrib/clickhouse/src/Functions/extractTextFromHTML.cpp b/contrib/clickhouse/src/Functions/extractTextFromHTML.cpp
new file mode 100644
index 00000000000..4eefeaa9f86
--- /dev/null
+++ b/contrib/clickhouse/src/Functions/extractTextFromHTML.cpp
@@ -0,0 +1,359 @@
+#include <Columns/ColumnString.h>
+#include <Functions/FunctionFactory.h>
+#include <Functions/FunctionHelpers.h>
+#include <Functions/IFunction.h>
+#include <base/find_symbols.h>
+#include <Common/StringUtils/StringUtils.h>
+
+
+/** A function to extract text from HTML or XHTML.
+ * It does not necessarily conform 100% to any of the HTML, XML or XHTML standards,
+ * but the implementation is reasonably accurate and it is fast.
+ *
+ * The rules are the following:
+ *
+ * 1. Comments are skipped. Example: <!-- test -->
+ * Comment must end with -->. Nested comments are not possible.
+ * Note: constructions like <!--> <!---> are not valid comments in HTML but will be skipped by other rules.
+ *
+ * 2. CDATA is pasted verbatim.
+ * Note: CDATA is XML/XHTML specific. But we still process it for "best-effort" approach.
+ *
+ * 3. 'script' and 'style' elements are removed with all their content.
+ * Note: it's assumed that closing tag cannot appear inside content.
+ * For example, in a JS string literal it has to be escaped as "<\/script>".
+ * Note: comments and CDATA are possible inside script or style - then closing tags are not searched inside CDATA.
+ * Example: <script><![CDATA[</script>]]></script>
+ * But still searched inside comments. Sometimes it becomes complicated:
+ * <script>var x = "<!--"; </script> var y = "-->"; alert(x + y);</script>
+ * Note: script and style can be the names of XML namespaces - then they are not treated like the usual script or style elements.
+ * Example: <script:a>Hello</script:a>.
+ * Note: whitespaces are possible after closing tag name: </script > but not before: < / script>.
+ *
+ * 4. Other tags or tag-like elements are skipped without inner content.
+ * Example: <a>.</a>
+ * Note: it's expected that this HTML is illegal: <a test=">"></a>
+ * Note: it will also skip something like tags: <>, <!>, etc.
+ * Note: tag without end will be skipped to the end of input: <hello
+ * >
+ * 5. HTML and XML entities are not decoded.
+ * It should be processed by separate function.
+ *
+ * 6. Whitespaces in text are collapsed or inserted by specific rules.
+ * Whitespaces at beginning and at the end are removed.
+ * Consecutive whitespaces are collapsed.
+ * But if text is separated by other elements and there is no whitespace, it is inserted.
+ * It may be unnatural, examples: Hello<b>world</b>, Hello<!-- -->world
+ * - in HTML there will be no whitespace, but the function will insert it.
+ * But also consider: Hello<p>world</p>, Hello<br>world.
+ * This behaviour is reasonable for data analysis, e.g. convert HTML to a bag of words.
+ *
+ * 7. Also note that correct handling of whitespaces would require
+ * support of <pre></pre> and CSS display and white-space properties.
+ *
+ * Usage example:
+ *
+ * SELECT extractTextFromHTML(html) FROM url('https://github.com/ClickHouse/ClickHouse', RawBLOB, 'html String')
+ *
+ * - ClickHouse has embedded web browser.
+ */
+
+namespace DB
+{
+
+/// Error codes used by the exceptions thrown in this file.
+/// They are only declared here (`extern`); their definitions live elsewhere.
+namespace ErrorCodes
+{
+ extern const int ILLEGAL_COLUMN;
+ extern const int ILLEGAL_TYPE_OF_ARGUMENT;
+}
+
+namespace
+{
+
+/// Checks whether the bytes at `s` begin with `prefix`, looking no further than `end`.
+///
+/// The check is intentionally strict: at least one more byte must follow the prefix
+/// inside [s, end). A prefix that reaches exactly up to `end` is therefore NOT a match,
+/// which guarantees to callers that `s + prefix.length()` is still dereferenceable.
+///
+/// The remaining length is compared instead of computing `s + prefix.length()`,
+/// because forming a pointer beyond one-past-the-end of the buffer is undefined behaviour.
+inline bool startsWith(const char * s, const char * end, const std::string_view prefix)
+{
+    const size_t available = s < end ? static_cast<size_t>(end - s) : 0;
+    return available > prefix.length() && 0 == memcmp(s, prefix.data(), prefix.length());
+}
+
+/// If the input at `s` starts with `prefix` (as decided by startsWith), moves `s`
+/// just past the prefix and returns true. Otherwise leaves `s` untouched and returns false.
+inline bool checkAndSkip(const char * __restrict & s, const char * end, const std::string_view prefix)
+{
+    const bool matched = startsWith(s, end, prefix);
+    if (matched)
+        s += prefix.length();
+    return matched;
+}
+
+/// Skips an HTML comment located at `src`.
+///
+/// Returns false (leaving `src` untouched) when the input does not start with "<!--".
+/// Otherwise returns true and moves `src` just past the terminating "-->".
+/// If no terminator is found, `src` is left just past the last '>' that was examined
+/// (or just past "<!--" when there was none) and the function still returns true.
+bool processComment(const char * __restrict & src, const char * end)
+{
+    if (!checkAndSkip(src, end, "<!--"))
+        return false;
+
+    /// The two dashes of the terminator must belong to the comment body, not to the opening "<!--".
+    /// The bound is taken from the start of the body rather than from the moving `src`:
+    /// otherwise a terminator directly after an earlier '>' (as in "<!-- a>-->")
+    /// and the empty comment "<!---->" would not be recognized.
+    const char * const body_begin = src;
+
+    while (true)
+    {
+        const char * gt = find_first_symbols<'>'>(src, end);
+        if (gt >= end)
+            break;
+
+        /// In any case continue after this '>'.
+        src = gt + 1;
+
+        if (gt - body_begin >= 2 && gt[-1] == '-' && gt[-2] == '-')
+            break;
+    }
+
+    return true;
+}
+
+/// Handles a CDATA section located at `src`.
+///
+/// Returns false (leaving `src` untouched) when the input does not start with "<![CDATA[".
+/// Otherwise returns true. When the terminating "]]>" is found, the payload between the
+/// markers is copied verbatim to `dst` (skipped if `dst` is null) and `src` is moved past "]]>".
+/// When there is no terminator, `src` stays right after the opening marker and nothing is copied.
+bool processCDATA(const char * __restrict & src, const char * end, char * __restrict & dst)
+{
+    if (!checkAndSkip(src, end, "<![CDATA["))
+        return false;
+
+    for (const char * pos = src;; ++pos)
+    {
+        pos = find_first_symbols<'>'>(pos, end);
+        if (pos >= end)
+            return true;
+
+        /// Only "]]>" terminates the section; any other '>' is part of the payload.
+        /// Looking back two bytes is safe: in the worst case we read the opening marker itself.
+        if (pos[-1] != ']' || pos[-2] != ']')
+            continue;
+
+        if (dst)
+        {
+            const size_t payload_size = pos - src - strlen("]]");
+            memcpy(dst, src, payload_size);
+            dst += payload_size;
+        }
+
+        src = pos + 1;
+        return true;
+    }
+}
+
+/// Skips an element named `tag_name` (e.g. "script" or "style") together with all its content.
+///
+/// The opening tag must look like "<tag_name" followed by whitespace or '>', and it must be closed by '>'.
+/// If any of this does not hold, the function returns false and `src` is left exactly where it was,
+/// so that the caller can try other rules (e.g. skipTag) on the same position.
+/// Previously some of the failure paths left `src` in the middle of the tag, and the rest of
+/// an unterminated opening tag (e.g. "<script foo") was emitted as text.
+///
+/// On success returns true and `src` points just past the closing tag "</tag_name>"
+/// (whitespace is allowed between the name and '>'), or at/near the end of input if there is no closing tag.
+/// CDATA sections inside the content are skipped as a whole, so a closing tag inside CDATA is not recognized.
+bool processElementAndSkipContent(const char * __restrict & src, const char * end, const std::string_view tag_name)
+{
+    const auto * old_src = src;
+
+    if (!(src < end && *src == '<'))
+        return false;
+    ++src;
+
+    if (!checkAndSkip(src, end, tag_name))
+    {
+        src = old_src;
+        return false;
+    }
+
+    /// The tag name must end here: reject longer names such as <scripts> or <script:a>.
+    if (src >= end || !(isWhitespaceASCII(*src) || *src == '>'))
+    {
+        src = old_src;
+        return false;
+    }
+
+    /// Skip the attributes of the opening tag.
+    const char * gt = find_first_symbols<'>'>(src, end);
+    if (gt >= end)
+    {
+        /// The opening tag is not terminated: let the caller handle it as an ordinary tag.
+        src = old_src;
+        return false;
+    }
+
+    src = gt + 1;
+
+    /// Look for the closing tag.
+    while (true)
+    {
+        const char * lt = find_first_symbols<'<'>(src, end);
+        src = lt;
+
+        /// Need at least one byte after '<' to continue.
+        if (src + 1 >= end)
+            break;
+
+        ++src;
+
+        /// Skip CDATA
+        if (*src == '!')
+        {
+            /// processCDATA expects to be positioned at '<'; a null destination means "do not copy".
+            --src;
+            char * dst = nullptr;
+            if (processCDATA(src, end, dst))
+                continue;
+            ++src;
+        }
+
+        if (*src != '/')
+            continue;
+        ++src;
+
+        if (checkAndSkip(src, end, tag_name))
+        {
+            while (src < end && isWhitespaceASCII(*src))
+                ++src;
+
+            if (src >= end)
+                break;
+
+            if (*src == '>')
+            {
+                ++src;
+                break;
+            }
+        }
+    }
+
+    return true;
+}
+
+/// Skips a tag or anything tag-like that starts with '<' at `src`.
+///
+/// Returns false if `src` is at the end of input or does not point to '<'.
+/// Otherwise moves `src` just past the nearest '>' (or to `end` if there is none) and returns true.
+bool skipTag(const char * __restrict & src, const char * end)
+{
+    if (src >= end || *src != '<')
+        return false;
+
+    const char * gt = find_first_symbols<'>'>(src, end);
+    src = gt < end ? gt + 1 : gt;
+    return true;
+}
+
+/// Copies the text between `src` and the next '<' (or `end`) to `dst`, normalizing whitespace:
+/// leading and trailing whitespace is dropped and every inner run of whitespace becomes a single space.
+/// If `needs_whitespace` is set and there is any text to copy, a single space is written before it.
+/// On return `src` points to the '<' (or to `end`) and `dst` points past the written bytes.
+void copyText(const char * __restrict & src, const char * end, char * __restrict & dst, bool needs_whitespace)
+{
+    /// Leading whitespace is never copied.
+    for (; src < end && isWhitespaceASCII(*src); ++src)
+    {
+    }
+
+    const char * text_end = find_first_symbols<'<'>(src, end);
+
+    /// Separate this fragment from the previously written one.
+    if (src < text_end && needs_whitespace)
+        *dst++ = ' ';
+
+    for (;;)
+    {
+        /// Copy one word.
+        const char * word_end = find_first_symbols<' ', '\t', '\n', '\r', '\f', '\v'>(src, text_end);
+        const size_t word_size = word_end - src;
+        memcpy(dst, src, word_size);
+        dst += word_size;
+
+        /// Skip the whitespace after the word.
+        for (src = word_end; src < text_end && isWhitespaceASCII(*src); ++src)
+        {
+        }
+
+        /// Nothing but whitespace was left: do not emit a trailing space.
+        if (src >= text_end)
+            break;
+
+        *dst++ = ' ';
+    }
+
+    src = text_end;
+}
+
+/// Extracts the text from `size` bytes of HTML at `src`, writes it to `dst`
+/// and returns the number of bytes written.
+///
+/// The rules are applied in this order of priority:
+/// - comments are removed with all their content;
+/// - CDATA is copied verbatim;
+/// - elements 'script' and 'style' are removed with all their content;
+/// - for other elements the tags are removed but the content is processed as text.
+size_t extract(const char * __restrict src, size_t size, char * __restrict dst)
+{
+    const char * const end = src + size;
+    char * const out_begin = dst;
+
+    while (src < end)
+    {
+        /// A separator is needed if something was already written and it does not end with a space.
+        const bool needs_whitespace = dst != out_begin && dst[-1] != ' ';
+        copyText(src, end, dst, needs_whitespace);
+
+        /// Now we are at '<' or at the end of input. The first rule that matches consumes the markup.
+        if (processComment(src, end))
+            continue;
+        if (processCDATA(src, end, dst))
+            continue;
+        if (processElementAndSkipContent(src, end, "script"))
+            continue;
+        if (processElementAndSkipContent(src, end, "style"))
+            continue;
+
+        skipTag(src, end);
+    }
+
+    return dst - out_begin;
+}
+
+}
+
+
+/// SQL function extractTextFromHTML(String) -> String.
+/// Applies `extract` to every row of a string column.
+class FunctionExtractTextFromHTML : public IFunction
+{
+public:
+    static constexpr auto name = "extractTextFromHTML";
+
+    static FunctionPtr create(ContextPtr) { return std::make_shared<FunctionExtractTextFromHTML>(); }
+
+    String getName() const override { return name; }
+
+    size_t getNumberOfArguments() const override { return 1; }
+
+    bool useDefaultImplementationForConstants() const override { return true; }
+
+    bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return true; }
+
+    /// The only argument must be a String; the result has the same type as the argument.
+    DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override
+    {
+        const auto & argument_type = arguments[0];
+        if (isString(argument_type))
+            return argument_type;
+
+        throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Illegal type {} of argument of function {}",
+            argument_type->getName(), getName());
+    }
+
+    ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t rows) const override
+    {
+        const auto * input = checkAndGetColumn<ColumnString>(arguments[0].column.get());
+        if (!input)
+            throw Exception(ErrorCodes::ILLEGAL_COLUMN, "First argument for function {} must be string.", getName());
+
+        const auto & input_chars = input->getChars();
+        const auto & input_offsets = input->getOffsets();
+
+        auto result = ColumnString::create();
+        auto & result_chars = result->getChars();
+        auto & result_offsets = result->getOffsets();
+
+        /// The output buffer gets the size of the input buffer and is shrunk to the actual size at the end.
+        /// NOTE(review): this relies on `extract` never writing more bytes than it reads - confirm.
+        result_chars.resize(input_chars.size());
+        result_offsets.resize(input_offsets.size());
+
+        ColumnString::Offset row_begin = 0;
+        ColumnString::Offset written = 0;
+
+        for (size_t row = 0; row < rows; ++row)
+        {
+            const auto row_end = input_offsets[row];
+
+            /// The last byte of every row is excluded from the HTML passed to `extract`
+            /// (presumably the terminating zero of the string - a zero byte is appended to every result row below).
+            const auto * html = reinterpret_cast<const char *>(&input_chars[row_begin]);
+            const size_t html_size = row_end - row_begin - 1;
+            auto * text = reinterpret_cast<char *>(&result_chars[written]);
+
+            written += extract(html, html_size, text);
+
+            result_chars[written] = 0;
+            ++written;
+            result_offsets[row] = written;
+
+            row_begin = row_end;
+        }
+
+        result_chars.resize(written);
+        return result;
+    }
+};
+
+/// Registers FunctionExtractTextFromHTML in the function factory.
+REGISTER_FUNCTION(ExtractTextFromHTML)
+{
+ factory.registerFunction<FunctionExtractTextFromHTML>();
+}
+
+}