diff options
author | monster <monster@ydb.tech> | 2022-07-07 14:41:37 +0300 |
---|---|---|
committer | monster <monster@ydb.tech> | 2022-07-07 14:41:37 +0300 |
commit | 06e5c21a835c0e923506c4ff27929f34e00761c2 (patch) | |
tree | 75efcbc6854ef9bd476eb8bf00cc5c900da436a2 /library/cpp/html/entity/htmlentity.h | |
parent | 03f024c4412e3aa613bb543cf1660176320ba8f4 (diff) | |
download | ydb-06e5c21a835c0e923506c4ff27929f34e00761c2.tar.gz |
fix ya.make
Diffstat (limited to 'library/cpp/html/entity/htmlentity.h')
-rw-r--r-- | library/cpp/html/entity/htmlentity.h | 84 |
1 files changed, 84 insertions, 0 deletions
diff --git a/library/cpp/html/entity/htmlentity.h b/library/cpp/html/entity/htmlentity.h new file mode 100644 index 0000000000..0da2167d28 --- /dev/null +++ b/library/cpp/html/entity/htmlentity.h @@ -0,0 +1,84 @@ +#pragma once + +#include "decoder.h" + +#include <util/system/defaults.h> +#include <library/cpp/charset/doccodes.h> +#include <util/generic/strbuf.h> +#include <utility> + +/******************************************************/ +/* direct decoding actions */ +/******************************************************/ + +//! Try to decode a named or numeric entity using general html5 standard rules. +//! @param str - string starting with '&'. +bool HtTryDecodeEntity(const char* str, size_t len, TEntity* entity); + +/******************************************************/ +/* step by step actions */ +/******************************************************/ + +// NOTE: Some entities have two codepoints; if an entity has one codepoint +// then the second wchar32 in the pair is zero. +// Decodes with html5 standard rules. +std::pair<wchar32, wchar32> HtEntDecodeStep(ECharset cp, const unsigned char*& s, size_t len, unsigned char** map); + +// Decodes assuming that ';' should always be present after the entity. +std::pair<wchar32, wchar32> HtEntPureDecodeStep(ECharset cp, const unsigned char*& s, size_t len, unsigned char** map); + +// Similar to HtEntDecodeStep, but does not decode named entities with two codepoints. +// Use HtEntDecodeStep and HtEntPureDecodeStep instead. +wchar32 HtEntOldDecodeStep(ECharset cp, const unsigned char*& s, size_t len, unsigned char** map); +wchar32 HtEntOldPureDecodeStep(ECharset cp, const unsigned char*& s, size_t len, unsigned char** map); + +/******************************************************/ +/* complete actions */ +/******************************************************/ + +// Try to decode str using general html5 standard rules. +// Stops when str or buffer ends. 
+size_t HtEntDecode(ECharset cp, const char* str, size_t len, wchar32* buffer, size_t buflen, unsigned char* char_lengthes = nullptr); + +size_t HtEntDecodeToUtf8(ECharset cp, const char* src, size_t srclen, char* dst, size_t dstlen); + +// Special rules for attributes decoding +// http://www.whatwg.org/specs/web-apps/current-work/multipage/syntax.html#character-reference-in-attribute-value-state +size_t HtDecodeAttrToUtf8(ECharset cp, const char* src, size_t srclen, char* dst, size_t dstlen); + +size_t HtEntDecodeToChar(ECharset cp, const char* str, size_t len, wchar16* buffer, unsigned char* char_lengthes = nullptr); + +/** + * decode HTML entities if any + * @param src input buffer + * @param dst output buffer + * @param dstlen output buffer length + * @param cpsrc input buffer encoding, ascii-compatible + * @param cpdst output buffer encoding, if different from cpsrc + * @return src if no entities and encodings are the same (dst remains untouched) + * NULL if dst was not sufficiently long + * dst-based output buffer with decoded string + * @note entities must be pure, with the terminating ";" + */ +TStringBuf HtTryEntDecodeAsciiCompat(const TStringBuf& src, char* dst, size_t dstlen, ECharset cpsrc = CODES_UTF8); +TStringBuf HtTryEntDecodeAsciiCompat(const TStringBuf& src, char* dst, size_t dstlen, ECharset cpsrc, ECharset cpdst); + +//! decodes HTML entities and converts non-ASCII characters to unicode, then converts unicode to UTF8 and percent-encodes +//! @param text zero-terminated text of link +//! @param buffer buffer receiving UTF8 percent-encoded text of link +//! @param buflen length of output buffer +//! @param cp code page object used to convert non-ASCII characters +//! @note HTML entities directly converted into unicode characters, non-ASCII characters +//! converted into unicode using code page object if it is passed to the function, +//! then unicode characters converted to UTF8 and percent-encoded, +//! 
percent-encoded text in the link copied into output buffer as is +bool HtLinkDecode(const char* text, char* buffer, size_t buflen, size_t& written, ECharset cp = CODES_UNKNOWN); +bool HtLinkDecode(const TStringBuf& text, char* buffer, size_t buflen, size_t& written, ECharset cp = CODES_UNKNOWN); + +static inline bool HtLinkDecode(const char* text, char* buffer, size_t buflen, ECharset cp = CODES_UNKNOWN) { + size_t written; + const bool ok = HtLinkDecode(text, buffer, buflen, written, cp); + if (ok) + buffer[written] = '\x00'; + return ok; +} |