diff options
author | bnagaev <[email protected]> | 2022-02-10 16:47:04 +0300 |
---|---|---|
committer | Daniil Cherednik <[email protected]> | 2022-02-10 16:47:04 +0300 |
commit | d6449ba66291ff0c0d352c82e6eb3efb4c8a7e8d (patch) | |
tree | d5dca6d44593f5e52556a1cc7b1ab0386e096ebe /contrib/libs/yaml-cpp/src/stream.cpp | |
parent | 1861d4c1402bb2c67a3e6b43b51706081b74508a (diff) |
Restoring authorship annotation for <[email protected]>. Commit 1 of 2.
Diffstat (limited to 'contrib/libs/yaml-cpp/src/stream.cpp')
-rw-r--r-- | contrib/libs/yaml-cpp/src/stream.cpp | 896 |
1 files changed, 448 insertions, 448 deletions
diff --git a/contrib/libs/yaml-cpp/src/stream.cpp b/contrib/libs/yaml-cpp/src/stream.cpp index 3b013cfa7d3..070eda1ad9b 100644 --- a/contrib/libs/yaml-cpp/src/stream.cpp +++ b/contrib/libs/yaml-cpp/src/stream.cpp @@ -1,448 +1,448 @@ -#include <iostream> - -#include "stream.h" - -#ifndef YAML_PREFETCH_SIZE -#define YAML_PREFETCH_SIZE 2048 -#endif - -#define S_ARRAY_SIZE(A) (sizeof(A) / sizeof(*(A))) -#define S_ARRAY_END(A) ((A) + S_ARRAY_SIZE(A)) - -#define CP_REPLACEMENT_CHARACTER (0xFFFD) - -namespace YAML { -enum UtfIntroState { - uis_start, - uis_utfbe_b1, - uis_utf32be_b2, - uis_utf32be_bom3, - uis_utf32be, - uis_utf16be, - uis_utf16be_bom1, - uis_utfle_bom1, - uis_utf16le_bom2, - uis_utf32le_bom3, - uis_utf16le, - uis_utf32le, - uis_utf8_imp, - uis_utf16le_imp, - uis_utf32le_imp3, - uis_utf8_bom1, - uis_utf8_bom2, - uis_utf8, - uis_error -}; - -enum UtfIntroCharType { - uict00, - uictBB, - uictBF, - uictEF, - uictFE, - uictFF, - uictAscii, - uictOther, - uictMax -}; - -static bool s_introFinalState[] = { - false, // uis_start - false, // uis_utfbe_b1 - false, // uis_utf32be_b2 - false, // uis_utf32be_bom3 - true, // uis_utf32be - true, // uis_utf16be - false, // uis_utf16be_bom1 - false, // uis_utfle_bom1 - false, // uis_utf16le_bom2 - false, // uis_utf32le_bom3 - true, // uis_utf16le - true, // uis_utf32le - false, // uis_utf8_imp - false, // uis_utf16le_imp - false, // uis_utf32le_imp3 - false, // uis_utf8_bom1 - false, // uis_utf8_bom2 - true, // uis_utf8 - true, // uis_error -}; - -static UtfIntroState s_introTransitions[][uictMax] = { - // uict00, uictBB, uictBF, uictEF, - // uictFE, uictFF, uictAscii, uictOther - {uis_utfbe_b1, uis_utf8, uis_utf8, uis_utf8_bom1, uis_utf16be_bom1, - uis_utfle_bom1, uis_utf8_imp, uis_utf8}, - {uis_utf32be_b2, uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8, - uis_utf16be, uis_utf8}, - {uis_utf32be, uis_utf8, uis_utf8, uis_utf8, uis_utf32be_bom3, uis_utf8, - uis_utf8, uis_utf8}, - {uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf32be, uis_utf8, - uis_utf8}, - {uis_utf32be, uis_utf32be, uis_utf32be, uis_utf32be, uis_utf32be, - uis_utf32be, uis_utf32be, uis_utf32be}, - {uis_utf16be, uis_utf16be, uis_utf16be, uis_utf16be, uis_utf16be, - uis_utf16be, uis_utf16be, uis_utf16be}, - {uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf16be, uis_utf8, - uis_utf8}, - {uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf16le_bom2, uis_utf8, - uis_utf8, uis_utf8}, - {uis_utf32le_bom3, uis_utf16le, uis_utf16le, uis_utf16le, uis_utf16le, - uis_utf16le, uis_utf16le, uis_utf16le}, - {uis_utf32le, uis_utf16le, uis_utf16le, uis_utf16le, uis_utf16le, - uis_utf16le, uis_utf16le, uis_utf16le}, - {uis_utf16le, uis_utf16le, uis_utf16le, uis_utf16le, uis_utf16le, - uis_utf16le, uis_utf16le, uis_utf16le}, - {uis_utf32le, uis_utf32le, uis_utf32le, uis_utf32le, uis_utf32le, - uis_utf32le, uis_utf32le, uis_utf32le}, - {uis_utf16le_imp, uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8, - uis_utf8, uis_utf8}, - {uis_utf32le_imp3, uis_utf16le, uis_utf16le, uis_utf16le, uis_utf16le, - uis_utf16le, uis_utf16le, uis_utf16le}, - {uis_utf32le, uis_utf16le, uis_utf16le, uis_utf16le, uis_utf16le, - uis_utf16le, uis_utf16le, uis_utf16le}, - {uis_utf8, uis_utf8_bom2, uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8, - uis_utf8}, - {uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8, - uis_utf8}, - {uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8, - uis_utf8}, -}; - -static char s_introUngetCount[][uictMax] = { - // uict00, uictBB, uictBF, uictEF, uictFE, uictFF, uictAscii, uictOther - {0, 1, 1, 0, 0, 0, 0, 1}, - {0, 2, 2, 2, 2, 2, 2, 2}, - {3, 3, 3, 3, 0, 3, 3, 3}, - {4, 4, 4, 4, 4, 0, 4, 4}, - {1, 1, 1, 1, 1, 1, 1, 1}, - {1, 1, 1, 1, 1, 1, 1, 1}, - {2, 2, 2, 2, 2, 0, 2, 2}, - {2, 2, 2, 2, 0, 2, 2, 2}, - {0, 1, 1, 1, 1, 1, 1, 1}, - {0, 2, 2, 2, 2, 2, 2, 2}, - {1, 1, 1, 1, 1, 1, 1, 1}, - {1, 1, 1, 1, 1, 1, 1, 1}, - {0, 2, 2, 2, 2, 2, 2, 2}, - {0, 3, 3, 3, 3, 3, 3, 3}, - {4, 4, 4, 4, 4, 4, 4, 4}, - {2, 0, 2, 2, 2, 2, 2, 2}, - {3, 3, 0, 3, 3, 3, 3, 3}, - {1, 1, 1, 1, 1, 1, 1, 1}, -}; - -inline UtfIntroCharType IntroCharTypeOf(std::istream::int_type ch) { - if (std::istream::traits_type::eof() == ch) { - return uictOther; - } - - switch (ch) { - case 0: - return uict00; - case 0xBB: - return uictBB; - case 0xBF: - return uictBF; - case 0xEF: - return uictEF; - case 0xFE: - return uictFE; - case 0xFF: - return uictFF; - } - - if ((ch > 0) && (ch < 0xFF)) { - return uictAscii; - } - - return uictOther; -} - -inline char Utf8Adjust(unsigned long ch, unsigned char lead_bits, - unsigned char rshift) { - const unsigned char header = ((1 << lead_bits) - 1) << (8 - lead_bits); - const unsigned char mask = (0xFF >> (lead_bits + 1)); - return static_cast<char>( - static_cast<unsigned char>(header | ((ch >> rshift) & mask))); -} - -inline void QueueUnicodeCodepoint(std::deque<char>& q, unsigned long ch) { - // We are not allowed to queue the Stream::eof() codepoint, so - // replace it with CP_REPLACEMENT_CHARACTER - if (static_cast<unsigned long>(Stream::eof()) == ch) { - ch = CP_REPLACEMENT_CHARACTER; - } - - if (ch < 0x80) { - q.push_back(Utf8Adjust(ch, 0, 0)); - } else if (ch < 0x800) { - q.push_back(Utf8Adjust(ch, 2, 6)); - q.push_back(Utf8Adjust(ch, 1, 0)); - } else if (ch < 0x10000) { - q.push_back(Utf8Adjust(ch, 3, 12)); - q.push_back(Utf8Adjust(ch, 1, 6)); - q.push_back(Utf8Adjust(ch, 1, 0)); - } else { - q.push_back(Utf8Adjust(ch, 4, 18)); - q.push_back(Utf8Adjust(ch, 1, 12)); - q.push_back(Utf8Adjust(ch, 1, 6)); - q.push_back(Utf8Adjust(ch, 1, 0)); - } -} - -Stream::Stream(std::istream& input) - : m_input(input), - m_pPrefetched(new unsigned char[YAML_PREFETCH_SIZE]), - m_nPrefetchedAvailable(0), - m_nPrefetchedUsed(0) { - typedef std::istream::traits_type char_traits; - - if (!input) - return; - - // Determine (or guess) the character-set by reading the BOM, if any. See - // the YAML specification for the determination algorithm. - char_traits::int_type intro[4]; - int nIntroUsed = 0; - UtfIntroState state = uis_start; - for (; !s_introFinalState[state];) { - std::istream::int_type ch = input.get(); - intro[nIntroUsed++] = ch; - UtfIntroCharType charType = IntroCharTypeOf(ch); - UtfIntroState newState = s_introTransitions[state][charType]; - int nUngets = s_introUngetCount[state][charType]; - if (nUngets > 0) { - input.clear(); - for (; nUngets > 0; --nUngets) { - if (char_traits::eof() != intro[--nIntroUsed]) - input.putback(char_traits::to_char_type(intro[nIntroUsed])); - } - } - state = newState; - } - - switch (state) { - case uis_utf8: - m_charSet = utf8; - break; - case uis_utf16le: - m_charSet = utf16le; - break; - case uis_utf16be: - m_charSet = utf16be; - break; - case uis_utf32le: - m_charSet = utf32le; - break; - case uis_utf32be: - m_charSet = utf32be; - break; - default: - m_charSet = utf8; - break; - } - - ReadAheadTo(0); -} - -Stream::~Stream() { delete[] m_pPrefetched; } - -char Stream::peek() const { - if (m_readahead.empty()) { - return Stream::eof(); - } - - return m_readahead[0]; -} - -Stream::operator bool() const { - return m_input.good() || - (!m_readahead.empty() && m_readahead[0] != Stream::eof()); -} - -// get -// . Extracts a character from the stream and updates our position -char Stream::get() { - char ch = peek(); - AdvanceCurrent(); - m_mark.column++; - - if (ch == '\n') { - m_mark.column = 0; - m_mark.line++; - } - - return ch; -} - -// get -// . Extracts 'n' characters from the stream and updates our position -std::string Stream::get(int n) { - std::string ret; - ret.reserve(n); - for (int i = 0; i < n; i++) - ret += get(); - return ret; -} - -// eat -// . Eats 'n' characters and updates our position. -void Stream::eat(int n) { - for (int i = 0; i < n; i++) - get(); -} - -void Stream::AdvanceCurrent() { - if (!m_readahead.empty()) { - m_readahead.pop_front(); - m_mark.pos++; - } - - ReadAheadTo(0); -} - -bool Stream::_ReadAheadTo(size_t i) const { - while (m_input.good() && (m_readahead.size() <= i)) { - switch (m_charSet) { - case utf8: - StreamInUtf8(); - break; - case utf16le: - StreamInUtf16(); - break; - case utf16be: - StreamInUtf16(); - break; - case utf32le: - StreamInUtf32(); - break; - case utf32be: - StreamInUtf32(); - break; - } - } - - // signal end of stream - if (!m_input.good()) - m_readahead.push_back(Stream::eof()); - - return m_readahead.size() > i; -} - -void Stream::StreamInUtf8() const { - unsigned char b = GetNextByte(); - if (m_input.good()) { - m_readahead.push_back(b); - } -} - -void Stream::StreamInUtf16() const { - unsigned long ch = 0; - unsigned char bytes[2]; - int nBigEnd = (m_charSet == utf16be) ? 0 : 1; - - bytes[0] = GetNextByte(); - bytes[1] = GetNextByte(); - if (!m_input.good()) { - return; - } - ch = (static_cast<unsigned long>(bytes[nBigEnd]) << 8) | - static_cast<unsigned long>(bytes[1 ^ nBigEnd]); - - if (ch >= 0xDC00 && ch < 0xE000) { - // Trailing (low) surrogate...ugh, wrong order - QueueUnicodeCodepoint(m_readahead, CP_REPLACEMENT_CHARACTER); - return; - } else if (ch >= 0xD800 && ch < 0xDC00) { - // ch is a leading (high) surrogate - - // Four byte UTF-8 code point - - // Read the trailing (low) surrogate - for (;;) { - bytes[0] = GetNextByte(); - bytes[1] = GetNextByte(); - if (!m_input.good()) { - QueueUnicodeCodepoint(m_readahead, CP_REPLACEMENT_CHARACTER); - return; - } - unsigned long chLow = (static_cast<unsigned long>(bytes[nBigEnd]) << 8) | - static_cast<unsigned long>(bytes[1 ^ nBigEnd]); - if (chLow < 0xDC00 || chLow >= 0xE000) { - // Trouble...not a low surrogate. Dump a REPLACEMENT CHARACTER into the - // stream. - QueueUnicodeCodepoint(m_readahead, CP_REPLACEMENT_CHARACTER); - - // Deal with the next UTF-16 unit - if (chLow < 0xD800 || chLow >= 0xE000) { - // Easiest case: queue the codepoint and return - QueueUnicodeCodepoint(m_readahead, ch); - return; - } else { - // Start the loop over with the new high surrogate - ch = chLow; - continue; - } - } - - // Select the payload bits from the high surrogate - ch &= 0x3FF; - ch <<= 10; - - // Include bits from low surrogate - ch |= (chLow & 0x3FF); - - // Add the surrogacy offset - ch += 0x10000; - break; - } - } - - QueueUnicodeCodepoint(m_readahead, ch); -} - -inline char* ReadBuffer(unsigned char* pBuffer) { - return reinterpret_cast<char*>(pBuffer); -} - -unsigned char Stream::GetNextByte() const { - if (m_nPrefetchedUsed >= m_nPrefetchedAvailable) { - std::streambuf* pBuf = m_input.rdbuf(); - m_nPrefetchedAvailable = static_cast<std::size_t>( - pBuf->sgetn(ReadBuffer(m_pPrefetched), YAML_PREFETCH_SIZE)); - m_nPrefetchedUsed = 0; - if (!m_nPrefetchedAvailable) { - m_input.setstate(std::ios_base::eofbit); - } - - if (0 == m_nPrefetchedAvailable) { - return 0; - } - } - - return m_pPrefetched[m_nPrefetchedUsed++]; -} - -void Stream::StreamInUtf32() const { - static int indexes[2][4] = {{3, 2, 1, 0}, {0, 1, 2, 3}}; - - unsigned long ch = 0; - unsigned char bytes[4]; - int* pIndexes = (m_charSet == utf32be) ? indexes[1] : indexes[0]; - - bytes[0] = GetNextByte(); - bytes[1] = GetNextByte(); - bytes[2] = GetNextByte(); - bytes[3] = GetNextByte(); - if (!m_input.good()) { - return; - } - - for (int i = 0; i < 4; ++i) { - ch <<= 8; - ch |= bytes[pIndexes[i]]; - } - - QueueUnicodeCodepoint(m_readahead, ch); -} -} +#include <iostream> + +#include "stream.h" + +#ifndef YAML_PREFETCH_SIZE +#define YAML_PREFETCH_SIZE 2048 +#endif + +#define S_ARRAY_SIZE(A) (sizeof(A) / sizeof(*(A))) +#define S_ARRAY_END(A) ((A) + S_ARRAY_SIZE(A)) + +#define CP_REPLACEMENT_CHARACTER (0xFFFD) + +namespace YAML { +enum UtfIntroState { + uis_start, + uis_utfbe_b1, + uis_utf32be_b2, + uis_utf32be_bom3, + uis_utf32be, + uis_utf16be, + uis_utf16be_bom1, + uis_utfle_bom1, + uis_utf16le_bom2, + uis_utf32le_bom3, + uis_utf16le, + uis_utf32le, + uis_utf8_imp, + uis_utf16le_imp, + uis_utf32le_imp3, + uis_utf8_bom1, + uis_utf8_bom2, + uis_utf8, + uis_error +}; + +enum UtfIntroCharType { + uict00, + uictBB, + uictBF, + uictEF, + uictFE, + uictFF, + uictAscii, + uictOther, + uictMax +}; + +static bool s_introFinalState[] = { + false, // uis_start + false, // uis_utfbe_b1 + false, // uis_utf32be_b2 + false, // uis_utf32be_bom3 + true, // uis_utf32be + true, // uis_utf16be + false, // uis_utf16be_bom1 + false, // uis_utfle_bom1 + false, // uis_utf16le_bom2 + false, // uis_utf32le_bom3 + true, // uis_utf16le + true, // uis_utf32le + false, // uis_utf8_imp + false, // uis_utf16le_imp + false, // uis_utf32le_imp3 + false, // uis_utf8_bom1 + false, // uis_utf8_bom2 + true, // uis_utf8 + true, // uis_error +}; + +static UtfIntroState s_introTransitions[][uictMax] = { + // uict00, uictBB, uictBF, uictEF, + // uictFE, uictFF, uictAscii, uictOther + {uis_utfbe_b1, uis_utf8, uis_utf8, uis_utf8_bom1, uis_utf16be_bom1, + uis_utfle_bom1, uis_utf8_imp, uis_utf8}, + {uis_utf32be_b2, uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8, + uis_utf16be, uis_utf8}, + {uis_utf32be, uis_utf8, uis_utf8, uis_utf8, uis_utf32be_bom3, uis_utf8, + uis_utf8, uis_utf8}, + {uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf32be, uis_utf8, + uis_utf8}, + {uis_utf32be, uis_utf32be, uis_utf32be, uis_utf32be, uis_utf32be, + uis_utf32be, uis_utf32be, uis_utf32be}, + {uis_utf16be, uis_utf16be, uis_utf16be, uis_utf16be, uis_utf16be, + uis_utf16be, uis_utf16be, uis_utf16be}, + {uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf16be, uis_utf8, + uis_utf8}, + {uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf16le_bom2, uis_utf8, + uis_utf8, uis_utf8}, + {uis_utf32le_bom3, uis_utf16le, uis_utf16le, uis_utf16le, uis_utf16le, + uis_utf16le, uis_utf16le, uis_utf16le}, + {uis_utf32le, uis_utf16le, uis_utf16le, uis_utf16le, uis_utf16le, + uis_utf16le, uis_utf16le, uis_utf16le}, + {uis_utf16le, uis_utf16le, uis_utf16le, uis_utf16le, uis_utf16le, + uis_utf16le, uis_utf16le, uis_utf16le}, + {uis_utf32le, uis_utf32le, uis_utf32le, uis_utf32le, uis_utf32le, + uis_utf32le, uis_utf32le, uis_utf32le}, + {uis_utf16le_imp, uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8, + uis_utf8, uis_utf8}, + {uis_utf32le_imp3, uis_utf16le, uis_utf16le, uis_utf16le, uis_utf16le, + uis_utf16le, uis_utf16le, uis_utf16le}, + {uis_utf32le, uis_utf16le, uis_utf16le, uis_utf16le, uis_utf16le, + uis_utf16le, uis_utf16le, uis_utf16le}, + {uis_utf8, uis_utf8_bom2, uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8, + uis_utf8}, + {uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8, + uis_utf8}, + {uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8, + uis_utf8}, +}; + +static char s_introUngetCount[][uictMax] = { + // uict00, uictBB, uictBF, uictEF, uictFE, uictFF, uictAscii, uictOther + {0, 1, 1, 0, 0, 0, 0, 1}, + {0, 2, 2, 2, 2, 2, 2, 2}, + {3, 3, 3, 3, 0, 3, 3, 3}, + {4, 4, 4, 4, 4, 0, 4, 4}, + {1, 1, 1, 1, 1, 1, 1, 1}, + {1, 1, 1, 1, 1, 1, 1, 1}, + {2, 2, 2, 2, 2, 0, 2, 2}, + {2, 2, 2, 2, 0, 2, 2, 2}, + {0, 1, 1, 1, 1, 1, 1, 1}, + {0, 2, 2, 2, 2, 2, 2, 2}, + {1, 1, 1, 1, 1, 1, 1, 1}, + {1, 1, 1, 1, 1, 1, 1, 1}, + {0, 2, 2, 2, 2, 2, 2, 2}, + {0, 3, 3, 3, 3, 3, 3, 3}, + {4, 4, 4, 4, 4, 4, 4, 4}, + {2, 0, 2, 2, 2, 2, 2, 2}, + {3, 3, 0, 3, 3, 3, 3, 3}, + {1, 1, 1, 1, 1, 1, 1, 1}, +}; + +inline UtfIntroCharType IntroCharTypeOf(std::istream::int_type ch) { + if (std::istream::traits_type::eof() == ch) { + return uictOther; + } + + switch (ch) { + case 0: + return uict00; + case 0xBB: + return uictBB; + case 0xBF: + return uictBF; + case 0xEF: + return uictEF; + case 0xFE: + return uictFE; + case 0xFF: + return uictFF; + } + + if ((ch > 0) && (ch < 0xFF)) { + return uictAscii; + } + + return uictOther; +} + +inline char Utf8Adjust(unsigned long ch, unsigned char lead_bits, + unsigned char rshift) { + const unsigned char header = ((1 << lead_bits) - 1) << (8 - lead_bits); + const unsigned char mask = (0xFF >> (lead_bits + 1)); + return static_cast<char>( + static_cast<unsigned char>(header | ((ch >> rshift) & mask))); +} + +inline void QueueUnicodeCodepoint(std::deque<char>& q, unsigned long ch) { + // We are not allowed to queue the Stream::eof() codepoint, so + // replace it with CP_REPLACEMENT_CHARACTER + if (static_cast<unsigned long>(Stream::eof()) == ch) { + ch = CP_REPLACEMENT_CHARACTER; + } + + if (ch < 0x80) { + q.push_back(Utf8Adjust(ch, 0, 0)); + } else if (ch < 0x800) { + q.push_back(Utf8Adjust(ch, 2, 6)); + q.push_back(Utf8Adjust(ch, 1, 0)); + } else if (ch < 0x10000) { + q.push_back(Utf8Adjust(ch, 3, 12)); + q.push_back(Utf8Adjust(ch, 1, 6)); + q.push_back(Utf8Adjust(ch, 1, 0)); + } else { + q.push_back(Utf8Adjust(ch, 4, 18)); + q.push_back(Utf8Adjust(ch, 1, 12)); + q.push_back(Utf8Adjust(ch, 1, 6)); + q.push_back(Utf8Adjust(ch, 1, 0)); + } +} + +Stream::Stream(std::istream& input) + : m_input(input), + m_pPrefetched(new unsigned char[YAML_PREFETCH_SIZE]), + m_nPrefetchedAvailable(0), + m_nPrefetchedUsed(0) { + typedef std::istream::traits_type char_traits; + + if (!input) + return; + + // Determine (or guess) the character-set by reading the BOM, if any. See + // the YAML specification for the determination algorithm. + char_traits::int_type intro[4]; + int nIntroUsed = 0; + UtfIntroState state = uis_start; + for (; !s_introFinalState[state];) { + std::istream::int_type ch = input.get(); + intro[nIntroUsed++] = ch; + UtfIntroCharType charType = IntroCharTypeOf(ch); + UtfIntroState newState = s_introTransitions[state][charType]; + int nUngets = s_introUngetCount[state][charType]; + if (nUngets > 0) { + input.clear(); + for (; nUngets > 0; --nUngets) { + if (char_traits::eof() != intro[--nIntroUsed]) + input.putback(char_traits::to_char_type(intro[nIntroUsed])); + } + } + state = newState; + } + + switch (state) { + case uis_utf8: + m_charSet = utf8; + break; + case uis_utf16le: + m_charSet = utf16le; + break; + case uis_utf16be: + m_charSet = utf16be; + break; + case uis_utf32le: + m_charSet = utf32le; + break; + case uis_utf32be: + m_charSet = utf32be; + break; + default: + m_charSet = utf8; + break; + } + + ReadAheadTo(0); +} + +Stream::~Stream() { delete[] m_pPrefetched; } + +char Stream::peek() const { + if (m_readahead.empty()) { + return Stream::eof(); + } + + return m_readahead[0]; +} + +Stream::operator bool() const { + return m_input.good() || + (!m_readahead.empty() && m_readahead[0] != Stream::eof()); +} + +// get +// . Extracts a character from the stream and updates our position +char Stream::get() { + char ch = peek(); + AdvanceCurrent(); + m_mark.column++; + + if (ch == '\n') { + m_mark.column = 0; + m_mark.line++; + } + + return ch; +} + +// get +// . Extracts 'n' characters from the stream and updates our position +std::string Stream::get(int n) { + std::string ret; + ret.reserve(n); + for (int i = 0; i < n; i++) + ret += get(); + return ret; +} + +// eat +// . Eats 'n' characters and updates our position. +void Stream::eat(int n) { + for (int i = 0; i < n; i++) + get(); +} + +void Stream::AdvanceCurrent() { + if (!m_readahead.empty()) { + m_readahead.pop_front(); + m_mark.pos++; + } + + ReadAheadTo(0); +} + +bool Stream::_ReadAheadTo(size_t i) const { + while (m_input.good() && (m_readahead.size() <= i)) { + switch (m_charSet) { + case utf8: + StreamInUtf8(); + break; + case utf16le: + StreamInUtf16(); + break; + case utf16be: + StreamInUtf16(); + break; + case utf32le: + StreamInUtf32(); + break; + case utf32be: + StreamInUtf32(); + break; + } + } + + // signal end of stream + if (!m_input.good()) + m_readahead.push_back(Stream::eof()); + + return m_readahead.size() > i; +} + +void Stream::StreamInUtf8() const { + unsigned char b = GetNextByte(); + if (m_input.good()) { + m_readahead.push_back(b); + } +} + +void Stream::StreamInUtf16() const { + unsigned long ch = 0; + unsigned char bytes[2]; + int nBigEnd = (m_charSet == utf16be) ? 0 : 1; + + bytes[0] = GetNextByte(); + bytes[1] = GetNextByte(); + if (!m_input.good()) { + return; + } + ch = (static_cast<unsigned long>(bytes[nBigEnd]) << 8) | + static_cast<unsigned long>(bytes[1 ^ nBigEnd]); + + if (ch >= 0xDC00 && ch < 0xE000) { + // Trailing (low) surrogate...ugh, wrong order + QueueUnicodeCodepoint(m_readahead, CP_REPLACEMENT_CHARACTER); + return; + } else if (ch >= 0xD800 && ch < 0xDC00) { + // ch is a leading (high) surrogate + + // Four byte UTF-8 code point + + // Read the trailing (low) surrogate + for (;;) { + bytes[0] = GetNextByte(); + bytes[1] = GetNextByte(); + if (!m_input.good()) { + QueueUnicodeCodepoint(m_readahead, CP_REPLACEMENT_CHARACTER); + return; + } + unsigned long chLow = (static_cast<unsigned long>(bytes[nBigEnd]) << 8) | + static_cast<unsigned long>(bytes[1 ^ nBigEnd]); + if (chLow < 0xDC00 || chLow >= 0xE000) { + // Trouble...not a low surrogate. Dump a REPLACEMENT CHARACTER into the + // stream. + QueueUnicodeCodepoint(m_readahead, CP_REPLACEMENT_CHARACTER); + + // Deal with the next UTF-16 unit + if (chLow < 0xD800 || chLow >= 0xE000) { + // Easiest case: queue the codepoint and return + QueueUnicodeCodepoint(m_readahead, ch); + return; + } else { + // Start the loop over with the new high surrogate + ch = chLow; + continue; + } + } + + // Select the payload bits from the high surrogate + ch &= 0x3FF; + ch <<= 10; + + // Include bits from low surrogate + ch |= (chLow & 0x3FF); + + // Add the surrogacy offset + ch += 0x10000; + break; + } + } + + QueueUnicodeCodepoint(m_readahead, ch); +} + +inline char* ReadBuffer(unsigned char* pBuffer) { + return reinterpret_cast<char*>(pBuffer); +} + +unsigned char Stream::GetNextByte() const { + if (m_nPrefetchedUsed >= m_nPrefetchedAvailable) { + std::streambuf* pBuf = m_input.rdbuf(); + m_nPrefetchedAvailable = static_cast<std::size_t>( + pBuf->sgetn(ReadBuffer(m_pPrefetched), YAML_PREFETCH_SIZE)); + m_nPrefetchedUsed = 0; + if (!m_nPrefetchedAvailable) { + m_input.setstate(std::ios_base::eofbit); + } + + if (0 == m_nPrefetchedAvailable) { + return 0; + } + } + + return m_pPrefetched[m_nPrefetchedUsed++]; +} + +void Stream::StreamInUtf32() const { + static int indexes[2][4] = {{3, 2, 1, 0}, {0, 1, 2, 3}}; + + unsigned long ch = 0; + unsigned char bytes[4]; + int* pIndexes = (m_charSet == utf32be) ? indexes[1] : indexes[0]; + + bytes[0] = GetNextByte(); + bytes[1] = GetNextByte(); + bytes[2] = GetNextByte(); + bytes[3] = GetNextByte(); + if (!m_input.good()) { + return; + } + + for (int i = 0; i < 4; ++i) { + ch <<= 8; + ch |= bytes[pIndexes[i]]; + } + + QueueUnicodeCodepoint(m_readahead, ch); +} +} |