summaryrefslogtreecommitdiffstats
path: root/contrib/libs/yaml-cpp/src/stream.cpp
diff options
context:
space:
mode:
authorbnagaev <[email protected]>2022-02-10 16:47:04 +0300
committerDaniil Cherednik <[email protected]>2022-02-10 16:47:04 +0300
commitd6449ba66291ff0c0d352c82e6eb3efb4c8a7e8d (patch)
treed5dca6d44593f5e52556a1cc7b1ab0386e096ebe /contrib/libs/yaml-cpp/src/stream.cpp
parent1861d4c1402bb2c67a3e6b43b51706081b74508a (diff)
Restoring authorship annotation for <[email protected]>. Commit 1 of 2.
Diffstat (limited to 'contrib/libs/yaml-cpp/src/stream.cpp')
-rw-r--r--contrib/libs/yaml-cpp/src/stream.cpp896
1 files changed, 448 insertions, 448 deletions
diff --git a/contrib/libs/yaml-cpp/src/stream.cpp b/contrib/libs/yaml-cpp/src/stream.cpp
index 3b013cfa7d3..070eda1ad9b 100644
--- a/contrib/libs/yaml-cpp/src/stream.cpp
+++ b/contrib/libs/yaml-cpp/src/stream.cpp
@@ -1,448 +1,448 @@
-#include <iostream>
-
-#include "stream.h"
-
-#ifndef YAML_PREFETCH_SIZE
-#define YAML_PREFETCH_SIZE 2048
-#endif
-
-#define S_ARRAY_SIZE(A) (sizeof(A) / sizeof(*(A)))
-#define S_ARRAY_END(A) ((A) + S_ARRAY_SIZE(A))
-
-#define CP_REPLACEMENT_CHARACTER (0xFFFD)
-
-namespace YAML {
-enum UtfIntroState {
- uis_start,
- uis_utfbe_b1,
- uis_utf32be_b2,
- uis_utf32be_bom3,
- uis_utf32be,
- uis_utf16be,
- uis_utf16be_bom1,
- uis_utfle_bom1,
- uis_utf16le_bom2,
- uis_utf32le_bom3,
- uis_utf16le,
- uis_utf32le,
- uis_utf8_imp,
- uis_utf16le_imp,
- uis_utf32le_imp3,
- uis_utf8_bom1,
- uis_utf8_bom2,
- uis_utf8,
- uis_error
-};
-
-enum UtfIntroCharType {
- uict00,
- uictBB,
- uictBF,
- uictEF,
- uictFE,
- uictFF,
- uictAscii,
- uictOther,
- uictMax
-};
-
-static bool s_introFinalState[] = {
- false, // uis_start
- false, // uis_utfbe_b1
- false, // uis_utf32be_b2
- false, // uis_utf32be_bom3
- true, // uis_utf32be
- true, // uis_utf16be
- false, // uis_utf16be_bom1
- false, // uis_utfle_bom1
- false, // uis_utf16le_bom2
- false, // uis_utf32le_bom3
- true, // uis_utf16le
- true, // uis_utf32le
- false, // uis_utf8_imp
- false, // uis_utf16le_imp
- false, // uis_utf32le_imp3
- false, // uis_utf8_bom1
- false, // uis_utf8_bom2
- true, // uis_utf8
- true, // uis_error
-};
-
-static UtfIntroState s_introTransitions[][uictMax] = {
- // uict00, uictBB, uictBF, uictEF,
- // uictFE, uictFF, uictAscii, uictOther
- {uis_utfbe_b1, uis_utf8, uis_utf8, uis_utf8_bom1, uis_utf16be_bom1,
- uis_utfle_bom1, uis_utf8_imp, uis_utf8},
- {uis_utf32be_b2, uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8,
- uis_utf16be, uis_utf8},
- {uis_utf32be, uis_utf8, uis_utf8, uis_utf8, uis_utf32be_bom3, uis_utf8,
- uis_utf8, uis_utf8},
- {uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf32be, uis_utf8,
- uis_utf8},
- {uis_utf32be, uis_utf32be, uis_utf32be, uis_utf32be, uis_utf32be,
- uis_utf32be, uis_utf32be, uis_utf32be},
- {uis_utf16be, uis_utf16be, uis_utf16be, uis_utf16be, uis_utf16be,
- uis_utf16be, uis_utf16be, uis_utf16be},
- {uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf16be, uis_utf8,
- uis_utf8},
- {uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf16le_bom2, uis_utf8,
- uis_utf8, uis_utf8},
- {uis_utf32le_bom3, uis_utf16le, uis_utf16le, uis_utf16le, uis_utf16le,
- uis_utf16le, uis_utf16le, uis_utf16le},
- {uis_utf32le, uis_utf16le, uis_utf16le, uis_utf16le, uis_utf16le,
- uis_utf16le, uis_utf16le, uis_utf16le},
- {uis_utf16le, uis_utf16le, uis_utf16le, uis_utf16le, uis_utf16le,
- uis_utf16le, uis_utf16le, uis_utf16le},
- {uis_utf32le, uis_utf32le, uis_utf32le, uis_utf32le, uis_utf32le,
- uis_utf32le, uis_utf32le, uis_utf32le},
- {uis_utf16le_imp, uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8,
- uis_utf8, uis_utf8},
- {uis_utf32le_imp3, uis_utf16le, uis_utf16le, uis_utf16le, uis_utf16le,
- uis_utf16le, uis_utf16le, uis_utf16le},
- {uis_utf32le, uis_utf16le, uis_utf16le, uis_utf16le, uis_utf16le,
- uis_utf16le, uis_utf16le, uis_utf16le},
- {uis_utf8, uis_utf8_bom2, uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8,
- uis_utf8},
- {uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8,
- uis_utf8},
- {uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8,
- uis_utf8},
-};
-
-static char s_introUngetCount[][uictMax] = {
- // uict00, uictBB, uictBF, uictEF, uictFE, uictFF, uictAscii, uictOther
- {0, 1, 1, 0, 0, 0, 0, 1},
- {0, 2, 2, 2, 2, 2, 2, 2},
- {3, 3, 3, 3, 0, 3, 3, 3},
- {4, 4, 4, 4, 4, 0, 4, 4},
- {1, 1, 1, 1, 1, 1, 1, 1},
- {1, 1, 1, 1, 1, 1, 1, 1},
- {2, 2, 2, 2, 2, 0, 2, 2},
- {2, 2, 2, 2, 0, 2, 2, 2},
- {0, 1, 1, 1, 1, 1, 1, 1},
- {0, 2, 2, 2, 2, 2, 2, 2},
- {1, 1, 1, 1, 1, 1, 1, 1},
- {1, 1, 1, 1, 1, 1, 1, 1},
- {0, 2, 2, 2, 2, 2, 2, 2},
- {0, 3, 3, 3, 3, 3, 3, 3},
- {4, 4, 4, 4, 4, 4, 4, 4},
- {2, 0, 2, 2, 2, 2, 2, 2},
- {3, 3, 0, 3, 3, 3, 3, 3},
- {1, 1, 1, 1, 1, 1, 1, 1},
-};
-
-inline UtfIntroCharType IntroCharTypeOf(std::istream::int_type ch) {
- if (std::istream::traits_type::eof() == ch) {
- return uictOther;
- }
-
- switch (ch) {
- case 0:
- return uict00;
- case 0xBB:
- return uictBB;
- case 0xBF:
- return uictBF;
- case 0xEF:
- return uictEF;
- case 0xFE:
- return uictFE;
- case 0xFF:
- return uictFF;
- }
-
- if ((ch > 0) && (ch < 0xFF)) {
- return uictAscii;
- }
-
- return uictOther;
-}
-
-inline char Utf8Adjust(unsigned long ch, unsigned char lead_bits,
- unsigned char rshift) {
- const unsigned char header = ((1 << lead_bits) - 1) << (8 - lead_bits);
- const unsigned char mask = (0xFF >> (lead_bits + 1));
- return static_cast<char>(
- static_cast<unsigned char>(header | ((ch >> rshift) & mask)));
-}
-
-inline void QueueUnicodeCodepoint(std::deque<char>& q, unsigned long ch) {
- // We are not allowed to queue the Stream::eof() codepoint, so
- // replace it with CP_REPLACEMENT_CHARACTER
- if (static_cast<unsigned long>(Stream::eof()) == ch) {
- ch = CP_REPLACEMENT_CHARACTER;
- }
-
- if (ch < 0x80) {
- q.push_back(Utf8Adjust(ch, 0, 0));
- } else if (ch < 0x800) {
- q.push_back(Utf8Adjust(ch, 2, 6));
- q.push_back(Utf8Adjust(ch, 1, 0));
- } else if (ch < 0x10000) {
- q.push_back(Utf8Adjust(ch, 3, 12));
- q.push_back(Utf8Adjust(ch, 1, 6));
- q.push_back(Utf8Adjust(ch, 1, 0));
- } else {
- q.push_back(Utf8Adjust(ch, 4, 18));
- q.push_back(Utf8Adjust(ch, 1, 12));
- q.push_back(Utf8Adjust(ch, 1, 6));
- q.push_back(Utf8Adjust(ch, 1, 0));
- }
-}
-
-Stream::Stream(std::istream& input)
- : m_input(input),
- m_pPrefetched(new unsigned char[YAML_PREFETCH_SIZE]),
- m_nPrefetchedAvailable(0),
- m_nPrefetchedUsed(0) {
- typedef std::istream::traits_type char_traits;
-
- if (!input)
- return;
-
- // Determine (or guess) the character-set by reading the BOM, if any. See
- // the YAML specification for the determination algorithm.
- char_traits::int_type intro[4];
- int nIntroUsed = 0;
- UtfIntroState state = uis_start;
- for (; !s_introFinalState[state];) {
- std::istream::int_type ch = input.get();
- intro[nIntroUsed++] = ch;
- UtfIntroCharType charType = IntroCharTypeOf(ch);
- UtfIntroState newState = s_introTransitions[state][charType];
- int nUngets = s_introUngetCount[state][charType];
- if (nUngets > 0) {
- input.clear();
- for (; nUngets > 0; --nUngets) {
- if (char_traits::eof() != intro[--nIntroUsed])
- input.putback(char_traits::to_char_type(intro[nIntroUsed]));
- }
- }
- state = newState;
- }
-
- switch (state) {
- case uis_utf8:
- m_charSet = utf8;
- break;
- case uis_utf16le:
- m_charSet = utf16le;
- break;
- case uis_utf16be:
- m_charSet = utf16be;
- break;
- case uis_utf32le:
- m_charSet = utf32le;
- break;
- case uis_utf32be:
- m_charSet = utf32be;
- break;
- default:
- m_charSet = utf8;
- break;
- }
-
- ReadAheadTo(0);
-}
-
-Stream::~Stream() { delete[] m_pPrefetched; }
-
-char Stream::peek() const {
- if (m_readahead.empty()) {
- return Stream::eof();
- }
-
- return m_readahead[0];
-}
-
-Stream::operator bool() const {
- return m_input.good() ||
- (!m_readahead.empty() && m_readahead[0] != Stream::eof());
-}
-
-// get
-// . Extracts a character from the stream and updates our position
-char Stream::get() {
- char ch = peek();
- AdvanceCurrent();
- m_mark.column++;
-
- if (ch == '\n') {
- m_mark.column = 0;
- m_mark.line++;
- }
-
- return ch;
-}
-
-// get
-// . Extracts 'n' characters from the stream and updates our position
-std::string Stream::get(int n) {
- std::string ret;
- ret.reserve(n);
- for (int i = 0; i < n; i++)
- ret += get();
- return ret;
-}
-
-// eat
-// . Eats 'n' characters and updates our position.
-void Stream::eat(int n) {
- for (int i = 0; i < n; i++)
- get();
-}
-
-void Stream::AdvanceCurrent() {
- if (!m_readahead.empty()) {
- m_readahead.pop_front();
- m_mark.pos++;
- }
-
- ReadAheadTo(0);
-}
-
-bool Stream::_ReadAheadTo(size_t i) const {
- while (m_input.good() && (m_readahead.size() <= i)) {
- switch (m_charSet) {
- case utf8:
- StreamInUtf8();
- break;
- case utf16le:
- StreamInUtf16();
- break;
- case utf16be:
- StreamInUtf16();
- break;
- case utf32le:
- StreamInUtf32();
- break;
- case utf32be:
- StreamInUtf32();
- break;
- }
- }
-
- // signal end of stream
- if (!m_input.good())
- m_readahead.push_back(Stream::eof());
-
- return m_readahead.size() > i;
-}
-
-void Stream::StreamInUtf8() const {
- unsigned char b = GetNextByte();
- if (m_input.good()) {
- m_readahead.push_back(b);
- }
-}
-
-void Stream::StreamInUtf16() const {
- unsigned long ch = 0;
- unsigned char bytes[2];
- int nBigEnd = (m_charSet == utf16be) ? 0 : 1;
-
- bytes[0] = GetNextByte();
- bytes[1] = GetNextByte();
- if (!m_input.good()) {
- return;
- }
- ch = (static_cast<unsigned long>(bytes[nBigEnd]) << 8) |
- static_cast<unsigned long>(bytes[1 ^ nBigEnd]);
-
- if (ch >= 0xDC00 && ch < 0xE000) {
- // Trailing (low) surrogate...ugh, wrong order
- QueueUnicodeCodepoint(m_readahead, CP_REPLACEMENT_CHARACTER);
- return;
- } else if (ch >= 0xD800 && ch < 0xDC00) {
- // ch is a leading (high) surrogate
-
- // Four byte UTF-8 code point
-
- // Read the trailing (low) surrogate
- for (;;) {
- bytes[0] = GetNextByte();
- bytes[1] = GetNextByte();
- if (!m_input.good()) {
- QueueUnicodeCodepoint(m_readahead, CP_REPLACEMENT_CHARACTER);
- return;
- }
- unsigned long chLow = (static_cast<unsigned long>(bytes[nBigEnd]) << 8) |
- static_cast<unsigned long>(bytes[1 ^ nBigEnd]);
- if (chLow < 0xDC00 || chLow >= 0xE000) {
- // Trouble...not a low surrogate. Dump a REPLACEMENT CHARACTER into the
- // stream.
- QueueUnicodeCodepoint(m_readahead, CP_REPLACEMENT_CHARACTER);
-
- // Deal with the next UTF-16 unit
- if (chLow < 0xD800 || chLow >= 0xE000) {
- // Easiest case: queue the codepoint and return
- QueueUnicodeCodepoint(m_readahead, ch);
- return;
- } else {
- // Start the loop over with the new high surrogate
- ch = chLow;
- continue;
- }
- }
-
- // Select the payload bits from the high surrogate
- ch &= 0x3FF;
- ch <<= 10;
-
- // Include bits from low surrogate
- ch |= (chLow & 0x3FF);
-
- // Add the surrogacy offset
- ch += 0x10000;
- break;
- }
- }
-
- QueueUnicodeCodepoint(m_readahead, ch);
-}
-
-inline char* ReadBuffer(unsigned char* pBuffer) {
- return reinterpret_cast<char*>(pBuffer);
-}
-
-unsigned char Stream::GetNextByte() const {
- if (m_nPrefetchedUsed >= m_nPrefetchedAvailable) {
- std::streambuf* pBuf = m_input.rdbuf();
- m_nPrefetchedAvailable = static_cast<std::size_t>(
- pBuf->sgetn(ReadBuffer(m_pPrefetched), YAML_PREFETCH_SIZE));
- m_nPrefetchedUsed = 0;
- if (!m_nPrefetchedAvailable) {
- m_input.setstate(std::ios_base::eofbit);
- }
-
- if (0 == m_nPrefetchedAvailable) {
- return 0;
- }
- }
-
- return m_pPrefetched[m_nPrefetchedUsed++];
-}
-
-void Stream::StreamInUtf32() const {
- static int indexes[2][4] = {{3, 2, 1, 0}, {0, 1, 2, 3}};
-
- unsigned long ch = 0;
- unsigned char bytes[4];
- int* pIndexes = (m_charSet == utf32be) ? indexes[1] : indexes[0];
-
- bytes[0] = GetNextByte();
- bytes[1] = GetNextByte();
- bytes[2] = GetNextByte();
- bytes[3] = GetNextByte();
- if (!m_input.good()) {
- return;
- }
-
- for (int i = 0; i < 4; ++i) {
- ch <<= 8;
- ch |= bytes[pIndexes[i]];
- }
-
- QueueUnicodeCodepoint(m_readahead, ch);
-}
-}
+#include <iostream>
+
+#include "stream.h"
+
+#ifndef YAML_PREFETCH_SIZE
+#define YAML_PREFETCH_SIZE 2048
+#endif
+
+#define S_ARRAY_SIZE(A) (sizeof(A) / sizeof(*(A)))
+#define S_ARRAY_END(A) ((A) + S_ARRAY_SIZE(A))
+
+#define CP_REPLACEMENT_CHARACTER (0xFFFD)
+
+namespace YAML {
+enum UtfIntroState {
+ uis_start,
+ uis_utfbe_b1,
+ uis_utf32be_b2,
+ uis_utf32be_bom3,
+ uis_utf32be,
+ uis_utf16be,
+ uis_utf16be_bom1,
+ uis_utfle_bom1,
+ uis_utf16le_bom2,
+ uis_utf32le_bom3,
+ uis_utf16le,
+ uis_utf32le,
+ uis_utf8_imp,
+ uis_utf16le_imp,
+ uis_utf32le_imp3,
+ uis_utf8_bom1,
+ uis_utf8_bom2,
+ uis_utf8,
+ uis_error
+};
+
+enum UtfIntroCharType {
+ uict00,
+ uictBB,
+ uictBF,
+ uictEF,
+ uictFE,
+ uictFF,
+ uictAscii,
+ uictOther,
+ uictMax
+};
+
+static bool s_introFinalState[] = {
+ false, // uis_start
+ false, // uis_utfbe_b1
+ false, // uis_utf32be_b2
+ false, // uis_utf32be_bom3
+ true, // uis_utf32be
+ true, // uis_utf16be
+ false, // uis_utf16be_bom1
+ false, // uis_utfle_bom1
+ false, // uis_utf16le_bom2
+ false, // uis_utf32le_bom3
+ true, // uis_utf16le
+ true, // uis_utf32le
+ false, // uis_utf8_imp
+ false, // uis_utf16le_imp
+ false, // uis_utf32le_imp3
+ false, // uis_utf8_bom1
+ false, // uis_utf8_bom2
+ true, // uis_utf8
+ true, // uis_error
+};
+
+static UtfIntroState s_introTransitions[][uictMax] = {
+ // uict00, uictBB, uictBF, uictEF,
+ // uictFE, uictFF, uictAscii, uictOther
+ {uis_utfbe_b1, uis_utf8, uis_utf8, uis_utf8_bom1, uis_utf16be_bom1,
+ uis_utfle_bom1, uis_utf8_imp, uis_utf8},
+ {uis_utf32be_b2, uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8,
+ uis_utf16be, uis_utf8},
+ {uis_utf32be, uis_utf8, uis_utf8, uis_utf8, uis_utf32be_bom3, uis_utf8,
+ uis_utf8, uis_utf8},
+ {uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf32be, uis_utf8,
+ uis_utf8},
+ {uis_utf32be, uis_utf32be, uis_utf32be, uis_utf32be, uis_utf32be,
+ uis_utf32be, uis_utf32be, uis_utf32be},
+ {uis_utf16be, uis_utf16be, uis_utf16be, uis_utf16be, uis_utf16be,
+ uis_utf16be, uis_utf16be, uis_utf16be},
+ {uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf16be, uis_utf8,
+ uis_utf8},
+ {uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf16le_bom2, uis_utf8,
+ uis_utf8, uis_utf8},
+ {uis_utf32le_bom3, uis_utf16le, uis_utf16le, uis_utf16le, uis_utf16le,
+ uis_utf16le, uis_utf16le, uis_utf16le},
+ {uis_utf32le, uis_utf16le, uis_utf16le, uis_utf16le, uis_utf16le,
+ uis_utf16le, uis_utf16le, uis_utf16le},
+ {uis_utf16le, uis_utf16le, uis_utf16le, uis_utf16le, uis_utf16le,
+ uis_utf16le, uis_utf16le, uis_utf16le},
+ {uis_utf32le, uis_utf32le, uis_utf32le, uis_utf32le, uis_utf32le,
+ uis_utf32le, uis_utf32le, uis_utf32le},
+ {uis_utf16le_imp, uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8,
+ uis_utf8, uis_utf8},
+ {uis_utf32le_imp3, uis_utf16le, uis_utf16le, uis_utf16le, uis_utf16le,
+ uis_utf16le, uis_utf16le, uis_utf16le},
+ {uis_utf32le, uis_utf16le, uis_utf16le, uis_utf16le, uis_utf16le,
+ uis_utf16le, uis_utf16le, uis_utf16le},
+ {uis_utf8, uis_utf8_bom2, uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8,
+ uis_utf8},
+ {uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8,
+ uis_utf8},
+ {uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8,
+ uis_utf8},
+};
+
+static char s_introUngetCount[][uictMax] = {
+ // uict00, uictBB, uictBF, uictEF, uictFE, uictFF, uictAscii, uictOther
+ {0, 1, 1, 0, 0, 0, 0, 1},
+ {0, 2, 2, 2, 2, 2, 2, 2},
+ {3, 3, 3, 3, 0, 3, 3, 3},
+ {4, 4, 4, 4, 4, 0, 4, 4},
+ {1, 1, 1, 1, 1, 1, 1, 1},
+ {1, 1, 1, 1, 1, 1, 1, 1},
+ {2, 2, 2, 2, 2, 0, 2, 2},
+ {2, 2, 2, 2, 0, 2, 2, 2},
+ {0, 1, 1, 1, 1, 1, 1, 1},
+ {0, 2, 2, 2, 2, 2, 2, 2},
+ {1, 1, 1, 1, 1, 1, 1, 1},
+ {1, 1, 1, 1, 1, 1, 1, 1},
+ {0, 2, 2, 2, 2, 2, 2, 2},
+ {0, 3, 3, 3, 3, 3, 3, 3},
+ {4, 4, 4, 4, 4, 4, 4, 4},
+ {2, 0, 2, 2, 2, 2, 2, 2},
+ {3, 3, 0, 3, 3, 3, 3, 3},
+ {1, 1, 1, 1, 1, 1, 1, 1},
+};
+
+inline UtfIntroCharType IntroCharTypeOf(std::istream::int_type ch) {
+ if (std::istream::traits_type::eof() == ch) {
+ return uictOther;
+ }
+
+ switch (ch) {
+ case 0:
+ return uict00;
+ case 0xBB:
+ return uictBB;
+ case 0xBF:
+ return uictBF;
+ case 0xEF:
+ return uictEF;
+ case 0xFE:
+ return uictFE;
+ case 0xFF:
+ return uictFF;
+ }
+
+ if ((ch > 0) && (ch < 0xFF)) {
+ return uictAscii;
+ }
+
+ return uictOther;
+}
+
+inline char Utf8Adjust(unsigned long ch, unsigned char lead_bits,
+ unsigned char rshift) {
+ const unsigned char header = ((1 << lead_bits) - 1) << (8 - lead_bits);
+ const unsigned char mask = (0xFF >> (lead_bits + 1));
+ return static_cast<char>(
+ static_cast<unsigned char>(header | ((ch >> rshift) & mask)));
+}
+
+inline void QueueUnicodeCodepoint(std::deque<char>& q, unsigned long ch) {
+ // We are not allowed to queue the Stream::eof() codepoint, so
+ // replace it with CP_REPLACEMENT_CHARACTER
+ if (static_cast<unsigned long>(Stream::eof()) == ch) {
+ ch = CP_REPLACEMENT_CHARACTER;
+ }
+
+ if (ch < 0x80) {
+ q.push_back(Utf8Adjust(ch, 0, 0));
+ } else if (ch < 0x800) {
+ q.push_back(Utf8Adjust(ch, 2, 6));
+ q.push_back(Utf8Adjust(ch, 1, 0));
+ } else if (ch < 0x10000) {
+ q.push_back(Utf8Adjust(ch, 3, 12));
+ q.push_back(Utf8Adjust(ch, 1, 6));
+ q.push_back(Utf8Adjust(ch, 1, 0));
+ } else {
+ q.push_back(Utf8Adjust(ch, 4, 18));
+ q.push_back(Utf8Adjust(ch, 1, 12));
+ q.push_back(Utf8Adjust(ch, 1, 6));
+ q.push_back(Utf8Adjust(ch, 1, 0));
+ }
+}
+
+Stream::Stream(std::istream& input)
+ : m_input(input),
+ m_pPrefetched(new unsigned char[YAML_PREFETCH_SIZE]),
+ m_nPrefetchedAvailable(0),
+ m_nPrefetchedUsed(0) {
+ typedef std::istream::traits_type char_traits;
+
+ if (!input)
+ return;
+
+ // Determine (or guess) the character-set by reading the BOM, if any. See
+ // the YAML specification for the determination algorithm.
+ char_traits::int_type intro[4];
+ int nIntroUsed = 0;
+ UtfIntroState state = uis_start;
+ for (; !s_introFinalState[state];) {
+ std::istream::int_type ch = input.get();
+ intro[nIntroUsed++] = ch;
+ UtfIntroCharType charType = IntroCharTypeOf(ch);
+ UtfIntroState newState = s_introTransitions[state][charType];
+ int nUngets = s_introUngetCount[state][charType];
+ if (nUngets > 0) {
+ input.clear();
+ for (; nUngets > 0; --nUngets) {
+ if (char_traits::eof() != intro[--nIntroUsed])
+ input.putback(char_traits::to_char_type(intro[nIntroUsed]));
+ }
+ }
+ state = newState;
+ }
+
+ switch (state) {
+ case uis_utf8:
+ m_charSet = utf8;
+ break;
+ case uis_utf16le:
+ m_charSet = utf16le;
+ break;
+ case uis_utf16be:
+ m_charSet = utf16be;
+ break;
+ case uis_utf32le:
+ m_charSet = utf32le;
+ break;
+ case uis_utf32be:
+ m_charSet = utf32be;
+ break;
+ default:
+ m_charSet = utf8;
+ break;
+ }
+
+ ReadAheadTo(0);
+}
+
+Stream::~Stream() { delete[] m_pPrefetched; }
+
+char Stream::peek() const {
+ if (m_readahead.empty()) {
+ return Stream::eof();
+ }
+
+ return m_readahead[0];
+}
+
+Stream::operator bool() const {
+ return m_input.good() ||
+ (!m_readahead.empty() && m_readahead[0] != Stream::eof());
+}
+
+// get
+// . Extracts a character from the stream and updates our position
+char Stream::get() {
+ char ch = peek();
+ AdvanceCurrent();
+ m_mark.column++;
+
+ if (ch == '\n') {
+ m_mark.column = 0;
+ m_mark.line++;
+ }
+
+ return ch;
+}
+
+// get
+// . Extracts 'n' characters from the stream and updates our position
+std::string Stream::get(int n) {
+ std::string ret;
+ ret.reserve(n);
+ for (int i = 0; i < n; i++)
+ ret += get();
+ return ret;
+}
+
+// eat
+// . Eats 'n' characters and updates our position.
+void Stream::eat(int n) {
+ for (int i = 0; i < n; i++)
+ get();
+}
+
+void Stream::AdvanceCurrent() {
+ if (!m_readahead.empty()) {
+ m_readahead.pop_front();
+ m_mark.pos++;
+ }
+
+ ReadAheadTo(0);
+}
+
+bool Stream::_ReadAheadTo(size_t i) const {
+ while (m_input.good() && (m_readahead.size() <= i)) {
+ switch (m_charSet) {
+ case utf8:
+ StreamInUtf8();
+ break;
+ case utf16le:
+ StreamInUtf16();
+ break;
+ case utf16be:
+ StreamInUtf16();
+ break;
+ case utf32le:
+ StreamInUtf32();
+ break;
+ case utf32be:
+ StreamInUtf32();
+ break;
+ }
+ }
+
+ // signal end of stream
+ if (!m_input.good())
+ m_readahead.push_back(Stream::eof());
+
+ return m_readahead.size() > i;
+}
+
+void Stream::StreamInUtf8() const {
+ unsigned char b = GetNextByte();
+ if (m_input.good()) {
+ m_readahead.push_back(b);
+ }
+}
+
+void Stream::StreamInUtf16() const {
+ unsigned long ch = 0;
+ unsigned char bytes[2];
+ int nBigEnd = (m_charSet == utf16be) ? 0 : 1;
+
+ bytes[0] = GetNextByte();
+ bytes[1] = GetNextByte();
+ if (!m_input.good()) {
+ return;
+ }
+ ch = (static_cast<unsigned long>(bytes[nBigEnd]) << 8) |
+ static_cast<unsigned long>(bytes[1 ^ nBigEnd]);
+
+ if (ch >= 0xDC00 && ch < 0xE000) {
+ // Trailing (low) surrogate...ugh, wrong order
+ QueueUnicodeCodepoint(m_readahead, CP_REPLACEMENT_CHARACTER);
+ return;
+ } else if (ch >= 0xD800 && ch < 0xDC00) {
+ // ch is a leading (high) surrogate
+
+ // Four byte UTF-8 code point
+
+ // Read the trailing (low) surrogate
+ for (;;) {
+ bytes[0] = GetNextByte();
+ bytes[1] = GetNextByte();
+ if (!m_input.good()) {
+ QueueUnicodeCodepoint(m_readahead, CP_REPLACEMENT_CHARACTER);
+ return;
+ }
+ unsigned long chLow = (static_cast<unsigned long>(bytes[nBigEnd]) << 8) |
+ static_cast<unsigned long>(bytes[1 ^ nBigEnd]);
+ if (chLow < 0xDC00 || chLow >= 0xE000) {
+ // Trouble...not a low surrogate. Dump a REPLACEMENT CHARACTER into the
+ // stream.
+ QueueUnicodeCodepoint(m_readahead, CP_REPLACEMENT_CHARACTER);
+
+ // Deal with the next UTF-16 unit
+ if (chLow < 0xD800 || chLow >= 0xE000) {
+ // Easiest case: queue the codepoint and return
+ QueueUnicodeCodepoint(m_readahead, ch);
+ return;
+ } else {
+ // Start the loop over with the new high surrogate
+ ch = chLow;
+ continue;
+ }
+ }
+
+ // Select the payload bits from the high surrogate
+ ch &= 0x3FF;
+ ch <<= 10;
+
+ // Include bits from low surrogate
+ ch |= (chLow & 0x3FF);
+
+ // Add the surrogacy offset
+ ch += 0x10000;
+ break;
+ }
+ }
+
+ QueueUnicodeCodepoint(m_readahead, ch);
+}
+
+inline char* ReadBuffer(unsigned char* pBuffer) {
+ return reinterpret_cast<char*>(pBuffer);
+}
+
+unsigned char Stream::GetNextByte() const {
+ if (m_nPrefetchedUsed >= m_nPrefetchedAvailable) {
+ std::streambuf* pBuf = m_input.rdbuf();
+ m_nPrefetchedAvailable = static_cast<std::size_t>(
+ pBuf->sgetn(ReadBuffer(m_pPrefetched), YAML_PREFETCH_SIZE));
+ m_nPrefetchedUsed = 0;
+ if (!m_nPrefetchedAvailable) {
+ m_input.setstate(std::ios_base::eofbit);
+ }
+
+ if (0 == m_nPrefetchedAvailable) {
+ return 0;
+ }
+ }
+
+ return m_pPrefetched[m_nPrefetchedUsed++];
+}
+
+void Stream::StreamInUtf32() const {
+ static int indexes[2][4] = {{3, 2, 1, 0}, {0, 1, 2, 3}};
+
+ unsigned long ch = 0;
+ unsigned char bytes[4];
+ int* pIndexes = (m_charSet == utf32be) ? indexes[1] : indexes[0];
+
+ bytes[0] = GetNextByte();
+ bytes[1] = GetNextByte();
+ bytes[2] = GetNextByte();
+ bytes[3] = GetNextByte();
+ if (!m_input.good()) {
+ return;
+ }
+
+ for (int i = 0; i < 4; ++i) {
+ ch <<= 8;
+ ch |= bytes[pIndexes[i]];
+ }
+
+ QueueUnicodeCodepoint(m_readahead, ch);
+}
+}