// path: root/library/cpp/yson_pull/detail/cescape_encode.h
// blob: d1dae0f96e6d960b14506f0fae6efb9287add1cd
#pragma once

#include <util/system/types.h>

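// Define CESCAPE_STRICT_ASCII to restrict unescaped output to 7-bit printable ASCII (32..126).
// Every other byte is then escaped, which turns UTF-8 strings into unreadable
// escape sequences for no known benefit, so the macro is left undefined here.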
//#define CESCAPE_STRICT_ASCII

namespace NYsonPull { 
    namespace NDetail { 
        namespace NCEscape { 
            namespace NImpl { 
                // Returns the uppercase hexadecimal ASCII digit for a nibble value (0..15).
                // The value indexes the digit table directly without a range check, so
                // callers are expected to pass only the low four bits of a byte.
                inline ui8 hex_digit(ui8 value) {
                    return static_cast<ui8>("0123456789ABCDEF"[value]);
                }

                // Returns the ASCII octal digit for a value in 0..7.
                // No range check is performed: the result is simply '0' + value
                // converted back to ui8.
                inline ui8 oct_digit(ui8 value) {
                    const int code = '0' + value;
                    return static_cast<ui8>(code);
                }

                // Tells whether a byte may be emitted verbatim (subject to the separate
                // handling of '"' and '\\' done by the caller).
                // Bytes below 32 are never printable. With CESCAPE_STRICT_ASCII defined the
                // upper bound is 126; otherwise every byte from 32 upwards (including 127
                // and all bytes >= 128) counts as printable.
                inline bool is_printable(ui8 c) {
                    if (c < 32) {
                        return false;
                    }
#ifdef CESCAPE_STRICT_ASCII
                    return c <= 126;
#else
                    return true;
#endif
                }

                // Tells whether a byte is an ASCII hexadecimal digit: 0-9, A-F or a-f.
                inline bool is_hex_digit(ui8 c) {
                    if (c >= '0' && c <= '9') {
                        return true;
                    }
                    if (c >= 'A' && c <= 'F') {
                        return true;
                    }
                    return c >= 'a' && c <= 'f';
                }

                // Tells whether a byte is an ASCII octal digit ('0'..'7').
                inline bool is_oct_digit(ui8 c) {
                    const bool below_range = c < '0';
                    const bool above_range = c > '7';
                    return !(below_range || above_range);
                }

                // Size in bytes of the scratch buffer that escape_char() writes into.
                // Four bytes hold the longest sequences it produces: "\xHH" and "\OOO".
                constexpr size_t ESCAPE_C_BUFFER_SIZE = 4; 

                // Encodes a single byte `c` as a C-style escaped sequence into `r` and
                // returns the number of bytes written (1, 2 or 4). Only the first
                // `returned` bytes of `r` are assigned.
                //
                // `next` is the byte that follows `c` in the input (0 when there is none);
                // it is used to pick a numeric escape that stays unambiguous when the
                // following byte is emitted right after it.
                //
                // Rules, in order:
                //   (1) '"' and '\\' are written with a leading backslash.
                //   (2) '\r', '\n', '\t' become \r, \n, \t.
                //   (3) Printable bytes (see is_printable) are copied as-is.
                //   (4) Bytes 0..7 become a one-digit octal escape when `next` is not an
                //       octal digit.
                //   (5) Otherwise \xHH is used when `next` is not a hex digit, and a
                //       three-digit octal escape \OOO when it is.
                inline size_t escape_char(
                    ui8 c,
                    ui8 next,
                    ui8 r[ESCAPE_C_BUFFER_SIZE]) {
                    // Characters that have a dedicated two-byte escape. None of the
                    // replacement letters is zero, so zero marks "no mnemonic".
                    ui8 mnemonic = 0;
                    switch (c) {
                        case '\"':
                            mnemonic = '\"';
                            break;
                        case '\\':
                            mnemonic = '\\';
                            break;
                        case '\r':
                            mnemonic = 'r';
                            break;
                        case '\n':
                            mnemonic = 'n';
                            break;
                        case '\t':
                            mnemonic = 't';
                            break;
                        default:
                            break;
                    }
                    if (mnemonic != 0) {
                        r[0] = '\\';
                        r[1] = mnemonic;
                        return 2;
                    }

                    if (is_printable(c)) {
                        r[0] = c;
                        return 1;
                    }

                    // Everything below is a numeric escape and starts with a backslash.
                    r[0] = '\\';

                    // Short octal form is only safe if the next byte cannot be read as
                    // a continuation of the octal number.
                    if (c < 8 && !is_oct_digit(next)) {
                        r[1] = oct_digit(c);
                        return 2;
                    }

                    // A hex escape would absorb a following hex digit, so fall back to
                    // the fixed-width three-digit octal form in that case.
                    if (is_hex_digit(next)) {
                        r[1] = oct_digit(c >> 6);
                        r[2] = oct_digit((c >> 3) & 07);
                        r[3] = oct_digit(c & 07);
                        return 4;
                    }

                    r[1] = 'x';
                    r[2] = hex_digit(c >> 4);
                    r[3] = hex_digit(c & 0x0F);
                    return 4;
                }

                template <typename T> 
                inline void escape_impl(const ui8* str, size_t len, T&& consume) { 
                    ui8 buffer[ESCAPE_C_BUFFER_SIZE]; 

                    size_t i, j; 
                    for (i = 0, j = 0; i < len; ++i) { 
                        auto next_char = i + 1 < len ? str[i + 1] : 0; 
                        size_t rlen = escape_char(str[i], next_char, buffer); 

                        if (rlen > 1) { 
                            consume(str + j, i - j); 
                            j = i + 1; 
                            consume(buffer, rlen); 
                        } 
                    } 
 
                    if (j > 0) { 
                        consume(str + j, len - j); 
                    } else { 
                        consume(str, len); 
                    } 
                } 
            } 
        }     // namespace NCEscape 
    }         // namespace NDetail 
}