/*
* Copyright (c) 2007-2014, Lloyd Hilaiel <me@lloyd.io>
*
* Permission to use, copy, modify, and/or distribute this software for any
* purpose with or without fee is hereby granted, provided that the above
* copyright notice and this permission notice appear in all copies.
*
* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
*/
#include "yajl_lex.h"
#include "yajl_buf.h"
#include <stdlib.h>
#include <stdio.h>
#include <assert.h>
#include <string.h>
#ifdef YAJL_LEXER_DEBUG
/* Debug helper: translate a lexer token into a short printable label.
 * The opening and closing variants of braces and brackets share one label,
 * and any token without an entry below is reported as "unknown". */
static const char *
tokToStr(yajl_tok tok)
{
    const char * label = "unknown";

    switch (tok) {
        case yajl_tok_bool:                label = "bool";                break;
        case yajl_tok_colon:               label = "colon";               break;
        case yajl_tok_comma:               label = "comma";               break;
        case yajl_tok_eof:                 label = "eof";                 break;
        case yajl_tok_error:               label = "error";               break;
        case yajl_tok_left_brace:          label = "brace";               break;
        case yajl_tok_left_bracket:        label = "bracket";             break;
        case yajl_tok_null:                label = "null";                break;
        case yajl_tok_inf:                 label = "infinity";            break;
        case yajl_tok_minus_inf:           label = "-infinity";           break;
        case yajl_tok_integer:             label = "integer";             break;
        case yajl_tok_double:              label = "double";              break;
        case yajl_tok_right_brace:         label = "brace";               break;
        case yajl_tok_right_bracket:       label = "bracket";             break;
        case yajl_tok_string:              label = "string";              break;
        case yajl_tok_string_with_escapes: label = "string_with_escapes"; break;
    }

    return label;
}
#endif
/* Impact of the stream parsing feature on the lexer:
*
* YAJL supports stream parsing. That is, the ability to parse the first
* bits of a chunk of JSON before the last bits are available (still on
* the network or disk). This makes the lexer more complex. The
* responsibility of the lexer is to handle transparently the case where
* a chunk boundary falls in the middle of a token. This is
* accomplished via a buffer and a character reading abstraction.
*
* Overview of implementation
*
* When we lex to end of input string before end of token is hit, we
* copy all of the input text composing the token into our lexBuf.
*
* Every time we read a character, we do so through the readChar function.
* readChar's responsibility is to handle pulling all chars from the buffer
* before pulling chars from input text
*/
/* Lexer state. It is kept between calls to yajl_lex_lex so that a token
 * which is split across several input chunks can be resumed. */
struct yajl_lexer_t {
/* the overall line and char offset into the data.
 * NOTE(review): zeroed at allocation; no code visible in this file
 * updates them afterwards. */
size_t lineOff;
size_t charOff;
/* the most recent lexical error, reported by yajl_lex_get_error */
yajl_lex_error error;
/* an input buffer to handle the case where a token is spread over
 * multiple chunks */
yajl_buf buf;
/* in the case where we have data in the lexBuf, bufOff holds
 * the current offset into the lexBuf. */
size_t bufOff;
/* are we using the lex buf? */
unsigned int bufInUse;
/* shall we allow comments? */
unsigned int allowComments;
/* shall we validate utf8 inside strings? */
unsigned int validateUTF8;
/* allocation routines used to create the lexer; also used to free it */
yajl_alloc_funcs * alloc;
};
/* Fetch the next input byte and step past it.
 *
 * While the lexer is replaying buffered bytes (bufInUse is set and bufOff
 * is still inside buf) the byte is taken from buf and bufOff advances;
 * otherwise it is taken from txt and *off advances.  No bounds check is
 * made on txt: the caller must have verified *off is in range.
 *
 * Every macro argument is parenthesised so that any pointer expression
 * (e.g. &lexer or p + 1) may be passed.  Arguments are evaluated more than
 * once and must therefore be free of side effects. */
#define readChar(lxr, txt, off)                                              \
    (((lxr)->bufInUse && yajl_buf_len((lxr)->buf) &&                         \
      (lxr)->bufOff < yajl_buf_len((lxr)->buf)) ?                            \
     (*((const unsigned char *) yajl_buf_data((lxr)->buf) + ((lxr)->bufOff)++)) : \
     ((txt)[(*(off))++]))

/* Undo the most recent readChar: step *off back when it is non-zero,
 * otherwise the byte came from the lexer buffer, so step bufOff back. */
#define unreadChar(lxr, off) ((*(off) > 0) ? (*(off))-- : ((lxr)->bufOff--))
/* Allocate and initialise a lexer.
 *
 * alloc         - allocation routines; stored in the lexer and used again
 *                 by yajl_lex_free
 * allowComments - non-zero to accept comments in the input
 * validateUTF8  - non-zero to validate UTF-8 sequences inside strings
 *
 * Returns the new lexer, or NULL when the allocation fails. */
yajl_lexer
yajl_lex_alloc(yajl_alloc_funcs * alloc,
               unsigned int allowComments,
               unsigned int validateUTF8)
{
    yajl_lexer lxr = (yajl_lexer) YA_MALLOC(alloc, sizeof(struct yajl_lexer_t));

    /* never touch the memory if the allocator failed */
    if (lxr == NULL) return NULL;

    memset((void *) lxr, 0, sizeof(struct yajl_lexer_t));
    lxr->buf = yajl_buf_alloc(alloc);
    lxr->allowComments = allowComments;
    lxr->validateUTF8 = validateUTF8;
    lxr->alloc = alloc;
    return lxr;
}
/* Release a lexer together with its internal buffer.
 * Passing NULL is a no-op, mirroring free(NULL). */
void
yajl_lex_free(yajl_lexer lxr)
{
    if (lxr == NULL) return;
    yajl_buf_free(lxr->buf);
    YA_FREE(lxr->alloc, lxr);
}
/* a lookup table, indexed by byte value, which lets us quickly determine
 * several things about a character:
 * VEC - valid escaped control char (may follow a backslash in a string)
 * note. the solidus '/' may be escaped or not.
 * IJC - invalid json char (may not appear literally inside a string)
 * VHC - valid hex char (allowed in the four digits after '\u')
 * NFP - needs further processing (from a string scanning perspective)
 * NUC - needs utf8 checking when enabled (from a string scanning perspective)
 *
 * The entries for '"' (0x22) and '\\' (0x5C) also carry IJC; yajl_lex_string
 * handles both characters before it consults the IJC flag.
 */
#define VEC 0x01
#define IJC 0x02
#define VHC 0x04
#define NFP 0x08
#define NUC 0x10
static const char charLookupTable[256] =
{
/*00*/ IJC , IJC , IJC , IJC , IJC , IJC , IJC , IJC ,
/*08*/ IJC , IJC , IJC , IJC , IJC , IJC , IJC , IJC ,
/*10*/ IJC , IJC , IJC , IJC , IJC , IJC , IJC , IJC ,
/*18*/ IJC , IJC , IJC , IJC , IJC , IJC , IJC , IJC ,
/*20*/ 0 , 0 , NFP|VEC|IJC, 0 , 0 , 0 , 0 , 0 ,
/*28*/ 0 , 0 , 0 , 0 , 0 , 0 , 0 , VEC ,
/*30*/ VHC , VHC , VHC , VHC , VHC , VHC , VHC , VHC ,
/*38*/ VHC , VHC , 0 , 0 , 0 , 0 , 0 , 0 ,
/*40*/ 0 , VHC , VHC , VHC , VHC , VHC , VHC , 0 ,
/*48*/ 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ,
/*50*/ 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ,
/*58*/ 0 , 0 , 0 , 0 , NFP|VEC|IJC, 0 , 0 , 0 ,
/*60*/ 0 , VHC , VEC|VHC, VHC , VHC , VHC , VEC|VHC, 0 ,
/*68*/ 0 , 0 , 0 , 0 , 0 , 0 , VEC , 0 ,
/*70*/ 0 , 0 , VEC , 0 , VEC , 0 , 0 , 0 ,
/*78*/ 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ,
/* 0x80 - 0xFF: every entry is NUC */
NUC , NUC , NUC , NUC , NUC , NUC , NUC , NUC ,
NUC , NUC , NUC , NUC , NUC , NUC , NUC , NUC ,
NUC , NUC , NUC , NUC , NUC , NUC , NUC , NUC ,
NUC , NUC , NUC , NUC , NUC , NUC , NUC , NUC ,
NUC , NUC , NUC , NUC , NUC , NUC , NUC , NUC ,
NUC , NUC , NUC , NUC , NUC , NUC , NUC , NUC ,
NUC , NUC , NUC , NUC , NUC , NUC , NUC , NUC ,
NUC , NUC , NUC , NUC , NUC , NUC , NUC , NUC ,
NUC , NUC , NUC , NUC , NUC , NUC , NUC , NUC ,
NUC , NUC , NUC , NUC , NUC , NUC , NUC , NUC ,
NUC , NUC , NUC , NUC , NUC , NUC , NUC , NUC ,
NUC , NUC , NUC , NUC , NUC , NUC , NUC , NUC ,
NUC , NUC , NUC , NUC , NUC , NUC , NUC , NUC ,
NUC , NUC , NUC , NUC , NUC , NUC , NUC , NUC ,
NUC , NUC , NUC , NUC , NUC , NUC , NUC , NUC ,
NUC , NUC , NUC , NUC , NUC , NUC , NUC , NUC
};
/** Check the remainder of one UTF-8 encoded codepoint.
 *
 * curChar is the lead byte, which the caller has already consumed.  The
 * lead byte determines how many continuation bytes (10xxxxxx) must follow:
 * none for a byte <= 0x7f, one for 110xxxxx, two for 1110xxxx and three
 * for 11110xxx.  Any other lead byte is rejected.
 *
 * returns:
 *   yajl_tok_string - the sequence is well formed and has been consumed
 *   yajl_tok_eof    - the input ended before the sequence was complete
 *   yajl_tok_error  - a bad lead byte or a bad continuation byte was seen
 *
 * NOTE: bytes that were read are not pushed back; on error the offset is
 *       left just after the last byte consumed. */
#define UTF8_CHECK_EOF if (*offset >= jsonTextLen) { return yajl_tok_eof; }
static yajl_tok
yajl_lex_utf8_char(yajl_lexer lexer, const unsigned char * jsonText,
                   size_t jsonTextLen, size_t * offset,
                   unsigned char curChar)
{
    unsigned int pending;

    /* plain single byte character: nothing more to verify */
    if (curChar <= 0x7f) return yajl_tok_string;

    /* classify the lead byte */
    if ((curChar >> 5) == 0x6) {
        pending = 1;
    } else if ((curChar >> 4) == 0x0e) {
        pending = 2;
    } else if ((curChar >> 3) == 0x1e) {
        pending = 3;
    } else {
        return yajl_tok_error;
    }

    /* each remaining byte must be a continuation byte */
    while (pending > 0) {
        UTF8_CHECK_EOF;
        curChar = readChar(lexer, jsonText, offset);
        if ((curChar >> 6) != 0x2) return yajl_tok_error;
        pending--;
    }

    return yajl_tok_string;
}
/* lex a string. input is the lexer, pointer to beginning of
* json text, and start of string (offset).
* a token is returned which has the following meanings:
* yajl_tok_string: lex of string was successful. the terminating '"'
* has been consumed.
* yajl_tok_eof: end of text was encountered before we could complete
* the lex.
* yajl_tok_error: embedded in the string were unallowable chars.
*
* STR_CHECK_EOF is the end-of-input guard used inside yajl_lex_string: when
* no input remains it sets tok to yajl_tok_eof and jumps to the
* finish_string_lex label. It relies on the enclosing function's tok,
* offset and jsonTextLen variables and on that label.
*/
#define STR_CHECK_EOF \
if (*offset >= jsonTextLen) { \
tok = yajl_tok_eof; \
goto finish_string_lex; \
}
/** Count the leading bytes of buf that need no closer look.
 *
 * A byte is "interesting" when its lookup table entry carries IJC or NFP,
 * or NUC when utf8check is non-zero.  The return value is the number of
 * bytes before the first interesting one, or len when there is none; the
 * caller may skip that many bytes. */
static size_t
yajl_string_scan(const unsigned char * buf, size_t len, int utf8check)
{
    unsigned char interesting = IJC|NFP;
    size_t skip;

    if (utf8check) interesting |= NUC;

    for (skip = 0; skip < len; skip++) {
        if (charLookupTable[buf[skip]] & interesting) break;
    }

    return skip;
}
/* Lex the body of a string; the opening quote has already been consumed.
 *
 * Returns yajl_tok_string when the closing quote is reached without any
 * backslash escape, yajl_tok_string_with_escapes when at least one escape
 * was seen, yajl_tok_eof when the input runs out first, and yajl_tok_error
 * (with lexer->error set) on a bad escape, a bad \u hex digit, an invalid
 * character, or - when UTF-8 validation is enabled - malformed UTF-8. */
static yajl_tok
yajl_lex_string(yajl_lexer lexer, const unsigned char * jsonText,
                size_t jsonTextLen, size_t * offset)
{
    yajl_tok tok = yajl_tok_error;
    int sawEscape = 0;

    while (1) {
        unsigned char ch;

        /* fast path: hop over the run of bytes that need no inspection,
         * in whichever source readChar would draw from next */
        if (lexer->bufInUse && yajl_buf_len(lexer->buf) &&
            lexer->bufOff < yajl_buf_len(lexer->buf))
        {
            const unsigned char * start =
                (const unsigned char *) yajl_buf_data(lexer->buf) +
                lexer->bufOff;
            size_t remaining = yajl_buf_len(lexer->buf) - lexer->bufOff;

            lexer->bufOff += yajl_string_scan(start, remaining,
                                              lexer->validateUTF8);
        }
        else if (*offset < jsonTextLen)
        {
            const unsigned char * start = jsonText + *offset;
            size_t remaining = jsonTextLen - *offset;

            *offset += yajl_string_scan(start, remaining,
                                        lexer->validateUTF8);
        }

        STR_CHECK_EOF;
        ch = readChar(lexer, jsonText, offset);

        /* closing quote: the string is complete */
        if (ch == '"') {
            tok = yajl_tok_string;
            break;
        }

        /* backslash introduces an escape sequence */
        if (ch == '\\') {
            sawEscape = 1;
            STR_CHECK_EOF;
            ch = readChar(lexer, jsonText, offset);

            if (ch == 'u') {
                /* \u must be followed by exactly four hex digits */
                unsigned int digit;
                for (digit = 0; digit < 4; digit++) {
                    STR_CHECK_EOF;
                    ch = readChar(lexer, jsonText, offset);
                    if (!(charLookupTable[ch] & VHC)) {
                        /* leave the offset on the offending char */
                        unreadChar(lexer, offset);
                        lexer->error = yajl_lex_string_invalid_hex_char;
                        goto finish_string_lex;
                    }
                }
            } else if (!(charLookupTable[ch] & VEC)) {
                /* leave the offset on the offending char */
                unreadChar(lexer, offset);
                lexer->error = yajl_lex_string_invalid_escaped_char;
                goto finish_string_lex;
            }
            continue;
        }

        /* characters that may never appear literally in a string */
        if (charLookupTable[ch] & IJC) {
            /* leave the offset on the offending char */
            unreadChar(lexer, offset);
            lexer->error = yajl_lex_string_invalid_json_char;
            goto finish_string_lex;
        }

        /* optional UTF-8 validation of the character just read */
        if (lexer->validateUTF8) {
            yajl_tok utf8Result = yajl_lex_utf8_char(lexer, jsonText,
                                                     jsonTextLen, offset, ch);
            if (utf8Result == yajl_tok_eof) {
                tok = yajl_tok_eof;
                goto finish_string_lex;
            }
            if (utf8Result == yajl_tok_error) {
                lexer->error = yajl_lex_string_invalid_utf8;
                goto finish_string_lex;
            }
        }
        /* otherwise the character is accepted as is */
    }

  finish_string_lex:
    /* let the parser know whether the string still contains escape
     * sequences that it has to process */
    if (sawEscape && tok == yajl_tok_string) {
        tok = yajl_tok_string_with_escapes;
    }

    return tok;
}
#define RETURN_IF_EOF if (*offset >= jsonTextLen) return yajl_tok_eof;
/* Lex a number, or the literal "inf" / "-inf", starting at *offset.
 *
 * Returns:
 *   yajl_tok_integer   - digits only (with optional leading minus)
 *   yajl_tok_double    - a fraction and/or an exponent was present
 *   yajl_tok_inf       - "inf"
 *   yajl_tok_minus_inf - "-inf"
 *   yajl_tok_eof       - the input ended before the token was known to be
 *                        complete
 *   yajl_tok_error     - malformed number; lexer->error holds the reason
 *                        and the offending char has been pushed back
 *
 * Every read is preceded by an end-of-input check, including the reads for
 * the 'n' and 'f' of "inf", so the lexer never reads beyond jsonTextLen and
 * a chunk boundary inside "inf" is reported as yajl_tok_eof. */
static yajl_tok
yajl_lex_number(yajl_lexer lexer, const unsigned char * jsonText,
                size_t jsonTextLen, size_t * offset)
{
    /** XXX: numbers are the only entities in json that we must lex
     *       _beyond_ in order to know that they are complete.  There
     *       is an ambiguous case for integers at EOF: a number that runs
     *       up to the end of the input is reported as yajl_tok_eof. */
    unsigned char c;
    yajl_tok tok = yajl_tok_integer;
    int minus = 0;

    RETURN_IF_EOF;
    c = readChar(lexer, jsonText, offset);

    /* optional leading minus */
    if (c == '-') {
        minus = 1;
        RETURN_IF_EOF;
        c = readChar(lexer, jsonText, offset);
    }

    /* a single zero, or a series of integers */
    if (c == '0') {
        RETURN_IF_EOF;
        c = readChar(lexer, jsonText, offset);
    } else if (c >= '1' && c <= '9') {
        do {
            RETURN_IF_EOF;
            c = readChar(lexer, jsonText, offset);
        } while (c >= '0' && c <= '9');
    } else if (c == 'i') {
        /* "inf": check for end of input before each read so that we
         * neither read out of bounds nor mis-lex a split token */
        RETURN_IF_EOF;
        c = readChar(lexer, jsonText, offset);
        if (c != 'n') {
            unreadChar(lexer, offset);
            lexer->error = yajl_lex_invalid_infinity;
            return yajl_tok_error;
        }
        RETURN_IF_EOF;
        c = readChar(lexer, jsonText, offset);
        if (c != 'f') {
            unreadChar(lexer, offset);
            lexer->error = yajl_lex_invalid_infinity;
            return yajl_tok_error;
        }
        /* nothing was read beyond the literal, so no push back here */
        return minus ? yajl_tok_minus_inf : yajl_tok_inf;
    } else {
        unreadChar(lexer, offset);
        lexer->error = yajl_lex_missing_integer_after_minus;
        return yajl_tok_error;
    }

    /* optional fraction (indicates this is floating point) */
    if (c == '.') {
        int numRd = 0;

        RETURN_IF_EOF;
        c = readChar(lexer, jsonText, offset);

        while (c >= '0' && c <= '9') {
            numRd++;
            RETURN_IF_EOF;
            c = readChar(lexer, jsonText, offset);
        }

        if (!numRd) {
            unreadChar(lexer, offset);
            lexer->error = yajl_lex_missing_integer_after_decimal;
            return yajl_tok_error;
        }
        tok = yajl_tok_double;
    }

    /* optional exponent (indicates this is floating point) */
    if (c == 'e' || c == 'E') {
        RETURN_IF_EOF;
        c = readChar(lexer, jsonText, offset);

        /* optional sign */
        if (c == '+' || c == '-') {
            RETURN_IF_EOF;
            c = readChar(lexer, jsonText, offset);
        }

        if (c >= '0' && c <= '9') {
            do {
                RETURN_IF_EOF;
                c = readChar(lexer, jsonText, offset);
            } while (c >= '0' && c <= '9');
        } else {
            unreadChar(lexer, offset);
            lexer->error = yajl_lex_missing_integer_after_exponent;
            return yajl_tok_error;
        }
        tok = yajl_tok_double;
    }

    /* we always go "one too far" */
    unreadChar(lexer, offset);

    return tok;
}
/* Lex a comment whose leading '/' has already been consumed.
 *
 * A second '/' starts a line comment that runs through the next '\n'; a
 * '*' starts a block comment that runs through the closing star-slash.
 *
 * Returns yajl_tok_comment once the whole comment has been consumed,
 * yajl_tok_eof when the input ends first, and yajl_tok_error (with
 * lexer->error set to yajl_lex_invalid_char) when the character after the
 * leading '/' is neither '/' nor '*'. */
static yajl_tok
yajl_lex_comment(yajl_lexer lexer, const unsigned char * jsonText,
                 size_t jsonTextLen, size_t * offset)
{
    unsigned char ch;

    RETURN_IF_EOF;
    ch = readChar(lexer, jsonText, offset);

    /* anything but a slash or a star is not a comment opener */
    if (ch != '/' && ch != '*') {
        lexer->error = yajl_lex_invalid_char;
        return yajl_tok_error;
    }

    if (ch == '/') {
        /* line comment: discard everything up to and including '\n' */
        for (;;) {
            RETURN_IF_EOF;
            ch = readChar(lexer, jsonText, offset);
            if (ch == '\n') break;
        }
        return yajl_tok_comment;
    }

    /* block comment: discard everything up to and including the
     * closing star-slash */
    for (;;) {
        RETURN_IF_EOF;
        ch = readChar(lexer, jsonText, offset);
        if (ch != '*') continue;

        RETURN_IF_EOF;
        ch = readChar(lexer, jsonText, offset);
        if (ch == '/') break;

        /* not the terminator; look at this char again, it may itself
         * be the star of the real terminator */
        unreadChar(lexer, offset);
    }

    return yajl_tok_comment;
}
/* Match the remaining characters of a keyword (want_value) against the
 * input, one character per iteration, and always leave via "goto lexed":
 *  - input exhausted first: tok = yajl_tok_eof
 *  - a character differs:   that character is pushed back, lexer->error is
 *                           set to yajl_lex_invalid_string and
 *                           tok = yajl_tok_error
 *  - everything matched:    tok = target_token
 *
 * The expansion declares a local variable named "want", so each use must be
 * wrapped in its own braces. It also relies on the enclosing function's
 * tok, c, lexer, jsonText, jsonTextLen and offset variables and on its
 * "lexed" label. */
#define MATCH(want_value, target_token) \
const char * want = want_value; \
do { \
if (*offset >= jsonTextLen) { \
tok = yajl_tok_eof; \
goto lexed; \
} \
c = readChar(lexer, jsonText, offset); \
if (c != *want) { \
unreadChar(lexer, offset); \
lexer->error = yajl_lex_invalid_string; \
tok = yajl_tok_error; \
goto lexed; \
} \
} while (*(++want)); \
tok = target_token; \
goto lexed;
/* Lex the next token from jsonText, starting at *offset.
 *
 * lexer       - lexer state; carries the partial token across chunks
 * jsonText    - the current chunk of input
 * jsonTextLen - number of bytes in jsonText
 * offset      - in/out: position in jsonText; advanced past what was read
 * outBuf      - out: start of the token text (NULL when none is reported)
 * outLen      - out: length of the token text (0 when none is reported)
 *
 * Returns the token type. When the chunk ends before the token is complete
 * yajl_tok_eof is returned and the bytes consumed so far are saved in the
 * lexer's buffer, so that the next call replays them before reading from
 * the new chunk. For string tokens the reported text excludes the
 * surrounding quotes. On yajl_tok_error, lexer->error holds the reason. */
yajl_tok
yajl_lex_lex(yajl_lexer lexer, const unsigned char * jsonText,
size_t jsonTextLen, size_t * offset,
const unsigned char ** outBuf, size_t * outLen)
{
yajl_tok tok = yajl_tok_error;
unsigned char c;
/* where the current token starts within jsonText */
size_t startOffset = *offset;
*outBuf = NULL;
*outLen = 0;
for (;;) {
assert(*offset <= jsonTextLen);
if (*offset >= jsonTextLen) {
tok = yajl_tok_eof;
goto lexed;
}
c = readChar(lexer, jsonText, offset);
switch (c) {
/* note the naming used here: '{' and '}' yield the *_bracket tokens,
 * '[' and ']' yield the *_brace tokens */
case '{':
tok = yajl_tok_left_bracket;
goto lexed;
case '}':
tok = yajl_tok_right_bracket;
goto lexed;
case '[':
tok = yajl_tok_left_brace;
goto lexed;
case ']':
tok = yajl_tok_right_brace;
goto lexed;
case ',':
tok = yajl_tok_comma;
goto lexed;
case ':':
tok = yajl_tok_colon;
goto lexed;
/* whitespace is skipped and kept out of the token text */
case '\t': case '\n': case '\v': case '\f': case '\r': case ' ':
startOffset++;
break;
/* keywords: the first letter is consumed, MATCH checks the rest; the
 * braces are required because MATCH declares a local variable */
case 't': {
MATCH("rue", yajl_tok_bool);
}
case 'f': {
MATCH("alse", yajl_tok_bool);
}
case 'n': {
MATCH("ull", yajl_tok_null);
}
case '"': {
tok = yajl_lex_string(lexer, (const unsigned char *) jsonText,
jsonTextLen, offset);
goto lexed;
}
/* numbers, plus 'i' which starts the "inf" literal */
case '-':
case 'i':
case '0': case '1': case '2': case '3': case '4':
case '5': case '6': case '7': case '8': case '9': {
/* integer parsing wants to start from the beginning */
unreadChar(lexer, offset);
tok = yajl_lex_number(lexer, (const unsigned char *) jsonText,
jsonTextLen, offset);
goto lexed;
}
case '/':
/* hey, look, a probable comment! If comments are disabled
* it's an error. */
if (!lexer->allowComments) {
unreadChar(lexer, offset);
lexer->error = yajl_lex_unallowed_comment;
tok = yajl_tok_error;
goto lexed;
}
/* if comments are enabled, then we should try to lex
* the thing. possible outcomes are
* - successful lex (tok_comment, which means continue),
* - malformed comment opening (slash not followed by
* '*' or '/') (tok_error)
* - eof hit. (tok_eof) */
tok = yajl_lex_comment(lexer, (const unsigned char *) jsonText,
jsonTextLen, offset);
if (tok == yajl_tok_comment) {
/* "error" is silly, but that's the initial
* state of tok. guilty until proven innocent. */
tok = yajl_tok_error;
/* the comment is discarded: drop anything buffered for it and
 * start the next token after it */
yajl_buf_clear(lexer->buf);
lexer->bufInUse = 0;
startOffset = *offset;
break;
}
/* hit error or eof, bail */
goto lexed;
default:
lexer->error = yajl_lex_invalid_char;
tok = yajl_tok_error;
goto lexed;
}
}
lexed:
/* need to append to buffer if the buffer is in use or
* if it's an EOF token */
if (tok == yajl_tok_eof || lexer->bufInUse) {
if (!lexer->bufInUse) yajl_buf_clear(lexer->buf);
lexer->bufInUse = 1;
yajl_buf_append(lexer->buf, jsonText + startOffset, *offset - startOffset);
/* rewind so that the next read replays the buffer from its start */
lexer->bufOff = 0;
if (tok != yajl_tok_eof) {
/* the token is complete: report it from the buffer and stop
 * replaying buffered bytes */
*outBuf = yajl_buf_data(lexer->buf);
*outLen = yajl_buf_len(lexer->buf);
lexer->bufInUse = 0;
}
} else if (tok != yajl_tok_error) {
/* the token lies entirely within this chunk: report it in place */
*outBuf = jsonText + startOffset;
*outLen = *offset - startOffset;
}
/* special case for strings. skip the quotes. */
if (tok == yajl_tok_string || tok == yajl_tok_string_with_escapes)
{
assert(*outLen >= 2);
(*outBuf)++;
*outLen -= 2;
}
#ifdef YAJL_LEXER_DEBUG
if (tok == yajl_tok_error) {
printf("lexical error: %s\n",
yajl_lex_error_to_string(yajl_lex_get_error(lexer)));
} else if (tok == yajl_tok_eof) {
printf("EOF hit\n");
} else {
printf("lexed %s: '", tokToStr(tok));
fwrite(*outBuf, 1, *outLen, stdout);
printf("'\n");
}
#endif
return tok;
}
/* Return a static, human readable description of a lexer error code.
 * Codes without an entry below yield "unknown error code". */
const char *
yajl_lex_error_to_string(yajl_lex_error error)
{
    const char * msg = "unknown error code";

    switch (error) {
        case yajl_lex_e_ok:
            msg = "ok, no error";
            break;
        case yajl_lex_string_invalid_utf8:
            msg = "invalid bytes in UTF8 string.";
            break;
        case yajl_lex_string_invalid_escaped_char:
            msg = "inside a string, '\\' occurs before a character "
                  "which it may not.";
            break;
        case yajl_lex_string_invalid_json_char:
            msg = "invalid character inside string.";
            break;
        case yajl_lex_string_invalid_hex_char:
            msg = "invalid (non-hex) character occurs after '\\u' inside "
                  "string.";
            break;
        case yajl_lex_invalid_char:
            msg = "invalid char in json text.";
            break;
        case yajl_lex_invalid_string:
            msg = "invalid string in json text.";
            break;
        case yajl_lex_missing_integer_after_exponent:
            msg = "malformed number, a digit is required after the exponent.";
            break;
        case yajl_lex_missing_integer_after_decimal:
            msg = "malformed number, a digit is required after the "
                  "decimal point.";
            break;
        case yajl_lex_missing_integer_after_minus:
            msg = "malformed number, a digit is required after the "
                  "minus sign.";
            break;
        case yajl_lex_invalid_infinity:
            msg = "malformed number, a token inf required for number starting "
                  "from 'i'";
            break;
        case yajl_lex_unallowed_comment:
            msg = "probable comment found in input text, comments are "
                  "not enabled.";
            break;
    }

    return msg;
}
/** Report the error recorded by the lexer; this gives the detail behind a
 *  yajl_tok_error result from yajl_lex_lex.  A NULL lexer yields
 *  (yajl_lex_error) -1. */
yajl_lex_error
yajl_lex_get_error(yajl_lexer lexer)
{
    return (lexer != NULL) ? lexer->error : (yajl_lex_error) -1;
}
/* Return the lexer's line offset counter (lineOff).
 * A NULL lexer yields 0, in keeping with yajl_lex_get_error which also
 * tolerates NULL. */
size_t yajl_lex_current_line(yajl_lexer lexer)
{
    if (lexer == NULL) return 0;
    return lexer->lineOff;
}
/* Return the lexer's character offset counter (charOff).
 * A NULL lexer yields 0, in keeping with yajl_lex_get_error which also
 * tolerates NULL. */
size_t yajl_lex_current_char(yajl_lexer lexer)
{
    if (lexer == NULL) return 0;
    return lexer->charOff;
}
/* Report the type of the next token without consuming it.
 *
 * offset is taken by value, so the caller's position is untouched.  The
 * lexer's buffer length, bufOff and bufInUse are recorded before the lex
 * and put back afterwards; the token text itself is discarded. */
yajl_tok yajl_lex_peek(yajl_lexer lexer, const unsigned char * jsonText,
                       size_t jsonTextLen, size_t offset)
{
    /* snapshot of the buffer state that yajl_lex_lex may modify */
    size_t savedLen = yajl_buf_len(lexer->buf);
    size_t savedOff = lexer->bufOff;
    unsigned int savedInUse = lexer->bufInUse;

    /* token text is not wanted, only the token type */
    const unsigned char * ignoredText;
    size_t ignoredLen;

    yajl_tok next = yajl_lex_lex(lexer, jsonText, jsonTextLen, &offset,
                                 &ignoredText, &ignoredLen);

    /* put the buffer state back the way it was */
    lexer->bufOff = savedOff;
    lexer->bufInUse = savedInUse;
    yajl_buf_truncate(lexer->buf, savedLen);

    return next;
}
/* Return the capacity of the lexer's internal buffer, as reported by
 * yajl_buf_capacity.  A NULL lexer yields 0, in keeping with
 * yajl_lex_get_error which also tolerates NULL. */
size_t yajl_lex_buf_capacity(yajl_lexer lexer)
{
    if (lexer == NULL) return 0;
    return yajl_buf_capacity(lexer->buf);
}