Report errors from the lexical and stream levels in the parser

Previously, only the syntax level (parse_*) could set the error
string. This patch lets the lexical (lex_*) and stream (stream_*)
levels report detailed error messages as well.

Also, stream_get() now returns EOF instead of 0 on error.
This commit is contained in:
Petri Lehtinen 2009-07-13 23:34:31 +03:00
parent 1f65a3cae7
commit df307f1bc3
2 changed files with 93 additions and 57 deletions

View File

@ -32,6 +32,7 @@ typedef struct {
get_func get; get_func get;
eof_func eof; eof_func eof;
void *data; void *data;
int stream_pos;
char buffer[5]; char buffer[5];
int buffer_pos; int buffer_pos;
} stream_t; } stream_t;
@ -52,14 +53,25 @@ typedef struct {
/*** error reporting ***/ /*** error reporting ***/
static void error_init(json_error_t *error)
{
if(error)
{
error->text[0] = '\0';
error->line = -1;
}
}
static void error_set(json_error_t *error, const lex_t *lex, static void error_set(json_error_t *error, const lex_t *lex,
const char *msg, ...) const char *msg, ...)
{ {
va_list ap; va_list ap;
char text[JSON_ERROR_TEXT_LENGTH]; char text[JSON_ERROR_TEXT_LENGTH];
if(!error) if(!error || error->text[0] != '\0') {
/* error already set */
return; return;
}
va_start(ap, msg); va_start(ap, msg);
vsnprintf(text, JSON_ERROR_TEXT_LENGTH, msg, ap); vsnprintf(text, JSON_ERROR_TEXT_LENGTH, msg, ap);
@ -95,16 +107,17 @@ void stream_init(stream_t *stream, get_func get, eof_func eof, void *data)
stream->get = get; stream->get = get;
stream->eof = eof; stream->eof = eof;
stream->data = data; stream->data = data;
stream->stream_pos = 0;
stream->buffer[0] = '\0'; stream->buffer[0] = '\0';
stream->buffer_pos = 0; stream->buffer_pos = 0;
} }
static char stream_get(stream_t *stream) static char stream_get(stream_t *stream, json_error_t *error)
{ {
char c;
if(!stream->buffer[stream->buffer_pos]) if(!stream->buffer[stream->buffer_pos])
{ {
char c;
stream->buffer[0] = stream->get(stream->data); stream->buffer[0] = stream->get(stream->data);
stream->buffer_pos = 0; stream->buffer_pos = 0;
@ -120,7 +133,7 @@ static char stream_get(stream_t *stream)
count = utf8_check_first(c); count = utf8_check_first(c);
if(!count) if(!count)
return 0; goto out;
assert(count >= 2); assert(count >= 2);
@ -128,15 +141,23 @@ static char stream_get(stream_t *stream)
stream->buffer[i] = stream->get(stream->data); stream->buffer[i] = stream->get(stream->data);
if(!utf8_check_full(stream->buffer, count)) if(!utf8_check_full(stream->buffer, count))
return 0; goto out;
stream->stream_pos += count;
stream->buffer[count] = '\0'; stream->buffer[count] = '\0';
} }
else else {
stream->buffer[1] = '\0'; stream->buffer[1] = '\0';
stream->stream_pos++;
}
} }
return (char)stream->buffer[stream->buffer_pos++]; return (char)stream->buffer[stream->buffer_pos++];
out:
error_set(error, NULL, "unable to decode byte 0x%x at position %d",
(unsigned char)c, stream->stream_pos);
return EOF;
} }
static void stream_unget(stream_t *stream, char c) static void stream_unget(stream_t *stream, char c)
@ -147,9 +168,9 @@ static void stream_unget(stream_t *stream, char c)
} }
static int lex_get(lex_t *lex) static int lex_get(lex_t *lex, json_error_t *error)
{ {
return stream_get(&lex->stream); return stream_get(&lex->stream, error);
} }
static int lex_eof(lex_t *lex) static int lex_eof(lex_t *lex)
@ -162,9 +183,9 @@ static void lex_save(lex_t *lex, char c)
strbuffer_append_byte(&lex->saved_text, c); strbuffer_append_byte(&lex->saved_text, c);
} }
static int lex_get_save(lex_t *lex) static int lex_get_save(lex_t *lex, json_error_t *error)
{ {
char c = stream_get(&lex->stream); char c = stream_get(&lex->stream, error);
lex_save(lex, c); lex_save(lex, c);
return c; return c;
} }
@ -172,12 +193,13 @@ static int lex_get_save(lex_t *lex)
static void lex_unget_unsave(lex_t *lex, char c) static void lex_unget_unsave(lex_t *lex, char c)
{ {
char d; char d;
stream_unget(&lex->stream, c); if(c != EOF)
stream_unget(&lex->stream, c);
d = strbuffer_pop(&lex->saved_text); d = strbuffer_pop(&lex->saved_text);
assert(c == d); assert(c == d);
} }
static void lex_scan_string(lex_t *lex) static void lex_scan_string(lex_t *lex, json_error_t *error)
{ {
char c; char c;
const char *p; const char *p;
@ -186,40 +208,49 @@ static void lex_scan_string(lex_t *lex)
lex->token = TOKEN_INVALID; lex->token = TOKEN_INVALID;
/* skip the " */ /* skip the " */
c = lex_get_save(lex); c = lex_get_save(lex, error);
while(c != '"') { while(c != '"') {
if(c == EOF && lex_eof(lex)) if(c == EOF) {
if(lex_eof(lex))
error_set(error, lex, "premature end of input");
goto out; goto out;
}
else if(0 <= c && c <= 0x1F) { else if(0 <= c && c <= 0x1F) {
/* control character */ /* control character */
lex_unget_unsave(lex, c); lex_unget_unsave(lex, c);
if(c == '\n')
error_set(error, lex, "unexpected newline", c);
else
error_set(error, lex, "control character 0x%x", c);
goto out; goto out;
} }
else if(c == '\\') { else if(c == '\\') {
c = lex_get_save(lex); c = lex_get_save(lex, error);
if(c == 'u') { if(c == 'u') {
c = lex_get_save(lex); c = lex_get_save(lex, error);
for(int i = 0; i < 4; i++) { for(int i = 0; i < 4; i++) {
if(!isxdigit(c)) { if(!isxdigit(c)) {
lex_unget_unsave(lex, c); lex_unget_unsave(lex, c);
error_set(error, lex, "invalid escape");
goto out; goto out;
} }
c = lex_get_save(lex); c = lex_get_save(lex, error);
} }
} }
else if(c == '"' || c == '\\' || c == '/' || c == 'b' || else if(c == '"' || c == '\\' || c == '/' || c == 'b' ||
c == 'f' || c == 'n' || c == 'r' || c == 't') c == 'f' || c == 'n' || c == 'r' || c == 't')
c = lex_get_save(lex); c = lex_get_save(lex, error);
else { else {
lex_unget_unsave(lex, c); lex_unget_unsave(lex, c);
error_set(error, lex, "invalid escape");
goto out; goto out;
} }
} }
else else
c = lex_get_save(lex); c = lex_get_save(lex, error);
} }
/* the actual value is at most of the same length as the source /* the actual value is at most of the same length as the source
@ -245,7 +276,8 @@ static void lex_scan_string(lex_t *lex)
if(*p == '\\') { if(*p == '\\') {
p++; p++;
if(*p == 'u') { if(*p == 'u') {
/* TODO: \uXXXX not supported yet */ /* TODO */
error_set(error, lex, "\\u escapes are not yet supported");
free(lex->value.string); free(lex->value.string);
lex->value.string = NULL; lex->value.string = NULL;
goto out; goto out;
@ -275,7 +307,7 @@ out:
return; return;
} }
static void lex_scan_number(lex_t *lex, char c) static void lex_scan_number(lex_t *lex, char c, json_error_t *error)
{ {
const char *saved_text; const char *saved_text;
char *end; char *end;
@ -283,19 +315,19 @@ static void lex_scan_number(lex_t *lex, char c)
lex->token = TOKEN_INVALID; lex->token = TOKEN_INVALID;
if(c == '-') if(c == '-')
c = lex_get_save(lex); c = lex_get_save(lex, error);
if(c == '0') { if(c == '0') {
c = lex_get_save(lex); c = lex_get_save(lex, error);
if(isdigit(c)) { if(isdigit(c)) {
lex_unget_unsave(lex, c); lex_unget_unsave(lex, c);
goto out; goto out;
} }
} }
else /* c != '0' */ { else /* c != '0' */ {
c = lex_get_save(lex); c = lex_get_save(lex, error);
while(isdigit(c)) while(isdigit(c))
c = lex_get_save(lex); c = lex_get_save(lex, error);
} }
if(c != '.' && c != 'E' && c != 'e') { if(c != '.' && c != 'E' && c != 'e') {
@ -310,29 +342,29 @@ static void lex_scan_number(lex_t *lex, char c)
} }
if(c == '.') { if(c == '.') {
c = lex_get(lex); c = lex_get(lex, error);
if(!isdigit(c)) if(!isdigit(c))
goto out; goto out;
lex_save(lex, c); lex_save(lex, c);
c = lex_get_save(lex); c = lex_get_save(lex, error);
while(isdigit(c)) while(isdigit(c))
c = lex_get_save(lex); c = lex_get_save(lex, error);
} }
if(c == 'E' || c == 'e') { if(c == 'E' || c == 'e') {
c = lex_get_save(lex); c = lex_get_save(lex, error);
if(c == '+' || c == '-') if(c == '+' || c == '-')
c = lex_get_save(lex); c = lex_get_save(lex, error);
if(!isdigit(c)) { if(!isdigit(c)) {
lex_unget_unsave(lex, c); lex_unget_unsave(lex, c);
goto out; goto out;
} }
c = lex_get_save(lex); c = lex_get_save(lex, error);
while(isdigit(c)) while(isdigit(c))
c = lex_get_save(lex); c = lex_get_save(lex, error);
} }
lex_unget_unsave(lex, c); lex_unget_unsave(lex, c);
@ -346,7 +378,7 @@ out:
return; return;
} }
static int lex_scan(lex_t *lex) static int lex_scan(lex_t *lex, json_error_t *error)
{ {
char c; char c;
@ -357,17 +389,20 @@ static int lex_scan(lex_t *lex)
lex->value.string = NULL; lex->value.string = NULL;
} }
c = lex_get(lex); c = lex_get(lex, error);
while(c == ' ' || c == '\t' || c == '\n' || c == '\r') while(c == ' ' || c == '\t' || c == '\n' || c == '\r')
{ {
if(c == '\n') if(c == '\n')
lex->line++; lex->line++;
c = lex_get(lex); c = lex_get(lex, error);
} }
if(c == EOF && lex_eof(lex)) { if(c == EOF) {
lex->token = TOKEN_EOF; if(lex_eof(lex))
lex->token = TOKEN_EOF;
else
lex->token = TOKEN_INVALID;
goto out; goto out;
} }
@ -377,18 +412,18 @@ static int lex_scan(lex_t *lex)
lex->token = c; lex->token = c;
else if(c == '"') else if(c == '"')
lex_scan_string(lex); lex_scan_string(lex, error);
else if(isdigit(c) || c == '-') else if(isdigit(c) || c == '-')
lex_scan_number(lex, c); lex_scan_number(lex, c, error);
else if(isupper(c) || islower(c)) { else if(isupper(c) || islower(c)) {
/* eat up the whole identifier for clearer error messages */ /* eat up the whole identifier for clearer error messages */
const char *saved_text; const char *saved_text;
c = lex_get_save(lex); c = lex_get_save(lex, error);
while(isupper(c) || islower(c)) while(isupper(c) || islower(c))
c = lex_get_save(lex); c = lex_get_save(lex, error);
lex_unget_unsave(lex, c); lex_unget_unsave(lex, c);
saved_text = strbuffer_value(&lex->saved_text); saved_text = strbuffer_value(&lex->saved_text);
@ -439,7 +474,7 @@ static json_t *parse_object(lex_t *lex, json_error_t *error)
if(!object) if(!object)
return NULL; return NULL;
lex_scan(lex); lex_scan(lex, error);
if(lex->token == '}') if(lex->token == '}')
return object; return object;
@ -456,14 +491,14 @@ static json_t *parse_object(lex_t *lex, json_error_t *error)
if(!key) if(!key)
return NULL; return NULL;
lex_scan(lex); lex_scan(lex, error);
if(lex->token != ':') { if(lex->token != ':') {
free(key); free(key);
error_set(error, lex, "':' expected"); error_set(error, lex, "':' expected");
goto error; goto error;
} }
lex_scan(lex); lex_scan(lex, error);
value = parse_value(lex, error); value = parse_value(lex, error);
if(!value) { if(!value) {
free(key); free(key);
@ -479,11 +514,11 @@ static json_t *parse_object(lex_t *lex, json_error_t *error)
json_decref(value); json_decref(value);
free(key); free(key);
lex_scan(lex); lex_scan(lex, error);
if(lex->token != ',') if(lex->token != ',')
break; break;
lex_scan(lex); lex_scan(lex, error);
} }
if(lex->token != '}') { if(lex->token != '}') {
@ -504,7 +539,7 @@ static json_t *parse_array(lex_t *lex, json_error_t *error)
if(!array) if(!array)
return NULL; return NULL;
lex_scan(lex); lex_scan(lex, error);
if(lex->token == ']') if(lex->token == ']')
return array; return array;
@ -519,11 +554,11 @@ static json_t *parse_array(lex_t *lex, json_error_t *error)
} }
json_decref(elem); json_decref(elem);
lex_scan(lex); lex_scan(lex, error);
if(lex->token != ',') if(lex->token != ',')
break; break;
lex_scan(lex); lex_scan(lex, error);
} }
if(lex->token != ']') { if(lex->token != ']') {
@ -595,8 +630,9 @@ static json_t *parse_value(lex_t *lex, json_error_t *error)
json_t *parse_json(lex_t *lex, json_error_t *error) json_t *parse_json(lex_t *lex, json_error_t *error)
{ {
lex_scan(lex); error_init(error);
lex_scan(lex, error);
if(lex->token != '[' && lex->token != '{') { if(lex->token != '[' && lex->token != '{') {
error_set(error, lex, "'[' or '{' expected"); error_set(error, lex, "'[' or '{' expected");
return NULL; return NULL;
@ -664,7 +700,7 @@ json_t *json_loads(const char *string, json_error_t *error)
if(!result) if(!result)
goto out; goto out;
lex_scan(&lex); lex_scan(&lex, error);
if(lex.token != TOKEN_EOF) { if(lex.token != TOKEN_EOF) {
error_set(error, &lex, "end of file expected"); error_set(error, &lex, "end of file expected");
json_decref(result); json_decref(result);

10
test/testdata/invalid vendored
View File

@ -30,7 +30,7 @@ unexpected token near ','
["a ["a
==== ====
1 1
invalid token near '"a' unexpected newline near '"a'
======== ========
["a" ["a"
==== ====
@ -50,12 +50,12 @@ string or '}' expected near ','
{" {"
==== ====
1 1
string or '}' expected near '"' unexpected newline near '"'
======== ========
{"a {"a
==== ====
1 1
string or '}' expected near '"a' unexpected newline near '"a'
======== ========
{"a" {"a"
==== ====
@ -75,7 +75,7 @@ unexpected token near end of file
{"a":"a {"a":"a
==== ====
1 1
invalid token near '"a' unexpected newline near '"a'
======== ========
{[ {[
==== ====
@ -135,4 +135,4 @@ invalid token near '-0'
[" <-- tab character"] [" <-- tab character"]
==== ====
1 1
invalid token near '"' control character 0x9 near '"'