mirror of git://gcc.gnu.org/git/gcc.git
2390 lines
59 KiB
C++
2390 lines
59 KiB
C++
/* JSON parsing
|
||
Copyright (C) 2017-2025 Free Software Foundation, Inc.
|
||
Contributed by David Malcolm <dmalcolm@redhat.com>.
|
||
|
||
This file is part of GCC.
|
||
|
||
GCC is free software; you can redistribute it and/or modify it under
|
||
the terms of the GNU General Public License as published by the Free
|
||
Software Foundation; either version 3, or (at your option) any later
|
||
version.
|
||
|
||
GCC is distributed in the hope that it will be useful, but WITHOUT ANY
|
||
WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||
FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
||
for more details.
|
||
|
||
You should have received a copy of the GNU General Public License
|
||
along with GCC; see the file COPYING3. If not see
|
||
<http://www.gnu.org/licenses/>. */
|
||
|
||
#include "config.h"
|
||
#include "system.h"
|
||
#include "coretypes.h"
|
||
#include "json-parsing.h"
|
||
#include "pretty-print.h"
|
||
#include "math.h"
|
||
#include "selftest.h"
|
||
|
||
using namespace json;
|
||
|
||
/* Declarations relating to parsing JSON, all within an
|
||
anonymous namespace. */
|
||
|
||
namespace {
|
||
|
||
/* A single Unicode code point, as stored in the lexer's decoded
   input buffer.  */

using unichar = unsigned;
|
||
|
||
/* The kinds of token produced by the JSON lexer.
   Enumerator values are used as indices into token_id_name, so the
   order here must match the order of that array.  */

enum token_id
{
  /* Lexing failed; the token carries a message string.  */
  TOK_ERROR = 0,

  /* End of the input buffer.  */
  TOK_EOF,

  /* Punctuation: '[', '{', ']', '}', ':' and ','.  */
  TOK_OPEN_SQUARE,
  TOK_OPEN_CURLY,
  TOK_CLOSE_SQUARE,
  TOK_CLOSE_CURLY,
  TOK_COLON,
  TOK_COMMA,

  /* The literal names "true", "false" and "null".  */
  TOK_TRUE,
  TOK_FALSE,
  TOK_NULL,

  /* Tokens that carry a value (see the union in struct token).  */
  TOK_STRING,
  TOK_FLOAT_NUMBER,
  TOK_INTEGER_NUMBER
};
|
||
|
||
/* Descriptions of each enum token_id value for use in error messages,
   indexed by the enumerator's value.  Both number kinds share the
   description "number".  */

static const char *token_id_name[] = {
  "error",    /* TOK_ERROR */
  "EOF",      /* TOK_EOF */
  "'['",      /* TOK_OPEN_SQUARE */
  "'{'",      /* TOK_OPEN_CURLY */
  "']'",      /* TOK_CLOSE_SQUARE */
  "'}'",      /* TOK_CLOSE_CURLY */
  "':'",      /* TOK_COLON */
  "','",      /* TOK_COMMA */
  "'true'",   /* TOK_TRUE */
  "'false'",  /* TOK_FALSE */
  "'null'",   /* TOK_NULL */
  "string",   /* TOK_STRING */
  "number",   /* TOK_FLOAT_NUMBER */
  "number"    /* TOK_INTEGER_NUMBER */
};
|
||
|
||
/* Tokens within the JSON lexer. */
|
||
|
||
struct token
|
||
{
|
||
/* The kind of token. */
|
||
enum token_id id;
|
||
|
||
/* The location of this token within the unicode
|
||
character stream. */
|
||
location_map::range range;
|
||
|
||
union
|
||
{
|
||
/* Value for TOK_ERROR and TOK_STRING. */
|
||
char *string;
|
||
|
||
/* Value for TOK_FLOAT_NUMBER. */
|
||
double float_number;
|
||
|
||
/* Value for TOK_INTEGER_NUMBER. */
|
||
long integer_number;
|
||
} u;
|
||
};
|
||
|
||
/* A class for lexing JSON. */
|
||
|
||
class lexer
|
||
{
|
||
public:
|
||
lexer (bool support_comments);
|
||
~lexer ();
|
||
|
||
std::unique_ptr<error> add_utf8 (size_t length, const char *utf8_buf);
|
||
|
||
const token *peek ();
|
||
|
||
void consume ();
|
||
|
||
private:
|
||
bool get_char (unichar &out_char, location_map::point *out_point);
|
||
void unget_char ();
|
||
location_map::point get_next_point () const;
|
||
static void dump_token (FILE *outf, const token *tok);
|
||
void lex_token (token *out);
|
||
void lex_string (token *out);
|
||
void lex_number (token *out, unichar first_char);
|
||
bool rest_of_literal (token *out, const char *suffix);
|
||
std::unique_ptr<error> make_error (const char *msg);
|
||
bool consume_single_line_comment (token *out);
|
||
bool consume_multiline_comment (token *out);
|
||
|
||
private:
|
||
auto_vec<unichar> m_buffer;
|
||
int m_next_char_idx;
|
||
int m_next_char_line;
|
||
int m_next_char_column;
|
||
int m_prev_line_final_column; /* for handling unget_char after a '\n'. */
|
||
|
||
static const int MAX_TOKENS = 1;
|
||
token m_next_tokens[MAX_TOKENS];
|
||
int m_num_next_tokens;
|
||
|
||
bool m_support_comments;
|
||
};
|
||
|
||
/* A class for parsing JSON. */
|
||
|
||
class parser
|
||
{
|
||
public:
|
||
parser (location_map *out_loc_map,
|
||
bool support_comments);
|
||
~parser ();
|
||
|
||
std::unique_ptr<error>
|
||
add_utf8 (size_t length, const char *utf8_buf);
|
||
|
||
parser_result_t parse_value (int depth);
|
||
parser_result_t parse_object (int depth);
|
||
parser_result_t parse_array (int depth);
|
||
|
||
std::unique_ptr<error>
|
||
require_eof ();
|
||
|
||
private:
|
||
location_map::point get_next_token_start ();
|
||
location_map::point get_next_token_end ();
|
||
|
||
std::unique_ptr<error>
|
||
require (enum token_id tok_id);
|
||
|
||
result<enum token_id, std::unique_ptr<error>>
|
||
require_one_of (enum token_id tok_id_a, enum token_id tok_id_b);
|
||
|
||
std::unique_ptr<error>
|
||
error_at (const location_map::range &r,
|
||
const char *fmt, ...) ATTRIBUTE_PRINTF_3;
|
||
|
||
void maybe_record_range (json::value *jv, const location_map::range &r);
|
||
void maybe_record_range (json::value *jv,
|
||
const location_map::point &start,
|
||
const location_map::point &end);
|
||
|
||
private:
|
||
lexer m_lexer;
|
||
location_map *m_loc_map;
|
||
};
|
||
|
||
} // anonymous namespace for parsing implementation
|
||
|
||
/* Parser implementation. */
|
||
|
||
/* lexer's ctor. */
|
||
|
||
lexer::lexer (bool support_comments)
|
||
: m_buffer (), m_next_char_idx (0),
|
||
m_next_char_line (1), m_next_char_column (0),
|
||
m_prev_line_final_column (-1),
|
||
m_num_next_tokens (0),
|
||
m_support_comments (support_comments)
|
||
{
|
||
}
|
||
|
||
/* lexer's dtor. */
|
||
|
||
lexer::~lexer ()
|
||
{
|
||
while (m_num_next_tokens > 0)
|
||
consume ();
|
||
}
|
||
|
||
/* Peek the next token. */
|
||
|
||
const token *
|
||
lexer::peek ()
|
||
{
|
||
if (m_num_next_tokens == 0)
|
||
{
|
||
lex_token (&m_next_tokens[0]);
|
||
m_num_next_tokens++;
|
||
}
|
||
return &m_next_tokens[0];
|
||
}
|
||
|
||
/* Consume the next token. */
|
||
|
||
void
|
||
lexer::consume ()
|
||
{
|
||
if (m_num_next_tokens == 0)
|
||
peek ();
|
||
|
||
gcc_assert (m_num_next_tokens > 0);
|
||
gcc_assert (m_num_next_tokens <= MAX_TOKENS);
|
||
|
||
if (0)
|
||
{
|
||
fprintf (stderr, "consuming token: ");
|
||
dump_token (stderr, &m_next_tokens[0]);
|
||
fprintf (stderr, "\n");
|
||
}
|
||
|
||
if (m_next_tokens[0].id == TOK_ERROR
|
||
|| m_next_tokens[0].id == TOK_STRING)
|
||
free (m_next_tokens[0].u.string);
|
||
|
||
m_num_next_tokens--;
|
||
memmove (&m_next_tokens[0], &m_next_tokens[1],
|
||
sizeof (token) * m_num_next_tokens);
|
||
}
|
||
|
||
/* Add LENGTH bytes of UTF-8 encoded text from UTF8_BUF to this lexer's
|
||
buffer.
|
||
Return null if successful, or the error if there was a problem. */
|
||
|
||
std::unique_ptr<error>
|
||
lexer::add_utf8 (size_t length, const char *utf8_buf)
|
||
{
|
||
/* Adapted from charset.c:one_utf8_to_cppchar. */
|
||
static const uchar masks[6] = { 0x7F, 0x1F, 0x0F, 0x07, 0x03, 0x01 };
|
||
static const uchar patns[6] = { 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
|
||
|
||
const uchar *inbuf = (const unsigned char *) (utf8_buf);
|
||
const uchar **inbufp = &inbuf;
|
||
size_t *inbytesleftp = &length;
|
||
|
||
while (length > 0)
|
||
{
|
||
unichar c;
|
||
const uchar *inbuf = *inbufp;
|
||
size_t nbytes, i;
|
||
|
||
c = *inbuf;
|
||
if (c < 0x80)
|
||
{
|
||
m_buffer.safe_push (c);
|
||
*inbytesleftp -= 1;
|
||
*inbufp += 1;
|
||
continue;
|
||
}
|
||
|
||
/* The number of leading 1-bits in the first byte indicates how many
|
||
bytes follow. */
|
||
for (nbytes = 2; nbytes < 7; nbytes++)
|
||
if ((c & ~masks[nbytes-1]) == patns[nbytes-1])
|
||
goto found;
|
||
return make_error ("ill-formed UTF-8 sequence");
|
||
found:
|
||
|
||
if (*inbytesleftp < nbytes)
|
||
return make_error ("ill-formed UTF-8 sequence");
|
||
|
||
c = (c & masks[nbytes-1]);
|
||
inbuf++;
|
||
for (i = 1; i < nbytes; i++)
|
||
{
|
||
unichar n = *inbuf++;
|
||
if ((n & 0xC0) != 0x80)
|
||
return make_error ("ill-formed UTF-8 sequence");
|
||
c = ((c << 6) + (n & 0x3F));
|
||
}
|
||
|
||
/* Make sure the shortest possible encoding was used. */
|
||
if (( c <= 0x7F && nbytes > 1)
|
||
|| (c <= 0x7FF && nbytes > 2)
|
||
|| (c <= 0xFFFF && nbytes > 3)
|
||
|| (c <= 0x1FFFFF && nbytes > 4)
|
||
|| (c <= 0x3FFFFFF && nbytes > 5))
|
||
return make_error ("ill-formed UTF-8:"
|
||
" shortest possible encoding not used");
|
||
|
||
/* Make sure the character is valid. */
|
||
if (c > 0x7FFFFFFF || (c >= 0xD800 && c <= 0xDFFF))
|
||
return make_error ("ill-formed UTF-8: invalid character");
|
||
|
||
m_buffer.safe_push (c);
|
||
*inbufp = inbuf;
|
||
*inbytesleftp -= nbytes;
|
||
}
|
||
return nullptr;
|
||
}
|
||
|
||
/* Attempt to get the next unicode character from this lexer's buffer.
|
||
If successful, write it to OUT_CHAR, and its location to *OUT_POINT,
|
||
and return true.
|
||
Otherwise, return false. */
|
||
|
||
bool
|
||
lexer::get_char (unichar &out_char, location_map::point *out_point)
|
||
{
|
||
if (m_next_char_idx >= (int)m_buffer.length ())
|
||
return false;
|
||
|
||
if (out_point)
|
||
*out_point = get_next_point ();
|
||
out_char = m_buffer[m_next_char_idx++];
|
||
|
||
if (out_char == '\n')
|
||
{
|
||
m_next_char_line++;
|
||
m_prev_line_final_column = m_next_char_column;
|
||
m_next_char_column = 0;
|
||
}
|
||
else
|
||
m_next_char_column++;
|
||
|
||
return true;
|
||
}
|
||
|
||
/* Undo the last successful get_char. */
|
||
|
||
void
|
||
lexer::unget_char ()
|
||
{
|
||
--m_next_char_idx;
|
||
if (m_next_char_column > 0)
|
||
--m_next_char_column;
|
||
else
|
||
{
|
||
m_next_char_line--;
|
||
m_next_char_column = m_prev_line_final_column;
|
||
/* We don't support more than one unget_char in a row. */
|
||
gcc_assert (m_prev_line_final_column != -1);
|
||
m_prev_line_final_column = -1;
|
||
}
|
||
}
|
||
|
||
/* Get the location of the next char. */
|
||
|
||
location_map::point
|
||
lexer::get_next_point () const
|
||
{
|
||
location_map::point result;
|
||
result.m_unichar_idx = m_next_char_idx;
|
||
result.m_line = m_next_char_line;
|
||
result.m_column = m_next_char_column;
|
||
return result;
|
||
}
|
||
|
||
/* Print a textual representation of TOK to OUTF.
|
||
This is intended for debugging the lexer and parser,
|
||
rather than for user-facing output. */
|
||
|
||
void
|
||
lexer::dump_token (FILE *outf, const token *tok)
|
||
{
|
||
switch (tok->id)
|
||
{
|
||
case TOK_ERROR:
|
||
fprintf (outf, "TOK_ERROR (\"%s\")", tok->u.string);
|
||
break;
|
||
|
||
case TOK_EOF:
|
||
fprintf (outf, "TOK_EOF");
|
||
break;
|
||
|
||
case TOK_OPEN_SQUARE:
|
||
fprintf (outf, "TOK_OPEN_SQUARE");
|
||
break;
|
||
|
||
case TOK_OPEN_CURLY:
|
||
fprintf (outf, "TOK_OPEN_CURLY");
|
||
break;
|
||
|
||
case TOK_CLOSE_SQUARE:
|
||
fprintf (outf, "TOK_CLOSE_SQUARE");
|
||
break;
|
||
|
||
case TOK_CLOSE_CURLY:
|
||
fprintf (outf, "TOK_CLOSE_CURLY");
|
||
break;
|
||
|
||
case TOK_COLON:
|
||
fprintf (outf, "TOK_COLON");
|
||
break;
|
||
|
||
case TOK_COMMA:
|
||
fprintf (outf, "TOK_COMMA");
|
||
break;
|
||
|
||
case TOK_TRUE:
|
||
fprintf (outf, "TOK_TRUE");
|
||
break;
|
||
|
||
case TOK_FALSE:
|
||
fprintf (outf, "TOK_FALSE");
|
||
break;
|
||
|
||
case TOK_NULL:
|
||
fprintf (outf, "TOK_NULL");
|
||
break;
|
||
|
||
case TOK_STRING:
|
||
fprintf (outf, "TOK_STRING (\"%s\")", tok->u.string);
|
||
break;
|
||
|
||
case TOK_FLOAT_NUMBER:
|
||
fprintf (outf, "TOK_FLOAT_NUMBER (%f)", tok->u.float_number);
|
||
break;
|
||
|
||
case TOK_INTEGER_NUMBER:
|
||
fprintf (outf, "TOK_INTEGER_NUMBER (%ld)", tok->u.integer_number);
|
||
break;
|
||
|
||
default:
|
||
gcc_unreachable ();
|
||
break;
|
||
}
|
||
}
|
||
|
||
/* Treat "//" as a comment to the end of the line.
|
||
|
||
This isn't compliant with the JSON spec,
|
||
but is very handy for writing DejaGnu tests.
|
||
|
||
Return true if EOF and populate *OUT, false otherwise. */
|
||
|
||
bool
|
||
lexer::consume_single_line_comment (token *out)
|
||
{
|
||
while (1)
|
||
{
|
||
unichar next_char;
|
||
if (!get_char (next_char, nullptr))
|
||
{
|
||
out->id = TOK_EOF;
|
||
location_map::point p = get_next_point ();
|
||
out->range.m_start = p;
|
||
out->range.m_end = p;
|
||
return true;
|
||
}
|
||
if (next_char == '\n')
|
||
return false;
|
||
}
|
||
}
|
||
|
||
/* Treat '/' '*' as a multiline comment until the next closing '*' '/'.
|
||
|
||
This isn't compliant with the JSON spec,
|
||
but is very handy for writing DejaGnu tests.
|
||
|
||
Return true if EOF and populate *OUT, false otherwise. */
|
||
|
||
bool
|
||
lexer::consume_multiline_comment (token *out)
|
||
{
|
||
while (1)
|
||
{
|
||
unichar next_char;
|
||
if (!get_char (next_char, nullptr))
|
||
{
|
||
out->id = TOK_ERROR;
|
||
gcc_unreachable (); // TODO
|
||
location_map::point p = get_next_point ();
|
||
out->range.m_start = p;
|
||
out->range.m_end = p;
|
||
return true;
|
||
}
|
||
if (next_char != '*')
|
||
continue;
|
||
if (!get_char (next_char, nullptr))
|
||
{
|
||
out->id = TOK_ERROR;
|
||
gcc_unreachable (); // TODO
|
||
location_map::point p = get_next_point ();
|
||
out->range.m_start = p;
|
||
out->range.m_end = p;
|
||
return true;
|
||
}
|
||
if (next_char == '/')
|
||
return false;
|
||
}
|
||
}
|
||
|
||
/* Attempt to lex the input buffer, writing the next token to OUT.
|
||
On errors, TOK_ERROR (or TOK_EOF) is written to OUT. */
|
||
|
||
void
|
||
lexer::lex_token (token *out)
|
||
{
|
||
/* Skip to next non-whitespace char. */
|
||
unichar next_char;
|
||
location_map::point start_point;
|
||
while (1)
|
||
{
|
||
if (!get_char (next_char, &start_point))
|
||
{
|
||
out->id = TOK_EOF;
|
||
location_map::point p = get_next_point ();
|
||
out->range.m_start = p;
|
||
out->range.m_end = p;
|
||
return;
|
||
}
|
||
if (m_support_comments)
|
||
if (next_char == '/')
|
||
{
|
||
location_map::point point;
|
||
unichar next_next_char;
|
||
if (get_char (next_next_char, &point))
|
||
{
|
||
switch (next_next_char)
|
||
{
|
||
case '/':
|
||
if (consume_single_line_comment (out))
|
||
return;
|
||
continue;
|
||
case '*':
|
||
if (consume_multiline_comment (out))
|
||
return;
|
||
continue;
|
||
default:
|
||
/* A stray single '/'. Break out of loop, so that we
|
||
handle it below as an unexpected character. */
|
||
goto non_whitespace;
|
||
}
|
||
}
|
||
}
|
||
if (next_char != ' '
|
||
&& next_char != '\t'
|
||
&& next_char != '\n'
|
||
&& next_char != '\r')
|
||
break;
|
||
}
|
||
|
||
non_whitespace:
|
||
|
||
out->range.m_start = start_point;
|
||
out->range.m_end = start_point;
|
||
|
||
switch (next_char)
|
||
{
|
||
case '[':
|
||
out->id = TOK_OPEN_SQUARE;
|
||
break;
|
||
|
||
case '{':
|
||
out->id = TOK_OPEN_CURLY;
|
||
break;
|
||
|
||
case ']':
|
||
out->id = TOK_CLOSE_SQUARE;
|
||
break;
|
||
|
||
case '}':
|
||
out->id = TOK_CLOSE_CURLY;
|
||
break;
|
||
|
||
case ':':
|
||
out->id = TOK_COLON;
|
||
break;
|
||
|
||
case ',':
|
||
out->id = TOK_COMMA;
|
||
break;
|
||
|
||
case '"':
|
||
lex_string (out);
|
||
break;
|
||
|
||
case '-':
|
||
case '0':
|
||
case '1':
|
||
case '2':
|
||
case '3':
|
||
case '4':
|
||
case '5':
|
||
case '6':
|
||
case '7':
|
||
case '8':
|
||
case '9':
|
||
lex_number (out, next_char);
|
||
break;
|
||
|
||
case 't':
|
||
/* Handle literal "true". */
|
||
if (rest_of_literal (out, "rue"))
|
||
{
|
||
out->id = TOK_TRUE;
|
||
break;
|
||
}
|
||
else
|
||
goto err;
|
||
|
||
case 'f':
|
||
/* Handle literal "false". */
|
||
if (rest_of_literal (out, "alse"))
|
||
{
|
||
out->id = TOK_FALSE;
|
||
break;
|
||
}
|
||
else
|
||
goto err;
|
||
|
||
case 'n':
|
||
/* Handle literal "null". */
|
||
if (rest_of_literal (out, "ull"))
|
||
{
|
||
out->id = TOK_NULL;
|
||
break;
|
||
}
|
||
else
|
||
goto err;
|
||
|
||
err:
|
||
default:
|
||
out->id = TOK_ERROR;
|
||
out->u.string = xasprintf ("unexpected character: '%c'", next_char);
|
||
break;
|
||
}
|
||
}
|
||
|
||
/* Having consumed an open-quote character from the lexer's buffer, attempt
|
||
to lex the rest of a JSON string, writing the result to OUT (or TOK_ERROR)
|
||
if an error occurred.
|
||
(ECMA-404 section 9; RFC 7159 section 7). */
|
||
|
||
void
|
||
lexer::lex_string (token *out)
|
||
{
|
||
auto_vec<unichar> content;
|
||
bool still_going = true;
|
||
while (still_going)
|
||
{
|
||
unichar uc;
|
||
if (!get_char (uc, &out->range.m_end))
|
||
{
|
||
out->id = TOK_ERROR;
|
||
out->range.m_end = get_next_point ();
|
||
out->u.string = xstrdup ("EOF within string");
|
||
return;
|
||
}
|
||
switch (uc)
|
||
{
|
||
case '"':
|
||
still_going = false;
|
||
break;
|
||
case '\\':
|
||
{
|
||
unichar next_char;
|
||
if (!get_char (next_char, &out->range.m_end))
|
||
{
|
||
out->id = TOK_ERROR;
|
||
out->range.m_end = get_next_point ();
|
||
out->u.string = xstrdup ("EOF within string");;
|
||
return;
|
||
}
|
||
switch (next_char)
|
||
{
|
||
case '"':
|
||
case '\\':
|
||
case '/':
|
||
content.safe_push (next_char);
|
||
break;
|
||
|
||
case 'b':
|
||
content.safe_push ('\b');
|
||
break;
|
||
|
||
case 'f':
|
||
content.safe_push ('\f');
|
||
break;
|
||
|
||
case 'n':
|
||
content.safe_push ('\n');
|
||
break;
|
||
|
||
case 'r':
|
||
content.safe_push ('\r');
|
||
break;
|
||
|
||
case 't':
|
||
content.safe_push ('\t');
|
||
break;
|
||
|
||
case 'u':
|
||
{
|
||
unichar result = 0;
|
||
for (int i = 0; i < 4; i++)
|
||
{
|
||
unichar hexdigit;
|
||
if (!get_char (hexdigit, &out->range.m_end))
|
||
{
|
||
out->id = TOK_ERROR;
|
||
out->range.m_end = get_next_point ();
|
||
out->u.string = xstrdup ("EOF within string");
|
||
return;
|
||
}
|
||
result <<= 4;
|
||
if (hexdigit >= '0' && hexdigit <= '9')
|
||
result += hexdigit - '0';
|
||
else if (hexdigit >= 'a' && hexdigit <= 'f')
|
||
result += (hexdigit - 'a') + 10;
|
||
else if (hexdigit >= 'A' && hexdigit <= 'F')
|
||
result += (hexdigit - 'A') + 10;
|
||
else
|
||
{
|
||
out->id = TOK_ERROR;
|
||
out->range.m_start = out->range.m_end;
|
||
out->u.string = xstrdup ("bogus hex char");
|
||
return;
|
||
}
|
||
}
|
||
content.safe_push (result);
|
||
}
|
||
break;
|
||
|
||
default:
|
||
out->id = TOK_ERROR;
|
||
out->u.string = xstrdup ("unrecognized escape char");
|
||
return;
|
||
}
|
||
}
|
||
break;
|
||
|
||
default:
|
||
/* Reject unescaped control characters U+0000 through U+001F
|
||
(ECMA-404 section 9 para 1; RFC 7159 section 7 para 1). */
|
||
if (uc <= 0x1f)
|
||
{
|
||
out->id = TOK_ERROR;
|
||
out->range.m_start = out->range.m_end;
|
||
out->u.string = xstrdup ("unescaped control char");
|
||
return;
|
||
}
|
||
|
||
/* Otherwise, add regular unicode code point. */
|
||
content.safe_push (uc);
|
||
break;
|
||
}
|
||
}
|
||
|
||
out->id = TOK_STRING;
|
||
|
||
auto_vec<char> utf8_buf;
|
||
// Adapted from libcpp/charset.c:one_cppchar_to_utf8
|
||
for (unsigned i = 0; i < content.length (); i++)
|
||
{
|
||
static const uchar masks[6] = { 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
|
||
static const uchar limits[6] = { 0x80, 0xE0, 0xF0, 0xF8, 0xFC, 0xFE };
|
||
size_t nbytes;
|
||
uchar buf[6], *p = &buf[6];
|
||
unichar c = content[i];
|
||
|
||
nbytes = 1;
|
||
if (c < 0x80)
|
||
*--p = c;
|
||
else
|
||
{
|
||
do
|
||
{
|
||
*--p = ((c & 0x3F) | 0x80);
|
||
c >>= 6;
|
||
nbytes++;
|
||
}
|
||
while (c >= 0x3F || (c & limits[nbytes-1]));
|
||
*--p = (c | masks[nbytes-1]);
|
||
}
|
||
|
||
while (p < &buf[6])
|
||
utf8_buf.safe_push (*p++);
|
||
}
|
||
|
||
out->u.string = XNEWVEC (char, utf8_buf.length () + 1);
|
||
for (unsigned i = 0; i < utf8_buf.length (); i++)
|
||
out->u.string[i] = utf8_buf[i];
|
||
out->u.string[utf8_buf.length ()] = '\0';
|
||
}
|
||
|
||
/* Having consumed FIRST_CHAR, an initial digit or '-' character from
|
||
the lexer's buffer attempt to lex the rest of a JSON number, writing
|
||
the result to OUT (or TOK_ERROR) if an error occurred.
|
||
(ECMA-404 section 8; RFC 7159 section 6). */
|
||
|
||
void
|
||
lexer::lex_number (token *out, unichar first_char)
|
||
{
|
||
bool negate = false;
|
||
double value = 0.0;
|
||
if (first_char == '-')
|
||
{
|
||
negate = true;
|
||
if (!get_char (first_char, &out->range.m_end))
|
||
{
|
||
out->id = TOK_ERROR;
|
||
out->range.m_start = out->range.m_end;
|
||
out->u.string = xstrdup ("expected digit");
|
||
return;
|
||
}
|
||
}
|
||
|
||
if (first_char == '0')
|
||
value = 0.0;
|
||
else if (!ISDIGIT (first_char))
|
||
{
|
||
out->id = TOK_ERROR;
|
||
out->range.m_start = out->range.m_end;
|
||
out->u.string = xstrdup ("expected digit");
|
||
return;
|
||
}
|
||
else
|
||
{
|
||
/* Got a nonzero digit; expect zero or more digits. */
|
||
value = first_char - '0';
|
||
while (1)
|
||
{
|
||
unichar uc;
|
||
location_map::point point;
|
||
if (!get_char (uc, &point))
|
||
break;
|
||
if (ISDIGIT (uc))
|
||
{
|
||
value *= 10;
|
||
value += uc -'0';
|
||
out->range.m_end = point;
|
||
continue;
|
||
}
|
||
else
|
||
{
|
||
unget_char ();
|
||
break;
|
||
}
|
||
}
|
||
}
|
||
|
||
/* Optional '.', followed by one or more decimals. */
|
||
unichar next_char;
|
||
location_map::point point;
|
||
if (get_char (next_char, &point))
|
||
{
|
||
if (next_char == '.')
|
||
{
|
||
/* Parse decimal digits. */
|
||
bool had_digit = false;
|
||
double digit_factor = 0.1;
|
||
while (get_char (next_char, &point))
|
||
{
|
||
if (!ISDIGIT (next_char))
|
||
{
|
||
unget_char ();
|
||
break;
|
||
}
|
||
value += (next_char - '0') * digit_factor;
|
||
digit_factor *= 0.1;
|
||
had_digit = true;
|
||
out->range.m_end = point;
|
||
}
|
||
if (!had_digit)
|
||
{
|
||
out->id = TOK_ERROR;
|
||
out->range.m_start = point;
|
||
out->range.m_start = point;
|
||
out->u.string = xstrdup ("expected digit");
|
||
return;
|
||
}
|
||
}
|
||
else
|
||
unget_char ();
|
||
}
|
||
|
||
/* Parse 'e' and 'E'. */
|
||
unichar exponent_char;
|
||
if (get_char (exponent_char, &point))
|
||
{
|
||
if (exponent_char == 'e' || exponent_char == 'E')
|
||
{
|
||
/* Optional +/-. */
|
||
unichar sign_char;
|
||
int exponent = 0;
|
||
bool negate_exponent = false;
|
||
bool had_exponent_digit = false;
|
||
if (!get_char (sign_char, &point))
|
||
{
|
||
out->id = TOK_ERROR;
|
||
out->range.m_start = point;
|
||
out->range.m_start = point;
|
||
out->u.string = xstrdup ("EOF within exponent");
|
||
return;
|
||
}
|
||
if (sign_char == '-')
|
||
negate_exponent = true;
|
||
else if (sign_char == '+')
|
||
;
|
||
else if (ISDIGIT (sign_char))
|
||
{
|
||
exponent = sign_char - '0';
|
||
had_exponent_digit = true;
|
||
}
|
||
else
|
||
{
|
||
out->id = TOK_ERROR;
|
||
out->range.m_start = point;
|
||
out->range.m_start = point;
|
||
out->u.string
|
||
= xstrdup ("expected '-','+' or digit within exponent");
|
||
return;
|
||
}
|
||
out->range.m_end = point;
|
||
|
||
/* One or more digits (we might have seen the digit above,
|
||
though). */
|
||
while (1)
|
||
{
|
||
unichar uc;
|
||
location_map::point point;
|
||
if (!get_char (uc, &point))
|
||
break;
|
||
if (ISDIGIT (uc))
|
||
{
|
||
exponent *= 10;
|
||
exponent += uc -'0';
|
||
had_exponent_digit = true;
|
||
out->range.m_end = point;
|
||
continue;
|
||
}
|
||
else
|
||
{
|
||
unget_char ();
|
||
break;
|
||
}
|
||
}
|
||
if (!had_exponent_digit)
|
||
{
|
||
out->id = TOK_ERROR;
|
||
out->range.m_start = point;
|
||
out->range.m_start = point;
|
||
out->u.string = xstrdup ("expected digit within exponent");
|
||
return;
|
||
}
|
||
if (negate_exponent)
|
||
exponent = -exponent;
|
||
value = value * pow (10, exponent);
|
||
}
|
||
else
|
||
unget_char ();
|
||
}
|
||
|
||
if (negate)
|
||
value = -value;
|
||
|
||
if (value == (long)value)
|
||
{
|
||
out->id = TOK_INTEGER_NUMBER;
|
||
out->u.integer_number = value;
|
||
}
|
||
else
|
||
{
|
||
out->id = TOK_FLOAT_NUMBER;
|
||
out->u.float_number = value;
|
||
}
|
||
}
|
||
|
||
/* Determine if the next characters to be lexed match SUFFIX.
|
||
SUFFIX must be pure ASCII and not contain newlines.
|
||
If so, consume the characters and return true.
|
||
Otherwise, return false. */
|
||
|
||
bool
|
||
lexer::rest_of_literal (token *out, const char *suffix)
|
||
{
|
||
int suffix_idx = 0;
|
||
int buf_idx = m_next_char_idx;
|
||
while (1)
|
||
{
|
||
if (suffix[suffix_idx] == '\0')
|
||
{
|
||
m_next_char_idx += suffix_idx;
|
||
m_next_char_column += suffix_idx;
|
||
out->range.m_end.m_unichar_idx += suffix_idx;
|
||
out->range.m_end.m_column += suffix_idx;
|
||
return true;
|
||
}
|
||
if (buf_idx >= (int)m_buffer.length ())
|
||
return false;
|
||
/* This assumes that suffix is ASCII. */
|
||
if (m_buffer[buf_idx] != (unichar)suffix[suffix_idx])
|
||
return false;
|
||
buf_idx++;
|
||
suffix_idx++;
|
||
}
|
||
}
|
||
|
||
/* Create a new error instance for MSG, using the location of the next
|
||
character for the location of the error. */
|
||
|
||
std::unique_ptr<error>
|
||
lexer::make_error (const char *msg)
|
||
{
|
||
location_map::point p;
|
||
p.m_unichar_idx = m_next_char_idx;
|
||
p.m_line = m_next_char_line;
|
||
p.m_column = m_next_char_column;
|
||
location_map::range r;
|
||
r.m_start = p;
|
||
r.m_end = p;
|
||
return std::make_unique<error> (r, xstrdup (msg));
|
||
}
|
||
|
||
/* parser's ctor. */
|
||
|
||
parser::parser (location_map *out_loc_map,
|
||
bool support_comments)
|
||
: m_lexer (support_comments), m_loc_map (out_loc_map)
|
||
{
|
||
}
|
||
|
||
/* parser's dtor. */
|
||
|
||
parser::~parser ()
|
||
{
|
||
if (m_loc_map)
|
||
m_loc_map->on_finished_parsing ();
|
||
}
|
||
|
||
/* Add LENGTH bytes of UTF-8 encoded text from UTF8_BUF to this parser's
|
||
lexer's buffer. */
|
||
|
||
std::unique_ptr<error>
|
||
parser::add_utf8 (size_t length, const char *utf8_buf)
|
||
{
|
||
return m_lexer.add_utf8 (length, utf8_buf);
|
||
}
|
||
|
||
/* Parse a JSON value (object, array, number, string, or literal).
|
||
(ECMA-404 section 5; RFC 7159 section 3). */
|
||
|
||
parser_result_t
|
||
parser::parse_value (int depth)
|
||
{
|
||
const token *tok = m_lexer.peek ();
|
||
|
||
/* Avoid stack overflow with deeply-nested inputs; RFC 7159 section 9
|
||
states: "An implementation may set limits on the maximum depth
|
||
of nesting.".
|
||
|
||
Ideally we'd avoid this limit (e.g. by rewriting parse_value,
|
||
parse_object, and parse_array into a single function with a vec of
|
||
state). */
|
||
const int MAX_DEPTH = 100;
|
||
if (depth >= MAX_DEPTH)
|
||
return error_at (tok->range, "maximum nesting depth exceeded: %i",
|
||
MAX_DEPTH);
|
||
|
||
switch (tok->id)
|
||
{
|
||
case TOK_OPEN_CURLY:
|
||
return parse_object (depth);
|
||
|
||
case TOK_STRING:
|
||
{
|
||
auto val = std::make_unique<string> (tok->u.string);
|
||
m_lexer.consume ();
|
||
maybe_record_range (val.get (), tok->range);
|
||
return parser_result_t (std::move (val));
|
||
}
|
||
|
||
case TOK_OPEN_SQUARE:
|
||
return parse_array (depth);
|
||
|
||
case TOK_FLOAT_NUMBER:
|
||
{
|
||
auto val = std::make_unique<float_number> (tok->u.float_number);
|
||
m_lexer.consume ();
|
||
maybe_record_range (val.get (), tok->range);
|
||
return parser_result_t (std::move (val));
|
||
}
|
||
|
||
case TOK_INTEGER_NUMBER:
|
||
{
|
||
auto val = std::make_unique<integer_number> (tok->u.integer_number);
|
||
m_lexer.consume ();
|
||
maybe_record_range (val.get (), tok->range);
|
||
return parser_result_t (std::move (val));
|
||
}
|
||
|
||
case TOK_TRUE:
|
||
{
|
||
auto val = std::make_unique<literal> (JSON_TRUE);
|
||
m_lexer.consume ();
|
||
maybe_record_range (val.get (), tok->range);
|
||
return parser_result_t (std::move (val));
|
||
}
|
||
|
||
case TOK_FALSE:
|
||
{
|
||
auto val = std::make_unique<literal> (JSON_FALSE);
|
||
m_lexer.consume ();
|
||
maybe_record_range (val.get (), tok->range);
|
||
return parser_result_t (std::move (val));
|
||
}
|
||
|
||
case TOK_NULL:
|
||
{
|
||
auto val = std::make_unique<literal> (JSON_NULL);
|
||
m_lexer.consume ();
|
||
maybe_record_range (val.get (), tok->range);
|
||
return parser_result_t (std::move (val));
|
||
}
|
||
|
||
case TOK_ERROR:
|
||
return error_at (tok->range, "invalid JSON token: %s", tok->u.string);
|
||
|
||
default:
|
||
return error_at (tok->range, "expected a JSON value but got %s",
|
||
token_id_name[tok->id]);
|
||
}
|
||
}
|
||
|
||
/* Parse a JSON object.
|
||
(ECMA-404 section 6; RFC 7159 section 4). */
|
||
|
||
parser_result_t
|
||
parser::parse_object (int depth)
|
||
{
|
||
location_map::point start = get_next_token_start ();
|
||
|
||
require (TOK_OPEN_CURLY);
|
||
|
||
auto obj = std::make_unique<object> ();
|
||
|
||
const token *tok = m_lexer.peek ();
|
||
if (tok->id == TOK_CLOSE_CURLY)
|
||
{
|
||
location_map::point end = get_next_token_end ();
|
||
maybe_record_range (obj.get (), start, end);
|
||
if (auto err = require (TOK_CLOSE_CURLY))
|
||
return parser_result_t (std::move (err));
|
||
return parser_result_t (std::move (obj));
|
||
}
|
||
if (tok->id != TOK_STRING)
|
||
return error_at (tok->range,
|
||
"expected string for object key after '{'; got %s",
|
||
token_id_name[tok->id]);
|
||
while (true)
|
||
{
|
||
tok = m_lexer.peek ();
|
||
if (tok->id != TOK_STRING)
|
||
return error_at (tok->range,
|
||
"expected string for object key after ','; got %s",
|
||
token_id_name[tok->id]);
|
||
label_text key = label_text::take (xstrdup (tok->u.string));
|
||
m_lexer.consume ();
|
||
|
||
if (auto err = require (TOK_COLON))
|
||
return parser_result_t (std::move (err));
|
||
|
||
parser_result_t r = parse_value (depth + 1);
|
||
if (r.m_err)
|
||
return r;
|
||
if (!r.m_val)
|
||
return parser_result_t (std::move (obj));
|
||
|
||
/* We don't enforce uniqueness for keys. */
|
||
obj->set (key.get (), std::move (r.m_val));
|
||
|
||
location_map::point end = get_next_token_end ();
|
||
result<enum token_id, std::unique_ptr<error>> result
|
||
(require_one_of (TOK_COMMA, TOK_CLOSE_CURLY));
|
||
if (result.m_err)
|
||
return parser_result_t (std::move (result.m_err));
|
||
if (result.m_val == TOK_COMMA)
|
||
continue;
|
||
else
|
||
{
|
||
/* TOK_CLOSE_CURLY. */
|
||
maybe_record_range (obj.get (), start, end);
|
||
return parser_result_t (std::move (obj));
|
||
}
|
||
}
|
||
}
|
||
|
||
/* Parse a JSON array.
|
||
(ECMA-404 section 7; RFC 7159 section 5). */
|
||
|
||
parser_result_t
|
||
parser::parse_array (int depth)
|
||
{
|
||
location_map::point start = get_next_token_start ();
|
||
if (auto err = require (TOK_OPEN_SQUARE))
|
||
return parser_result_t (std::move (err));
|
||
|
||
auto arr = std::make_unique<array> ();
|
||
|
||
const token *tok = m_lexer.peek ();
|
||
if (tok->id == TOK_CLOSE_SQUARE)
|
||
{
|
||
location_map::point end = get_next_token_end ();
|
||
maybe_record_range (arr.get (), start, end);
|
||
m_lexer.consume ();
|
||
return parser_result_t (std::move (arr));
|
||
}
|
||
|
||
while (true)
|
||
{
|
||
parser_result_t r = parse_value (depth + 1);
|
||
if (r.m_err)
|
||
return r;
|
||
|
||
arr->append (std::move (r.m_val));
|
||
|
||
location_map::point end = get_next_token_end ();
|
||
result<enum token_id, std::unique_ptr<error>> result
|
||
(require_one_of (TOK_COMMA, TOK_CLOSE_SQUARE));
|
||
if (result.m_err)
|
||
return parser_result_t (std::move (result.m_err));
|
||
if (result.m_val == TOK_COMMA)
|
||
continue;
|
||
else
|
||
{
|
||
/* TOK_CLOSE_SQUARE. */
|
||
maybe_record_range (arr.get (), start, end);
|
||
return parser_result_t (std::move (arr));
|
||
}
|
||
}
|
||
}
|
||
|
||
/* Get the start point of the next token. */
|
||
|
||
location_map::point
|
||
parser::get_next_token_start ()
|
||
{
|
||
const token *tok = m_lexer.peek ();
|
||
return tok->range.m_start;
|
||
}
|
||
|
||
/* Get the end point of the next token. */
|
||
|
||
location_map::point
|
||
parser::get_next_token_end ()
|
||
{
|
||
const token *tok = m_lexer.peek ();
|
||
return tok->range.m_end;
|
||
}
|
||
|
||
/* Require an EOF, or fail if there is surplus input. */
|
||
|
||
std::unique_ptr<error>
|
||
parser::require_eof ()
|
||
{
|
||
return require (TOK_EOF);
|
||
}
|
||
|
||
/* Consume the next token, issuing an error if it is not of kind TOK_ID. */
|
||
|
||
std::unique_ptr<error>
|
||
parser::require (enum token_id tok_id)
|
||
{
|
||
const token *tok = m_lexer.peek ();
|
||
if (tok->id != tok_id)
|
||
{
|
||
if (tok->id == TOK_ERROR)
|
||
return error_at (tok->range,
|
||
"expected %s; got bad token: %s",
|
||
token_id_name[tok_id], tok->u.string);
|
||
else
|
||
return error_at (tok->range,
|
||
"expected %s; got %s", token_id_name[tok_id],
|
||
token_id_name[tok->id]);
|
||
}
|
||
m_lexer.consume ();
|
||
return nullptr;
|
||
}
|
||
|
||
/* Consume the next token, issuing an error if it is not of
|
||
kind TOK_ID_A or TOK_ID_B.
|
||
Return which kind it was. */
|
||
|
||
result<enum token_id, std::unique_ptr<error>>
|
||
parser::require_one_of (enum token_id tok_id_a, enum token_id tok_id_b)
|
||
{
|
||
const token *tok = m_lexer.peek ();
|
||
if ((tok->id != tok_id_a)
|
||
&& (tok->id != tok_id_b))
|
||
{
|
||
if (tok->id == TOK_ERROR)
|
||
return error_at (tok->range, "expected %s or %s; got bad token: %s",
|
||
token_id_name[tok_id_a], token_id_name[tok_id_b],
|
||
tok->u.string);
|
||
else
|
||
return error_at (tok->range, "expected %s or %s; got %s",
|
||
token_id_name[tok_id_a], token_id_name[tok_id_b],
|
||
token_id_name[tok->id]);
|
||
}
|
||
enum token_id id = tok->id;
|
||
m_lexer.consume ();
|
||
return result<enum token_id, std::unique_ptr<error>> (id);
|
||
}
|
||
|
||
/* Genarate a parsing error. */
|
||
|
||
std::unique_ptr<error>
|
||
parser::error_at (const location_map::range &r, const char *fmt, ...)
|
||
{
|
||
va_list ap;
|
||
va_start (ap, fmt);
|
||
char *formatted_msg = xvasprintf (fmt, ap);
|
||
va_end (ap);
|
||
|
||
return std::make_unique<error> (r, formatted_msg);
|
||
}
|
||
|
||
/* Record that JV has range R within the input file. */
|
||
|
||
void
|
||
parser::maybe_record_range (json::value *jv, const location_map::range &r)
|
||
{
|
||
if (m_loc_map)
|
||
m_loc_map->record_range_for_value (jv, r);
|
||
}
|
||
|
||
/* Record that JV has range START to END within the input file. */
|
||
|
||
void
|
||
parser::maybe_record_range (json::value *jv,
|
||
const location_map::point &start,
|
||
const location_map::point &end)
|
||
{
|
||
if (m_loc_map)
|
||
{
|
||
location_map::range r;
|
||
r.m_start = start;
|
||
r.m_end = end;
|
||
m_loc_map->record_range_for_value (jv, r);
|
||
}
|
||
}
|
||
|
||
/* Attempt to parse the UTF-8 encoded buffer at UTF8_BUF
|
||
of the given LENGTH.
|
||
If ALLOW_COMMENTS is true, then allow C and C++ style-comments in the
|
||
buffer, as an extension to JSON, otherwise forbid them.
|
||
If successful, return an json::value in the result.
|
||
if there was a problem, return a json::error in the result.
|
||
If OUT_LOC_MAP is non-NULL, notify *OUT_LOC_MAP about
|
||
source locations of nodes seen during parsing. */
|
||
|
||
parser_result_t
|
||
json::parse_utf8_string (size_t length,
|
||
const char *utf8_buf,
|
||
bool allow_comments,
|
||
location_map *out_loc_map)
|
||
{
|
||
parser p (out_loc_map, allow_comments);
|
||
if (auto err = p.add_utf8 (length, utf8_buf))
|
||
return parser_result_t (std::move (err));
|
||
parser_result_t r = p.parse_value (0);
|
||
if (r.m_err)
|
||
return r;
|
||
if (auto err = p.require_eof ())
|
||
return parser_result_t (std::move (err));
|
||
return r;
|
||
}
|
||
|
||
/* Attempt to parse the nil-terminated UTF-8 encoded buffer at
|
||
UTF8_BUF.
|
||
If ALLOW_COMMENTS is true, then allow C and C++ style-comments in the
|
||
buffer, as an extension to JSON, otherwise forbid them.
|
||
If successful, return a non-NULL json::value *.
|
||
if there was a problem, return NULL and write an error
|
||
message to err_out, which must be deleted by the caller.
|
||
If OUT_LOC_MAP is non-NULL, notify *OUT_LOC_MAP about
|
||
source locations of nodes seen during parsing. */
|
||
|
||
json::parser_result_t
|
||
json::parse_utf8_string (const char *utf8,
|
||
bool allow_comments,
|
||
location_map *out_loc_map)
|
||
{
|
||
return parse_utf8_string (strlen (utf8), utf8, allow_comments,
|
||
out_loc_map);
|
||
}
|
||
|
||
|
||
#if CHECKING_P
|
||
|
||
namespace selftest {
|
||
|
||
/* Selftests. */
|
||
|
||
/* Forward JV, FORMATTED and EXPECTED_JSON to assert_print_eq, tagging
   the check with the source location of the macro's user.  */

#define ASSERT_PRINT_EQ(JV, FORMATTED, EXPECTED_JSON)                 \
  assert_print_eq (SELFTEST_LOCATION, JV, FORMATTED, EXPECTED_JSON)
|
||
|
||
/* Implementation detail of ASSERT_RANGE_EQ. */
|
||
|
||
static void
|
||
assert_point_eq (const location &loc,
|
||
const location_map::point &actual_point,
|
||
size_t exp_unichar_idx, int exp_line, int exp_column)
|
||
{
|
||
ASSERT_EQ_AT (loc, actual_point.m_unichar_idx, exp_unichar_idx);
|
||
ASSERT_EQ_AT (loc, actual_point.m_line, exp_line);
|
||
ASSERT_EQ_AT (loc, actual_point.m_column, exp_column);
|
||
}
|
||
|
||
/* Implementation detail of ASSERT_RANGE_EQ. */
|
||
|
||
static void
|
||
assert_range_eq (const location &loc,
|
||
const location_map::range &actual_range,
|
||
/* Expected location. */
|
||
size_t start_unichar_idx, int start_line, int start_column,
|
||
size_t end_unichar_idx, int end_line, int end_column)
|
||
{
|
||
assert_point_eq (loc, actual_range.m_start,
|
||
start_unichar_idx, start_line, start_column);
|
||
assert_point_eq (loc, actual_range.m_end,
|
||
end_unichar_idx, end_line, end_column);
|
||
}
|
||
|
||
/* Assert that ACTUAL_RANGE starts at
   (START_UNICHAR_IDX, START_LINE, START_COLUMN)
   and ends at (END_UNICHAR_IDX, END_LINE, END_COLUMN).
   Any failure is reported at the location of the macro's user.  */

#define ASSERT_RANGE_EQ(ACTUAL_RANGE,                                   \
                        START_UNICHAR_IDX, START_LINE, START_COLUMN,    \
                        END_UNICHAR_IDX, END_LINE, END_COLUMN)          \
  assert_range_eq ((SELFTEST_LOCATION), (ACTUAL_RANGE),                 \
                   (START_UNICHAR_IDX), (START_LINE), (START_COLUMN),   \
                   (END_UNICHAR_IDX), (END_LINE), (END_COLUMN))
|
||
|
||
/* Implementation detail of ASSERT_ERR_EQ. */
|
||
|
||
static void
|
||
assert_err_eq (const location &loc,
|
||
const json::error *actual_err,
|
||
/* Expected location. */
|
||
size_t start_unichar_idx, int start_line, int start_column,
|
||
size_t end_unichar_idx, int end_line, int end_column,
|
||
const char *expected_msg)
|
||
{
|
||
ASSERT_TRUE_AT (loc, actual_err);
|
||
const location_map::range &actual_range = actual_err->get_range ();
|
||
ASSERT_EQ_AT (loc, actual_range.m_start.m_unichar_idx, start_unichar_idx);
|
||
ASSERT_EQ_AT (loc, actual_range.m_start.m_line, start_line);
|
||
ASSERT_EQ_AT (loc, actual_range.m_start.m_column, start_column);
|
||
ASSERT_EQ_AT (loc, actual_range.m_end.m_unichar_idx, end_unichar_idx);
|
||
ASSERT_EQ_AT (loc, actual_range.m_end.m_line, end_line);
|
||
ASSERT_EQ_AT (loc, actual_range.m_end.m_column, end_column);
|
||
ASSERT_STREQ_AT (loc, actual_err->get_msg (), expected_msg);
|
||
}
|
||
|
||
/* Assert that ACTUAL_ERR is a non-NULL json::error *,
   with message EXPECTED_MSG, and that its location starts
   at (START_UNICHAR_IDX, START_LINE, START_COLUMN)
   and ends at (END_UNICHAR_IDX, END_LINE, END_COLUMN).
   Any failure is reported at the location of the macro's user.  */

#define ASSERT_ERR_EQ(ACTUAL_ERR,                                       \
                      START_UNICHAR_IDX, START_LINE, START_COLUMN,      \
                      END_UNICHAR_IDX, END_LINE, END_COLUMN,            \
                      EXPECTED_MSG)                                     \
  assert_err_eq ((SELFTEST_LOCATION), (ACTUAL_ERR),                     \
                 (START_UNICHAR_IDX), (START_LINE), (START_COLUMN),     \
                 (END_UNICHAR_IDX), (END_LINE), (END_COLUMN),           \
                 (EXPECTED_MSG))
|
||
|
||
/* Verify that the JSON lexer works as expected. */
|
||
|
||
static void
|
||
test_lexer ()
|
||
{
|
||
lexer l (false);
|
||
const char *str
|
||
/* 0 1 2 3 4 . */
|
||
/* 01234567890123456789012345678901234567890123456789. */
|
||
= (" 1066 -1 \n"
|
||
" -273.15 1e6\n"
|
||
" [ ] null true false { } \"foo\" \n");
|
||
auto err = l.add_utf8 (strlen (str), str);
|
||
ASSERT_EQ (err, nullptr);
|
||
|
||
/* Line 1. */
|
||
{
|
||
const size_t line_offset = 0;
|
||
|
||
/* Expect token: "1066" in columns 4-7. */
|
||
{
|
||
const token *tok = l.peek ();
|
||
ASSERT_EQ (tok->id, TOK_INTEGER_NUMBER);
|
||
ASSERT_EQ (tok->u.integer_number, 1066);
|
||
ASSERT_RANGE_EQ (tok->range,
|
||
line_offset + 4, 1, 4,
|
||
line_offset + 7, 1, 7);
|
||
l.consume ();
|
||
}
|
||
/* Expect token: "-1" in columns 11-12. */
|
||
{
|
||
const token *tok = l.peek ();
|
||
ASSERT_EQ (tok->id, TOK_INTEGER_NUMBER);
|
||
ASSERT_EQ (tok->u.integer_number, -1);
|
||
ASSERT_RANGE_EQ (tok->range,
|
||
line_offset + 11, 1, 11,
|
||
line_offset + 12, 1, 12);
|
||
l.consume ();
|
||
}
|
||
}
|
||
|
||
/* Line 2. */
|
||
{
|
||
const size_t line_offset = 16;
|
||
|
||
/* Expect token: "-273.15" in columns 4-10. */
|
||
{
|
||
const token *tok = l.peek ();
|
||
ASSERT_EQ (tok->id, TOK_FLOAT_NUMBER);
|
||
ASSERT_EQ (int(tok->u.float_number), int(-273.15));
|
||
ASSERT_RANGE_EQ (tok->range,
|
||
line_offset + 4, 2, 4,
|
||
line_offset + 10, 2, 10);
|
||
l.consume ();
|
||
}
|
||
/* Expect token: "1e6" in columns 12-14. */
|
||
{
|
||
const token *tok = l.peek ();
|
||
ASSERT_EQ (tok->id, TOK_INTEGER_NUMBER);
|
||
ASSERT_EQ (tok->u.integer_number, 1000000);
|
||
ASSERT_RANGE_EQ (tok->range,
|
||
line_offset + 12, 2, 12,
|
||
line_offset + 14, 2, 14);
|
||
l.consume ();
|
||
}
|
||
}
|
||
|
||
/* Line 3. */
|
||
{
|
||
const size_t line_offset = 32;
|
||
|
||
/* Expect token: "[". */
|
||
{
|
||
const token *tok = l.peek ();
|
||
ASSERT_EQ (tok->id, TOK_OPEN_SQUARE);
|
||
ASSERT_RANGE_EQ (tok->range,
|
||
line_offset + 2, 3, 2,
|
||
line_offset + 2, 3, 2);
|
||
l.consume ();
|
||
}
|
||
/* Expect token: "]". */
|
||
{
|
||
const token *tok = l.peek ();
|
||
ASSERT_EQ (tok->id, TOK_CLOSE_SQUARE);
|
||
ASSERT_RANGE_EQ (tok->range,
|
||
line_offset + 6, 3, 6,
|
||
line_offset + 6, 3, 6);
|
||
l.consume ();
|
||
}
|
||
/* Expect token: "null". */
|
||
{
|
||
const token *tok = l.peek ();
|
||
ASSERT_EQ (tok->id, TOK_NULL);
|
||
ASSERT_RANGE_EQ (tok->range,
|
||
line_offset + 8, 3, 8,
|
||
line_offset + 11, 3, 11);
|
||
l.consume ();
|
||
}
|
||
/* Expect token: "true". */
|
||
{
|
||
const token *tok = l.peek ();
|
||
ASSERT_EQ (tok->id, TOK_TRUE);
|
||
ASSERT_RANGE_EQ (tok->range,
|
||
line_offset + 15, 3, 15,
|
||
line_offset + 18, 3, 18);
|
||
l.consume ();
|
||
}
|
||
/* Expect token: "false". */
|
||
{
|
||
const token *tok = l.peek ();
|
||
ASSERT_EQ (tok->id, TOK_FALSE);
|
||
ASSERT_RANGE_EQ (tok->range,
|
||
line_offset + 21, 3, 21,
|
||
line_offset + 25, 3, 25);
|
||
l.consume ();
|
||
}
|
||
/* Expect token: "{". */
|
||
{
|
||
const token *tok = l.peek ();
|
||
ASSERT_EQ (tok->id, TOK_OPEN_CURLY);
|
||
ASSERT_RANGE_EQ (tok->range,
|
||
line_offset + 28, 3, 28,
|
||
line_offset + 28, 3, 28);
|
||
l.consume ();
|
||
}
|
||
/* Expect token: "}". */
|
||
{
|
||
const token *tok = l.peek ();
|
||
ASSERT_EQ (tok->id, TOK_CLOSE_CURLY);
|
||
ASSERT_RANGE_EQ (tok->range,
|
||
line_offset + 31, 3, 31,
|
||
line_offset + 31, 3, 31);
|
||
l.consume ();
|
||
}
|
||
/* Expect token: "\"foo\"". */
|
||
{
|
||
const token *tok = l.peek ();
|
||
ASSERT_EQ (tok->id, TOK_STRING);
|
||
ASSERT_RANGE_EQ (tok->range,
|
||
line_offset + 34, 3, 34,
|
||
line_offset + 38, 3, 38);
|
||
l.consume ();
|
||
}
|
||
}
|
||
}
|
||
|
||
/* Verify that the JSON lexer complains about single-line comments
|
||
when comments are disabled. */
|
||
|
||
static void
|
||
test_lexing_unsupported_single_line_comment ()
|
||
{
|
||
lexer l (false);
|
||
const char *str
|
||
/* 0 1 2 3 4 . */
|
||
/* 01234567890123456789012345678901234567890123456789. */
|
||
= (" 1066 // Hello world\n");
|
||
auto err = l.add_utf8 (strlen (str), str);
|
||
ASSERT_EQ (err, nullptr);
|
||
|
||
/* Line 1. */
|
||
{
|
||
const size_t line_offset = 0;
|
||
const int line_1 = 1;
|
||
|
||
/* Expect token: "1066" in columns 4-7. */
|
||
{
|
||
const token *tok = l.peek ();
|
||
ASSERT_EQ (tok->id, TOK_INTEGER_NUMBER);
|
||
ASSERT_EQ (tok->u.integer_number, 1066);
|
||
ASSERT_RANGE_EQ (tok->range,
|
||
line_offset + 4, line_1, 4,
|
||
line_offset + 7, line_1, 7);
|
||
l.consume ();
|
||
}
|
||
|
||
/* Expect error. */
|
||
{
|
||
const token *tok = l.peek ();
|
||
ASSERT_EQ (tok->id, TOK_ERROR);
|
||
ASSERT_STREQ (tok->u.string, "unexpected character: '/'");
|
||
ASSERT_RANGE_EQ (tok->range,
|
||
line_offset + 11, line_1, 11,
|
||
line_offset + 11, line_1, 11);
|
||
l.consume ();
|
||
}
|
||
}
|
||
}
|
||
|
||
/* Verify that the JSON lexer complains about multiline comments
|
||
when comments are disabled. */
|
||
|
||
static void
|
||
test_lexing_unsupported_multiline_comment ()
|
||
{
|
||
lexer l (false);
|
||
const char *str
|
||
/* 0 1 2 3 4 . */
|
||
/* 01234567890123456789012345678901234567890123456789. */
|
||
= (" 1066 /* Hello world\n"
|
||
" continuation of comment\n"
|
||
" end of comment */ 42\n");
|
||
auto err = l.add_utf8 (strlen (str), str);
|
||
ASSERT_EQ (err, nullptr);
|
||
|
||
/* Line 1. */
|
||
{
|
||
const size_t line_offset = 0;
|
||
const int line_1 = 1;
|
||
|
||
/* Expect token: "1066" in line 1, columns 4-7. */
|
||
{
|
||
const token *tok = l.peek ();
|
||
ASSERT_EQ (tok->id, TOK_INTEGER_NUMBER);
|
||
ASSERT_EQ (tok->u.integer_number, 1066);
|
||
ASSERT_RANGE_EQ (tok->range,
|
||
line_offset + 4, line_1, 4,
|
||
line_offset + 7, line_1, 7);
|
||
l.consume ();
|
||
}
|
||
|
||
/* Expect error. */
|
||
{
|
||
const token *tok = l.peek ();
|
||
ASSERT_EQ (tok->id, TOK_ERROR);
|
||
ASSERT_STREQ (tok->u.string, "unexpected character: '/'");
|
||
ASSERT_RANGE_EQ (tok->range,
|
||
line_offset + 11, line_1, 11,
|
||
line_offset + 11, line_1, 11);
|
||
l.consume ();
|
||
}
|
||
}
|
||
}
|
||
|
||
/* Verify that the JSON lexer handles single-line comments
|
||
when comments are enabled. */
|
||
|
||
static void
|
||
test_lexing_supported_single_line_comment ()
|
||
{
|
||
lexer l (true);
|
||
const char *str
|
||
/* 0 1 2 3 4 . */
|
||
/* 01234567890123456789012345678901234567890123456789. */
|
||
= (" 1066 // Hello world\n"
|
||
" 42 // etc\n");
|
||
auto err = l.add_utf8 (strlen (str), str);
|
||
ASSERT_EQ (err, nullptr);
|
||
|
||
const size_t line_1_offset = 0;
|
||
const size_t line_2_offset = 26;
|
||
const size_t line_3_offset = line_2_offset + 17;
|
||
|
||
/* Expect token: "1066" in line 1, columns 4-7. */
|
||
{
|
||
const int line_1 = 1;
|
||
const token *tok = l.peek ();
|
||
ASSERT_EQ (tok->id, TOK_INTEGER_NUMBER);
|
||
ASSERT_EQ (tok->u.integer_number, 1066);
|
||
ASSERT_RANGE_EQ (tok->range,
|
||
line_1_offset + 4, line_1, 4,
|
||
line_1_offset + 7, line_1, 7);
|
||
l.consume ();
|
||
}
|
||
|
||
/* Expect token: "42" in line 2, columns 5-6. */
|
||
{
|
||
const int line_2 = 2;
|
||
const token *tok = l.peek ();
|
||
ASSERT_EQ (tok->id, TOK_INTEGER_NUMBER);
|
||
ASSERT_EQ (tok->u.integer_number, 42);
|
||
ASSERT_RANGE_EQ (tok->range,
|
||
line_2_offset + 5, line_2, 5,
|
||
line_2_offset + 6, line_2, 6);
|
||
l.consume ();
|
||
}
|
||
|
||
/* Expect EOF. */
|
||
{
|
||
const int line_3 = 3;
|
||
const token *tok = l.peek ();
|
||
ASSERT_EQ (tok->id, TOK_EOF);
|
||
ASSERT_RANGE_EQ (tok->range,
|
||
line_3_offset + 0, line_3, 0,
|
||
line_3_offset + 0, line_3, 0);
|
||
l.consume ();
|
||
}
|
||
}
|
||
|
||
/* Verify that the JSON lexer handles multiline comments
|
||
when comments are enabled. */
|
||
|
||
static void
|
||
test_lexing_supported_multiline_comment ()
|
||
{
|
||
lexer l (true);
|
||
const char *str
|
||
/* 0 1 2 3 4 . */
|
||
/* 01234567890123456789012345678901234567890123456789. */
|
||
= (" 1066 /* Hello world\n"
|
||
" continuation of comment\n"
|
||
" end of comment */ 42\n");
|
||
auto err = l.add_utf8 (strlen (str), str);
|
||
ASSERT_EQ (err, nullptr);
|
||
|
||
const size_t line_1_offset = 0;
|
||
const size_t line_2_offset = 26;
|
||
const size_t line_3_offset = line_2_offset + 25;
|
||
const size_t line_4_offset = line_3_offset + 23;
|
||
|
||
/* Expect token: "1066" in line 1, columns 4-7. */
|
||
{
|
||
const int line_1 = 1;
|
||
const token *tok = l.peek ();
|
||
ASSERT_EQ (tok->id, TOK_INTEGER_NUMBER);
|
||
ASSERT_EQ (tok->u.integer_number, 1066);
|
||
ASSERT_RANGE_EQ (tok->range,
|
||
line_1_offset + 4, line_1, 4,
|
||
line_1_offset + 7, line_1, 7);
|
||
l.consume ();
|
||
}
|
||
|
||
/* Expect token: "42" in line 3, columns 20-21. */
|
||
{
|
||
const int line_3 = 3;
|
||
const token *tok = l.peek ();
|
||
ASSERT_EQ (tok->id, TOK_INTEGER_NUMBER);
|
||
ASSERT_EQ (tok->u.integer_number, 42);
|
||
ASSERT_RANGE_EQ (tok->range,
|
||
line_3_offset + 20, line_3, 20,
|
||
line_3_offset + 21, line_3, 21);
|
||
l.consume ();
|
||
}
|
||
|
||
/* Expect EOF. */
|
||
{
|
||
const int line_4 = 4;
|
||
const token *tok = l.peek ();
|
||
ASSERT_EQ (tok->id, TOK_EOF);
|
||
ASSERT_RANGE_EQ (tok->range,
|
||
line_4_offset + 0, line_4, 0,
|
||
line_4_offset + 0, line_4, 0);
|
||
l.consume ();
|
||
}
|
||
}
|
||
|
||
/* Helper class for writing JSON parsing testcases.
|
||
Attempts to parse a string in ctor, and captures the result (either
|
||
a json::value or a json::error), and a location map. */
|
||
|
||
struct parser_testcase
|
||
{
|
||
public:
|
||
parser_testcase (const char *utf8_string, bool allow_comments = false)
|
||
: m_loc_map (),
|
||
m_result (parse_utf8_string (utf8_string, allow_comments, &m_loc_map))
|
||
{
|
||
}
|
||
|
||
const json::value *get_value () const { return m_result.m_val.get (); }
|
||
const json::error *get_error () const { return m_result.m_err.get (); }
|
||
|
||
const location_map::range *
|
||
get_range_for_value (const json::value *jv) const
|
||
{
|
||
return m_loc_map.get_range_for_value (jv);
|
||
}
|
||
|
||
private:
|
||
/* Concrete implementation of location_map for use in
|
||
JSON parsing selftests. */
|
||
class test_location_map : public location_map
|
||
{
|
||
public:
|
||
void record_range_for_value (json::value *jv, const range &r) final override
|
||
{
|
||
m_map.put (jv, r);
|
||
}
|
||
|
||
range *get_range_for_value (const json::value *jv) const
|
||
{
|
||
return const_cast<hash_map<const json::value *, range> &> (m_map)
|
||
.get (jv);
|
||
}
|
||
|
||
private:
|
||
hash_map<const json::value *, range> m_map;
|
||
};
|
||
|
||
test_location_map m_loc_map;
|
||
json::parser_result_t m_result;
|
||
};
|
||
|
||
/* Verify that parse_utf8_string works as expected. */
|
||
|
||
static void
|
||
test_parse_string ()
|
||
{
|
||
const int line_1 = 1;
|
||
|
||
{
|
||
parser_testcase tc ("\"foo\"");
|
||
ASSERT_EQ (tc.get_error (), nullptr);
|
||
const json::value *jv = tc.get_value ();
|
||
ASSERT_EQ (jv->get_kind (), JSON_STRING);
|
||
ASSERT_STREQ (as_a <const json::string *> (jv)->get_string (), "foo");
|
||
ASSERT_PRINT_EQ (*jv, true, "\"foo\"");
|
||
auto range = tc.get_range_for_value (jv);
|
||
ASSERT_TRUE (range);
|
||
ASSERT_RANGE_EQ (*range,
|
||
0, line_1, 0,
|
||
4, line_1, 4);
|
||
}
|
||
|
||
{
|
||
const char *contains_quotes = "\"before \\\"quoted\\\" after\"";
|
||
parser_testcase tc (contains_quotes);
|
||
ASSERT_EQ (tc.get_error (), nullptr);
|
||
const json::value *jv = tc.get_value ();
|
||
ASSERT_EQ (jv->get_kind (), JSON_STRING);
|
||
ASSERT_STREQ (as_a <const json::string *> (jv)->get_string (),
|
||
"before \"quoted\" after");
|
||
ASSERT_PRINT_EQ (*jv, true, contains_quotes);
|
||
auto range = tc.get_range_for_value (jv);
|
||
ASSERT_TRUE (range);
|
||
ASSERT_RANGE_EQ (*range,
|
||
0, line_1, 0,
|
||
24, line_1, 24);
|
||
}
|
||
|
||
/* Test of non-ASCII input. This string is the Japanese word "mojibake",
|
||
written as C octal-escaped UTF-8. */
|
||
const char *mojibake = (/* Opening quote. */
|
||
"\""
|
||
/* U+6587 CJK UNIFIED IDEOGRAPH-6587
|
||
UTF-8: 0xE6 0x96 0x87
|
||
C octal escaped UTF-8: \346\226\207. */
|
||
"\346\226\207"
|
||
/* U+5B57 CJK UNIFIED IDEOGRAPH-5B57
|
||
UTF-8: 0xE5 0xAD 0x97
|
||
C octal escaped UTF-8: \345\255\227. */
|
||
"\345\255\227"
|
||
/* U+5316 CJK UNIFIED IDEOGRAPH-5316
|
||
UTF-8: 0xE5 0x8C 0x96
|
||
C octal escaped UTF-8: \345\214\226. */
|
||
"\345\214\226"
|
||
/* U+3051 HIRAGANA LETTER KE
|
||
UTF-8: 0xE3 0x81 0x91
|
||
C octal escaped UTF-8: \343\201\221. */
|
||
"\343\201\221"
|
||
/* Closing quote. */
|
||
"\"");
|
||
{
|
||
parser_testcase tc (mojibake);
|
||
ASSERT_EQ (tc.get_error (), nullptr);
|
||
const json::value *jv = tc.get_value ();
|
||
ASSERT_EQ (jv->get_kind (), JSON_STRING);
|
||
/* Result of get_string should be UTF-8 encoded, without quotes. */
|
||
ASSERT_STREQ (as_a <const json::string *> (jv)->get_string (),
|
||
"\346\226\207" "\345\255\227" "\345\214\226" "\343\201\221");
|
||
/* Result of dump should be UTF-8 encoded, with quotes. */
|
||
ASSERT_PRINT_EQ (*jv, false, mojibake);
|
||
auto range = tc.get_range_for_value (jv);
|
||
ASSERT_TRUE (range);
|
||
ASSERT_RANGE_EQ (*range,
|
||
0, line_1, 0,
|
||
5, line_1, 5);
|
||
}
|
||
|
||
/* Test of \u-escaped unicode. This is "mojibake" again, as above. */
|
||
{
|
||
const char *escaped_unicode = "\"\\u6587\\u5b57\\u5316\\u3051\"";
|
||
parser_testcase tc (escaped_unicode);
|
||
ASSERT_EQ (tc.get_error (), nullptr);
|
||
const json::value *jv = tc.get_value ();
|
||
ASSERT_EQ (jv->get_kind (), JSON_STRING);
|
||
/* Result of get_string should be UTF-8 encoded, without quotes. */
|
||
ASSERT_STREQ (as_a <const json::string *> (jv)->get_string (),
|
||
"\346\226\207" "\345\255\227" "\345\214\226" "\343\201\221");
|
||
/* Result of dump should be UTF-8 encoded, with quotes. */
|
||
ASSERT_PRINT_EQ (*jv, false, mojibake);
|
||
auto range = tc.get_range_for_value (jv);
|
||
ASSERT_TRUE (range);
|
||
ASSERT_RANGE_EQ (*range,
|
||
0, line_1, 0,
|
||
25, line_1, 25);
|
||
}
|
||
}
|
||
|
||
/* Verify that we can parse various kinds of JSON numbers. */
|
||
|
||
static void
|
||
test_parse_number ()
|
||
{
|
||
const int line_1 = 1;
|
||
|
||
{
|
||
parser_testcase tc ("42");
|
||
ASSERT_EQ (tc.get_error (), nullptr);
|
||
const json::value *jv = tc.get_value ();
|
||
ASSERT_EQ (jv->get_kind (), JSON_INTEGER);
|
||
ASSERT_EQ (as_a <const json::integer_number *> (jv)->get (), 42.0);
|
||
ASSERT_PRINT_EQ (*jv, true, "42");
|
||
auto range = tc.get_range_for_value (jv);
|
||
ASSERT_TRUE (range);
|
||
ASSERT_RANGE_EQ (*range,
|
||
0, line_1, 0,
|
||
1, line_1, 1);
|
||
}
|
||
|
||
/* Negative number. */
|
||
{
|
||
parser_testcase tc ("-17");
|
||
ASSERT_EQ (tc.get_error (), nullptr);
|
||
const json::value *jv = tc.get_value ();
|
||
ASSERT_EQ (jv->get_kind (), JSON_INTEGER);
|
||
ASSERT_EQ (as_a<const json::integer_number *> (jv)->get (), -17.0);
|
||
ASSERT_PRINT_EQ (*jv, true, "-17");
|
||
auto range = tc.get_range_for_value (jv);
|
||
ASSERT_TRUE (range);
|
||
ASSERT_RANGE_EQ (*range,
|
||
0, line_1, 0,
|
||
2, line_1, 2);
|
||
}
|
||
|
||
/* Decimal. */
|
||
{
|
||
parser_testcase tc ("3.141");
|
||
ASSERT_EQ (tc.get_error (), nullptr);
|
||
const json::value *jv = tc.get_value ();
|
||
ASSERT_EQ (JSON_FLOAT, jv->get_kind ());
|
||
ASSERT_NEAR (3.141, ((const json::float_number *)jv)->get (), 0.001);
|
||
auto range = tc.get_range_for_value (jv);
|
||
ASSERT_TRUE (range);
|
||
ASSERT_RANGE_EQ (*range,
|
||
0, line_1, 0,
|
||
4, line_1, 4);
|
||
}
|
||
|
||
/* Exponents. */
|
||
{
|
||
{
|
||
parser_testcase tc ("3.141e+0");
|
||
ASSERT_EQ (tc.get_error (), nullptr);
|
||
const json::value *jv = tc.get_value ();
|
||
ASSERT_EQ (jv->get_kind (), JSON_FLOAT);
|
||
ASSERT_NEAR (as_a <const json::float_number *> (jv)->get (), 3.141, 0.1);
|
||
auto range = tc.get_range_for_value (jv);
|
||
ASSERT_TRUE (range);
|
||
ASSERT_RANGE_EQ (*range,
|
||
0, line_1, 0,
|
||
7, line_1, 7);
|
||
}
|
||
{
|
||
parser_testcase tc ("42e2");
|
||
ASSERT_EQ (tc.get_error (), nullptr);
|
||
const json::value *jv = tc.get_value ();
|
||
ASSERT_EQ (jv->get_kind (), JSON_INTEGER);
|
||
ASSERT_EQ (as_a <const json::integer_number *> (jv)->get (), 4200);
|
||
ASSERT_PRINT_EQ (*jv, true, "4200");
|
||
auto range = tc.get_range_for_value (jv);
|
||
ASSERT_TRUE (range);
|
||
ASSERT_RANGE_EQ (*range,
|
||
0, line_1, 0,
|
||
3, line_1, 3);
|
||
}
|
||
{
|
||
parser_testcase tc ("42e-1");
|
||
ASSERT_EQ (tc.get_error (), nullptr);
|
||
const json::value *jv = tc.get_value ();
|
||
ASSERT_EQ (jv->get_kind (), JSON_FLOAT);
|
||
ASSERT_NEAR (as_a <const json::float_number *> (jv)->get (), 4.2, 0.1);
|
||
auto range = tc.get_range_for_value (jv);
|
||
ASSERT_TRUE (range);
|
||
ASSERT_RANGE_EQ (*range,
|
||
0, line_1, 0,
|
||
4, line_1, 4);
|
||
}
|
||
}
|
||
}
|
||
|
||
/* Verify that JSON array parsing works. */
|
||
|
||
static void
|
||
test_parse_array ()
|
||
{
|
||
const int line_1 = 1;
|
||
|
||
parser_testcase tc ("[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]");
|
||
ASSERT_EQ (tc.get_error (), nullptr);
|
||
const json::value *jv = tc.get_value ();
|
||
ASSERT_EQ (jv->get_kind (), JSON_ARRAY);
|
||
const json::array *arr = as_a <const json::array *> (jv);
|
||
ASSERT_EQ (arr->length (), 10);
|
||
auto range = tc.get_range_for_value (jv);
|
||
ASSERT_TRUE (range);
|
||
ASSERT_RANGE_EQ (*range,
|
||
0, line_1, 0,
|
||
29, line_1, 29);
|
||
for (int i = 0; i < 10; i++)
|
||
{
|
||
json::value *element = arr->get (i);
|
||
ASSERT_EQ (element->get_kind (), JSON_INTEGER);
|
||
ASSERT_EQ (as_a <json::integer_number *> (element)->get (), i);
|
||
range = tc.get_range_for_value (element);
|
||
ASSERT_TRUE (range);
|
||
const int offset = 1 + (i * 3);
|
||
ASSERT_RANGE_EQ (*range,
|
||
offset, line_1, offset,
|
||
offset, line_1, offset);
|
||
}
|
||
ASSERT_PRINT_EQ (*jv, false, "[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]");
|
||
}
|
||
|
||
/* Verify that JSON object parsing works. */
|
||
|
||
static void
|
||
test_parse_object ()
|
||
{
|
||
const int line_1 = 1;
|
||
std::unique_ptr<error> err;
|
||
/* 0 1 2 3 . */
|
||
/* 01 2345 678 9012 345 6789 0123456789012. */
|
||
parser_testcase tc ("{\"foo\": \"bar\", \"baz\": [42, null]}");
|
||
|
||
ASSERT_EQ (tc.get_error (), nullptr);
|
||
const json::value *jv = tc.get_value ();
|
||
ASSERT_NE (jv, nullptr);
|
||
ASSERT_EQ (jv->get_kind (), JSON_OBJECT);
|
||
auto range = tc.get_range_for_value (jv);
|
||
ASSERT_TRUE (range);
|
||
ASSERT_RANGE_EQ (*range,
|
||
0, line_1, 0,
|
||
32, line_1, 32);
|
||
const json::object *jo = static_cast <const json::object *> (jv);
|
||
|
||
json::value *foo_value = jo->get ("foo");
|
||
ASSERT_NE (foo_value, nullptr);
|
||
ASSERT_EQ (foo_value->get_kind (), JSON_STRING);
|
||
ASSERT_STREQ (as_a <json::string *> (foo_value)->get_string (), "bar");
|
||
range = tc.get_range_for_value (foo_value);
|
||
ASSERT_TRUE (range);
|
||
ASSERT_RANGE_EQ (*range,
|
||
8, line_1, 8,
|
||
12, line_1, 12);
|
||
|
||
json::value *baz_value = jo->get ("baz");
|
||
ASSERT_NE (baz_value, nullptr);
|
||
ASSERT_EQ (baz_value->get_kind (), JSON_ARRAY);
|
||
range = tc.get_range_for_value (baz_value);
|
||
ASSERT_TRUE (range);
|
||
ASSERT_RANGE_EQ (*range,
|
||
22, line_1, 22,
|
||
31, line_1, 31);
|
||
|
||
json::array *baz_array = as_a <json::array *> (baz_value);
|
||
ASSERT_EQ (baz_array->length (), 2);
|
||
|
||
json::value *element0 = baz_array->get (0);
|
||
ASSERT_EQ (as_a <json::integer_number *> (element0)->get (), 42);
|
||
range = tc.get_range_for_value (element0);
|
||
ASSERT_TRUE (range);
|
||
ASSERT_RANGE_EQ (*range,
|
||
23, line_1, 23,
|
||
24, line_1, 24);
|
||
|
||
json::value *element1 = baz_array->get (1);
|
||
ASSERT_EQ (element1->get_kind (), JSON_NULL);
|
||
range = tc.get_range_for_value (element1);
|
||
ASSERT_TRUE (range);
|
||
ASSERT_RANGE_EQ (*range,
|
||
27, line_1, 27,
|
||
30, line_1, 30);
|
||
}
|
||
|
||
/* Verify that the JSON literals "true", "false" and "null" are parsed
|
||
correctly. */
|
||
|
||
static void
|
||
test_parse_literals ()
|
||
{
|
||
const int line_1 = 1;
|
||
{
|
||
parser_testcase tc ("true");
|
||
ASSERT_EQ (tc.get_error (), nullptr);
|
||
const json::value *jv = tc.get_value ();
|
||
ASSERT_NE (jv, nullptr);
|
||
ASSERT_EQ (jv->get_kind (), JSON_TRUE);
|
||
ASSERT_PRINT_EQ (*jv, false, "true");
|
||
auto range = tc.get_range_for_value (jv);
|
||
ASSERT_TRUE (range);
|
||
ASSERT_RANGE_EQ (*range,
|
||
0, line_1, 0,
|
||
3, line_1, 3);
|
||
}
|
||
|
||
{
|
||
parser_testcase tc ("false");
|
||
ASSERT_EQ (tc.get_error (), nullptr);
|
||
const json::value *jv = tc.get_value ();
|
||
ASSERT_NE (jv, nullptr);
|
||
ASSERT_EQ (jv->get_kind (), JSON_FALSE);
|
||
ASSERT_PRINT_EQ (*jv, false, "false");
|
||
auto range = tc.get_range_for_value (jv);
|
||
ASSERT_TRUE (range);
|
||
ASSERT_RANGE_EQ (*range,
|
||
0, line_1, 0,
|
||
4, line_1, 4);
|
||
}
|
||
|
||
{
|
||
parser_testcase tc ("null");
|
||
ASSERT_EQ (tc.get_error (), nullptr);
|
||
const json::value *jv = tc.get_value ();
|
||
ASSERT_NE (jv, nullptr);
|
||
ASSERT_EQ (jv->get_kind (), JSON_NULL);
|
||
ASSERT_PRINT_EQ (*jv, false, "null");
|
||
auto range = tc.get_range_for_value (jv);
|
||
ASSERT_TRUE (range);
|
||
ASSERT_RANGE_EQ (*range,
|
||
0, line_1, 0,
|
||
3, line_1, 3);
|
||
}
|
||
}
|
||
|
||
/* Verify that we can parse a simple JSON-RPC request. */
|
||
|
||
static void
|
||
test_parse_jsonrpc ()
|
||
{
|
||
std::unique_ptr<error> err;
|
||
const char *request
|
||
/* 0 1 2 3 4. */
|
||
/* 01 23456789 012 3456 789 0123456 789 012345678 90. */
|
||
= ("{\"jsonrpc\": \"2.0\", \"method\": \"subtract\",\n"
|
||
/* 0 1 2 3 4. */
|
||
/* 0 1234567 8901234567890 1234 56789012345678 90. */
|
||
" \"params\": [42, 23], \"id\": 1}");
|
||
const int line_1 = 1;
|
||
const int line_2 = 2;
|
||
const size_t line_2_offset = 41;
|
||
parser_testcase tc (request);
|
||
ASSERT_EQ (tc.get_error (), nullptr);
|
||
const json::value *jv = tc.get_value ();
|
||
ASSERT_NE (jv, nullptr);
|
||
auto range = tc.get_range_for_value (jv);
|
||
ASSERT_TRUE (range);
|
||
ASSERT_RANGE_EQ (*range,
|
||
0, line_1, 0,
|
||
line_2_offset + 28, line_2, 28);
|
||
}
|
||
|
||
/* Verify that we can parse an empty JSON object. */
|
||
|
||
static void
|
||
test_parse_empty_object ()
|
||
{
|
||
const int line_1 = 1;
|
||
std::unique_ptr<error> err;
|
||
parser_testcase tc ("{}");
|
||
ASSERT_EQ (tc.get_error (), nullptr);
|
||
const json::value *jv = tc.get_value ();
|
||
ASSERT_NE (jv, nullptr);
|
||
ASSERT_EQ (jv->get_kind (), JSON_OBJECT);
|
||
ASSERT_PRINT_EQ (*jv, true, "{}");
|
||
auto range = tc.get_range_for_value (jv);
|
||
ASSERT_TRUE (range);
|
||
ASSERT_RANGE_EQ (*range,
|
||
0, line_1, 0,
|
||
1, line_1, 1);
|
||
}
|
||
|
||
/* Verify that comment-parsing can be enabled or disabled. */
|
||
|
||
static void
|
||
test_parsing_comments ()
|
||
{
|
||
const char *str = ("// foo\n"
|
||
"/*...\n"
|
||
"...*/ 42 // bar\n"
|
||
"/* etc */\n");
|
||
|
||
/* Parsing with comment support disabled. */
|
||
{
|
||
parser_testcase tc (str);
|
||
ASSERT_NE (tc.get_error (), nullptr);
|
||
ASSERT_STREQ (tc.get_error ()->get_msg (),
|
||
"invalid JSON token: unexpected character: '/'");
|
||
ASSERT_EQ (tc.get_value (), nullptr);
|
||
}
|
||
|
||
/* Parsing with comment support enabled. */
|
||
{
|
||
parser_testcase tc (str, true);
|
||
ASSERT_EQ (tc.get_error (), nullptr);
|
||
const json::value *jv = tc.get_value ();
|
||
ASSERT_NE (jv, nullptr);
|
||
ASSERT_EQ (jv->get_kind (), JSON_INTEGER);
|
||
ASSERT_EQ (((const json::integer_number *)jv)->get (), 42);
|
||
}
|
||
}
|
||
|
||
/* Verify that we can parse an empty JSON string. */
|
||
|
||
static void
|
||
test_error_empty_string ()
|
||
{
|
||
const int line_1 = 1;
|
||
parser_testcase tc ("");
|
||
ASSERT_ERR_EQ (tc.get_error (),
|
||
0, line_1, 0,
|
||
0, line_1, 0,
|
||
"expected a JSON value but got EOF");
|
||
ASSERT_EQ (tc.get_value (), nullptr);
|
||
}
|
||
|
||
/* Verify that JSON parsing gracefully handles an invalid token. */
|
||
|
||
static void
|
||
test_error_bad_token ()
|
||
{
|
||
const int line_1 = 1;
|
||
parser_testcase tc (" not valid ");
|
||
ASSERT_ERR_EQ (tc.get_error (),
|
||
2, line_1, 2,
|
||
2, line_1, 2,
|
||
"invalid JSON token: unexpected character: 'n'");
|
||
ASSERT_EQ (tc.get_value (), nullptr);
|
||
}
|
||
|
||
/* Verify that JSON parsing gracefully handles a missing comma
|
||
within an object. */
|
||
|
||
static void
|
||
test_error_object_with_missing_comma ()
|
||
{
|
||
const int line_1 = 1;
|
||
/* 0 1 2. */
|
||
/* 01 2345 6789012 3456 7890. */
|
||
const char *json = "{\"foo\" : 42 \"bar\"";
|
||
parser_testcase tc (json);
|
||
ASSERT_ERR_EQ (tc.get_error (),
|
||
12, line_1, 12,
|
||
16, line_1, 16,
|
||
"expected ',' or '}'; got string");
|
||
ASSERT_EQ (tc.get_value (), nullptr);
|
||
}
|
||
|
||
/* Verify that JSON parsing gracefully handles a missing comma
|
||
within an array. */
|
||
|
||
static void
|
||
test_error_array_with_missing_comma ()
|
||
{
|
||
const int line_1 = 1;
|
||
/* 01234567. */
|
||
const char *json = "[0, 1 42]";
|
||
parser_testcase tc (json);
|
||
ASSERT_ERR_EQ (tc.get_error (),
|
||
6, line_1, 6,
|
||
7, line_1, 7,
|
||
"expected ',' or ']'; got number");
|
||
ASSERT_EQ (tc.get_value (), nullptr);
|
||
}
|
||
|
||
/* Run all of the selftests within this file. */
|
||
|
||
void
|
||
json_parser_cc_tests ()
|
||
{
|
||
test_lexer ();
|
||
test_lexing_unsupported_single_line_comment ();
|
||
test_lexing_unsupported_multiline_comment ();
|
||
test_lexing_supported_single_line_comment ();
|
||
test_lexing_supported_multiline_comment ();
|
||
test_parse_string ();
|
||
test_parse_number ();
|
||
test_parse_array ();
|
||
test_parse_object ();
|
||
test_parse_literals ();
|
||
test_parse_jsonrpc ();
|
||
test_parse_empty_object ();
|
||
test_parsing_comments ();
|
||
test_error_empty_string ();
|
||
test_error_bad_token ();
|
||
test_error_object_with_missing_comma ();
|
||
test_error_array_with_missing_comma ();
|
||
}
|
||
|
||
} // namespace selftest
|
||
|
||
#endif /* #if CHECKING_P */
|