Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
simple_dom: respect the size of text, and make parsers exception-safe. (
#951) (#953)

Co-authored-by: Huiba Li <[email protected]>
  • Loading branch information
photonlibos and lihuiba committed Aug 14, 2025
commit c8261b77d19e5cbcc92db0befee47180cd62bc69
1 change: 1 addition & 0 deletions ecosystem/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ FetchContent_Declare(
URL ${PHOTON_RAPIDXML_SOURCE}
URL_HASH
SHA256=c3f0b886374981bb20fabcf323d755db4be6dba42064599481da64a85f5b3571
PATCH_COMMAND git apply ${CMAKE_CURRENT_SOURCE_DIR}/patches/rapidxml.patch
UPDATE_DISCONNECTED 1)
FetchContent_MakeAvailable(rapidxml)
message(STATUS "Rapidxml source dir: ${rapidxml_SOURCE_DIR}")
Expand Down
23 changes: 19 additions & 4 deletions ecosystem/patches/rapidjson.patch
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ index 19f8849b..618492a4 100644
+ kParseBoolsAsStringFlag = 512, //!< Parse all booleans (true/false) as strings.
kParseDefaultFlags = RAPIDJSON_PARSE_DEFAULT_FLAGS //!< Default parse flags. Can be customized by defining RAPIDJSON_PARSE_DEFAULT_FLAGS
};

@@ -201,6 +202,8 @@ struct BaseReaderHandler {
bool Default() { return true; }
bool Null() { return static_cast<Override&>(*this).Default(); }
Expand All @@ -22,7 +22,7 @@ index 19f8849b..618492a4 100644
@@ -714,13 +717,22 @@ private:
RAPIDJSON_PARSE_ERROR(kParseErrorValueInvalid, is.Tell());
}

+ template<unsigned parseFlags, typename InputStream, typename Handler>
+ void ParseRawBools(InputStream& is, Handler& handler) {
+
Expand All @@ -33,7 +33,7 @@ index 19f8849b..618492a4 100644
RAPIDJSON_ASSERT(is.Peek() == 't');
+ auto begin = is.PutBegin();
is.Take();

if (RAPIDJSON_LIKELY(Consume(is, 'r') && Consume(is, 'u') && Consume(is, 'e'))) {
- if (RAPIDJSON_UNLIKELY(!handler.Bool(true)))
+ auto copy = !(parseFlags & kParseInsituFlag);
Expand All @@ -49,7 +49,7 @@ index 19f8849b..618492a4 100644
RAPIDJSON_ASSERT(is.Peek() == 'f');
+ auto begin = is.PutBegin();
is.Take();

if (RAPIDJSON_LIKELY(Consume(is, 'a') && Consume(is, 'l') && Consume(is, 's') && Consume(is, 'e'))) {
- if (RAPIDJSON_UNLIKELY(!handler.Bool(false)))
+ auto copy = !(parseFlags & kParseInsituFlag);
Expand All @@ -59,3 +59,18 @@ index 19f8849b..618492a4 100644
RAPIDJSON_PARSE_ERROR(kParseErrorTermination, is.Tell());
}
else
diff --git a/include/rapidjson/stream.h b/include/rapidjson/stream.h
index fef82c25..cd51ccd3 100644
--- a/include/rapidjson/stream.h
+++ b/include/rapidjson/stream.h
@@ -147,8 +147,8 @@ struct GenericInsituStringStream {
GenericInsituStringStream(Ch *src) : src_(src), dst_(0), head_(src) {}

// Read
- Ch Peek() { return *src_; }
- Ch Take() { return *src_++; }
+ Ch Peek() { return *src_ ? *src_ : '}'; }
+ Ch Take() { return *src_ ? *src_++ : '}'; }
size_t Tell() { return static_cast<size_t>(src_ - head_); }

// Write
11 changes: 11 additions & 0 deletions ecosystem/patches/rapidxml.patch
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
--- rapidxml.hpp
+++ rapidxml.hpp
@@ -2205,6 +2205,8 @@
}
// Skip remaining whitespace after node name
skip<whitespace_pred, Flags>(text);
+ if (*text == Ch('\0'))
+ return; // treat it as '>' without increment of text
if (*text != Ch('>'))
RAPIDXML_PARSE_ERROR("expected >", text);
++text; // Skip '>'
37 changes: 26 additions & 11 deletions ecosystem/simple_dom.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
#include <stdlib.h>
#include <vector>
#include <algorithm>
#include <memory>
#include <photon/common/alog.h>
#include <photon/common/alog-stdstring.h>
#include <photon/common/utility.h>
Expand Down Expand Up @@ -181,11 +182,13 @@ struct JHandler : public BaseReaderHandler<UTF8<>, JHandler> {
_root = new JNode(text, text_ownership);
}
~JHandler() {
delete _root;
}
JNode* get_root() {
assert(_nodes.size() == 1);
assert(_nodes.front().size() == 1);
_root->set_children(std::move(_nodes.front().front()._children));
}
JNode* get_root() {
DEFER(_root = nullptr);
return _root;
}
void emplace_back(const char* s, size_t length, uint8_t type) {
Expand Down Expand Up @@ -248,9 +251,18 @@ struct JHandler : public BaseReaderHandler<UTF8<>, JHandler> {
}
};

// As some parsers don't support text length, they only support null
// terminated strings, so we have to convert the last trailer to '\0',
// while making the parser to treat it as the trailer.
inline void fix_trail(char* text, size_t size, char trailer) {
auto i = estring_view(text, size).rfind(trailer);
if (i != estring_view::npos) text[i] = '\0';
}

static NodeImpl* parse_json(char* text, size_t size, int flags) {
const auto kFlags = kParseNumbersAsStringsFlag | kParseBoolsAsStringFlag |
kParseInsituFlag | kParseCommentsFlag | kParseTrailingCommasFlag;
fix_trail(text, size, '}');
JHandler h(text, flags & DOC_FREE_TEXT_ON_DESTRUCTION);
using Encoding = UTF8<>;
GenericInsituStringStream<Encoding> s(text);
Expand Down Expand Up @@ -299,12 +311,13 @@ class XMLNode : public DocNode<XMLNode> {
};

static NodeImpl* parse_xml(char* text, size_t size, int flags) {
fix_trail(text, size, '>');
xml_document<char> doc;
doc.parse<0>(text);
auto root = new XMLNode(text, flags & DOC_FREE_TEXT_ON_DESTRUCTION);
auto root = make_unique<XMLNode>(text, flags & DOC_FREE_TEXT_ON_DESTRUCTION);
assert(root);
root->build(&doc);
return root;
return root.release();
}

class YAMLNode : public DocNode<YAMLNode> {
Expand All @@ -330,10 +343,10 @@ class YAMLNode : public DocNode<YAMLNode> {

static NodeImpl* parse_yaml(char* text, size_t size, int flags) {
auto yaml = ryml::parse_in_place({text, size});
auto root = new YAMLNode(text, flags & DOC_FREE_TEXT_ON_DESTRUCTION);
auto root = make_unique<YAMLNode>(text, flags & DOC_FREE_TEXT_ON_DESTRUCTION);
assert(root);
root->build(yaml.rootref());
return root;
return root.release();
}

class IniNode : public DocNode<IniNode> {
Expand Down Expand Up @@ -394,24 +407,24 @@ static NodeImpl* parse_ini(char* text, size_t size, int flags) {
sort(ctx.begin(), ctx.end());
vector<IniNode> sections, nodes;
estring_view prev_sect;
auto root = new IniNode(text, flags & DOC_FREE_TEXT_ON_DESTRUCTION);
auto root = make_unique<IniNode>(text, flags & DOC_FREE_TEXT_ON_DESTRUCTION);
for (auto& x : ctx) {
if (prev_sect != x.section) {
prev_sect = x.section;
if (!nodes.empty() && !sections.empty()) {
sections.back().set_children(std::move(nodes));
assert(nodes.empty());
}
sections.emplace_back(x.section, str{}, root);
sections.emplace_back(x.section, str{}, root.get());
}
nodes.emplace_back(x.key, x.val, root);
nodes.emplace_back(x.key, x.val, root.get());
}
if (!sections.empty()) {
if (!nodes.empty())
sections.back().set_children(std::move(nodes));
root->set_children(std::move(sections));
}
return root;
return root.release();
}

Node parse(char* text, size_t size, int flags) {
Expand All @@ -425,7 +438,9 @@ Node parse(char* text, size_t size, int flags) {
if (flags & DOC_FREE_TEXT_IF_PARSING_FAILED) free(text);
LOG_ERROR_RETURN(EINVAL, nullptr, "invalid document type ", HEX(i));
}
auto r = parsers[i](text, size, flags);
NodeImpl* r = nullptr;
try { r = parsers[i](text, size, flags); }
catch(...) { LOG_ERROR("parsing failed and exception caught"); }
if (!r && (flags & DOC_FREE_TEXT_IF_PARSING_FAILED)) free(text);
return r;
}
Expand Down
5 changes: 5 additions & 0 deletions ecosystem/simple_dom.h
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,11 @@ class Node {
double to_double(double def_val = NAN) const {
return value().to_double(def_val);
}
// Interprets the node's value as a boolean: the result is true only when the
// value is non-empty and its first character is 't' or 'T'.
// Asserts that the node's type is TYPE::BOOLEAN.
bool to_bool() const {
    assert(type() == TYPE::BOOLEAN);
    auto text = value();
    if (text.size() == 0) return false;
    return text[0] == 't' || text[0] == 'T';
}
using TYPE = NodeImpl::TYPE;

bool operator==(str rhs) const { return value() == rhs; }
Expand Down
16 changes: 8 additions & 8 deletions ecosystem/test/test_simple_dom.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ using namespace std;
using namespace photon::SimpleDOM;

// OSS list response
const static char xml[] = R"(
static char xml[] = R"(
<?xml version="1.0" encoding="UTF-8"?>
<ListBucketResult category = "flowers">
<Name>examplebucket</Name>
Expand Down Expand Up @@ -89,7 +89,7 @@ void print_all2(Node node) {

static __attribute__((noinline))
int do_list_object(string_view prefix, ObjectList& result, string* marker) {
auto doc = parse_copy(xml, sizeof(xml), DOC_XML);
auto doc = parse(xml, sizeof(xml)-1, DOC_XML);
EXPECT_TRUE(doc);
auto list_bucket_result = doc["ListBucketResult"];
auto attr = list_bucket_result.get_attributes();
Expand Down Expand Up @@ -190,7 +190,7 @@ void expect_types(Node node, const std::pair<const char*, uint8_t> (&truth)[N])
}

TEST(simple_dom, json) {
const static char json0[] = R"({
static char json0[] = R"({
"hello": "world",
"t": true ,
"f": false,
Expand All @@ -199,7 +199,7 @@ TEST(simple_dom, json) {
"pi": 3.1416,
"a": [1, 2, 3, 4],
})";
auto doc = parse_copy(json0, sizeof(json0), DOC_JSON);
auto doc = parse(json0, sizeof(json0)-1, DOC_JSON);
EXPECT_TRUE(doc);
expect_eq_kvs(doc, {
{"hello", "world"},
Expand All @@ -220,7 +220,7 @@ TEST(simple_dom, json) {

TEST(simple_dom, yaml0) {
static char yaml0[] = "{foo: 1, bar: [2, 3], john: doe}";
auto doc = parse(yaml0, sizeof(yaml0), DOC_YAML);
auto doc = parse(yaml0, sizeof(yaml0)-1, DOC_YAML);
EXPECT_TRUE(doc);
expect_eq_kvs(doc, {{"foo", "1"}, {"john", "doe"}});
expect_eq_vals(doc["bar"], {"2", "3"});
Expand All @@ -245,7 +245,7 @@ newmap: {}
newmap (serialized): {}
I am something: indeed
)";
auto doc = parse(yaml1, sizeof(yaml1), DOC_YAML);
auto doc = parse(yaml1, sizeof(yaml1)-1, DOC_YAML);
EXPECT_TRUE(doc);
expect_eq_kvs(doc, {
{"foo", "says who"},
Expand All @@ -259,7 +259,7 @@ I am something: indeed
"oh so nice", "oh so nice (serialized)"});
}

const static char example_ini[] = R"(
static char example_ini[] = R"(
[protocol] ; Protocol configuration
version=6 ; IPv6

Expand Down Expand Up @@ -306,7 +306,7 @@ funny4 : two : colons
)";

TEST(simple_dom, ini) {
auto doc = parse_copy(example_ini, sizeof(example_ini) - 1, DOC_INI);
auto doc = parse(example_ini, sizeof(example_ini)-1, DOC_INI);
EXPECT_TRUE(doc);
EXPECT_EQ(doc.num_children(), 6);
expect_eq_kvs(doc["protocol"], {
Expand Down
Loading