Skip to content

Commit 50100b5

Browse files
Merge pull request #73 from esl/cdata
Add CDATA escaping to exml
2 parents 79e0f21 + 70328df commit 50100b5

8 files changed

+81
-43
lines changed

c_src/exml.cpp

+29-8
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,8 @@ namespace {
6868
ERL_NIF_TERM atom_xmlstreamstart;
6969
ERL_NIF_TERM atom_xmlstreamend;
7070
ERL_NIF_TERM atom_pretty;
71+
ERL_NIF_TERM atom_escaped;
72+
ERL_NIF_TERM atom_cdata;
7173
ERL_NIF_TERM atom_true;
7274
constexpr const unsigned char EMPTY[1] = {0};
7375

@@ -154,8 +156,9 @@ ERL_NIF_TERM make_attr_tuple(ParseCtx &ctx,
154156

155157
ERL_NIF_TERM get_xmlcdata(ParseCtx &ctx,
156158
rapidxml::xml_node<unsigned char> *node) {
157-
return enif_make_tuple2(ctx.env, atom_xmlcdata,
158-
to_subbinary(ctx, node->value(), node->value_size()));
159+
return enif_make_tuple3(ctx.env, atom_xmlcdata,
160+
to_subbinary(ctx, node->value(), node->value_size()),
161+
atom_escaped);
159162
}
160163

161164
ERL_NIF_TERM merge_data_nodes(ParseCtx &ctx,
@@ -170,7 +173,7 @@ ERL_NIF_TERM merge_data_nodes(ParseCtx &ctx,
170173
node = node->next_sibling();
171174
}
172175

173-
return enif_make_tuple2(ctx.env, atom_xmlcdata, bin);
176+
return enif_make_tuple3(ctx.env, atom_xmlcdata, bin, atom_escaped);
174177
}
175178

176179
void append_pending_data_nodes(ParseCtx &ctx,
@@ -299,7 +302,15 @@ bool build_cdata(ErlNifEnv *env, xml_document &doc, const ERL_NIF_TERM elem[],
299302
if (!enif_inspect_iolist_as_binary(env, elem[1], &bin))
300303
return false;
301304

302-
auto child = doc.impl.allocate_node(rapidxml::node_data);
305+
rapidxml::node_type cdata_type;
306+
if (enif_compare(atom_escaped, elem[2]) == 0)
307+
cdata_type = rapidxml::node_data;
308+
else if (enif_compare(atom_cdata, elem[2]) == 0)
309+
cdata_type = rapidxml::node_cdata;
310+
else
311+
return false;
312+
313+
auto child = doc.impl.allocate_node(cdata_type);
303314
child->value(bin.size > 0 ? bin.data : EMPTY, bin.size);
304315
node.append_node(child);
305316
return true;
@@ -336,7 +347,7 @@ bool build_attrs(ErlNifEnv *env, xml_document &doc, ERL_NIF_TERM attrs,
336347
bool build_el(ErlNifEnv *env, xml_document &doc, const ERL_NIF_TERM elem[],
337348
rapidxml::xml_node<unsigned char> &node) {
338349
ErlNifBinary name;
339-
if (!enif_inspect_iolist_as_binary(env, elem[1], &name))
350+
if (!enif_inspect_binary(env, elem[1], &name))
340351
return false;
341352

342353
auto child = doc.impl.allocate_node(rapidxml::node_element);
@@ -358,7 +369,7 @@ bool build_child(ErlNifEnv *env, xml_document &doc, ERL_NIF_TERM child,
358369
if (!enif_get_tuple(env, child, &arity, &tuple))
359370
return false;
360371

361-
if (arity == 2 && enif_compare(atom_xmlcdata, tuple[0]) == 0) {
372+
if (arity == 3 && enif_compare(atom_xmlcdata, tuple[0]) == 0) {
362373
if (!build_cdata(env, doc, tuple, node))
363374
return false;
364375
} else if (arity == 4 && enif_compare(atom_xmlel, tuple[0]) == 0) {
@@ -443,6 +454,8 @@ static int load(ErlNifEnv *env, void **priv_data, ERL_NIF_TERM load_info) {
443454
atom_xmlstreamstart = enif_make_atom(env, "xmlstreamstart");
444455
atom_xmlstreamend = enif_make_atom(env, "xmlstreamend");
445456
atom_pretty = enif_make_atom(env, "pretty");
457+
atom_escaped = enif_make_atom(env, "escaped");
458+
atom_cdata = enif_make_atom(env, "cdata");
446459
atom_true = enif_make_atom(env, "true");
447460

448461
get_static_doc().impl.set_allocator(enif_alloc, enif_free);
@@ -608,7 +621,15 @@ static ERL_NIF_TERM escape_cdata(ErlNifEnv *env, int argc,
608621
if (!enif_inspect_iolist_as_binary(env, argv[0], &bin))
609622
return enif_make_badarg(env);
610623

611-
rapidxml::xml_node<unsigned char> node(rapidxml::node_data);
624+
rapidxml::node_type cdata_type;
625+
if (enif_compare(atom_escaped, argv[1]) == 0)
626+
cdata_type = rapidxml::node_data;
627+
else if (enif_compare(atom_cdata, argv[1]) == 0)
628+
cdata_type = rapidxml::node_cdata;
629+
else
630+
return enif_make_badarg(env);
631+
632+
rapidxml::xml_node<unsigned char> node(cdata_type);
612633
node.value(bin.data, bin.size);
613634
return node_to_binary(env, node, rapidxml::print_no_indenting);
614635
}
@@ -647,7 +668,7 @@ static ERL_NIF_TERM reset_parser(ErlNifEnv *env, int argc,
647668

648669
static ErlNifFunc nif_funcs[] = {
649670
{"create", 2, create}, {"parse", 1, parse},
650-
{"parse_next", 2, parse_next}, {"escape_cdata", 1, escape_cdata},
671+
{"parse_next", 2, parse_next}, {"escape_cdata", 2, escape_cdata},
651672
{"to_binary", 2, to_binary}, {"reset_parser", 1, reset_parser}};
652673
}
653674

c_src/rapidxml_print.hpp

+10-10
Original file line numberDiff line numberDiff line change
@@ -151,7 +151,7 @@ namespace rapidxml
151151
{
152152
assert(node->type() == node_data);
153153
if (!(flags & print_no_indenting))
154-
out = fill_chars(out, indent, Ch('\t'));
154+
out = fill_chars(out, indent, Ch(' '));
155155
out = copy_and_expand_chars(node->value(), node->value() + node->value_size(), Ch(0), out);
156156
return out;
157157
}
@@ -162,7 +162,7 @@ namespace rapidxml
162162
{
163163
assert(node->type() == node_cdata);
164164
if (!(flags & print_no_indenting))
165-
out = fill_chars(out, indent, Ch('\t'));
165+
out = fill_chars(out, indent, Ch(' '));
166166
*out = Ch('<'); ++out;
167167
*out = Ch('!'); ++out;
168168
*out = Ch('['); ++out;
@@ -187,7 +187,7 @@ namespace rapidxml
187187

188188
// Print element name and attributes, if any
189189
if (!(flags & print_no_indenting))
190-
out = fill_chars(out, indent, Ch('\t'));
190+
out = fill_chars(out, indent, Ch(' '));
191191
*out = Ch('<'), ++out;
192192
out = copy_chars(node->name(), node->name() + node->name_size(), out);
193193
out = print_attributes(out, node, flags);
@@ -221,9 +221,9 @@ namespace rapidxml
221221
// Print all children with full indenting
222222
if (!(flags & print_no_indenting))
223223
*out = Ch('\n'), ++out;
224-
out = print_children(out, node, flags, indent + 1);
224+
out = print_children(out, node, flags, indent + 2);
225225
if (!(flags & print_no_indenting))
226-
out = fill_chars(out, indent, Ch('\t'));
226+
out = fill_chars(out, indent, Ch(' '));
227227
}
228228

229229
// Print node end
@@ -241,7 +241,7 @@ namespace rapidxml
241241
{
242242
// Print declaration start
243243
if (!(flags & print_no_indenting))
244-
out = fill_chars(out, indent, Ch('\t'));
244+
out = fill_chars(out, indent, Ch(' '));
245245
*out = Ch('<'), ++out;
246246
*out = Ch('?'), ++out;
247247
*out = Ch('x'), ++out;
@@ -264,7 +264,7 @@ namespace rapidxml
264264
{
265265
assert(node->type() == node_comment);
266266
if (!(flags & print_no_indenting))
267-
out = fill_chars(out, indent, Ch('\t'));
267+
out = fill_chars(out, indent, Ch(' '));
268268
*out = Ch('<'), ++out;
269269
*out = Ch('!'), ++out;
270270
*out = Ch('-'), ++out;
@@ -282,7 +282,7 @@ namespace rapidxml
282282
{
283283
assert(node->type() == node_doctype);
284284
if (!(flags & print_no_indenting))
285-
out = fill_chars(out, indent, Ch('\t'));
285+
out = fill_chars(out, indent, Ch(' '));
286286
*out = Ch('<'), ++out;
287287
*out = Ch('!'), ++out;
288288
*out = Ch('D'), ++out;
@@ -304,7 +304,7 @@ namespace rapidxml
304304
{
305305
assert(node->type() == node_pi);
306306
if (!(flags & print_no_indenting))
307-
out = fill_chars(out, indent, Ch('\t'));
307+
out = fill_chars(out, indent, Ch(' '));
308308
*out = Ch('<'), ++out;
309309
*out = Ch('?'), ++out;
310310
out = copy_chars(node->name(), node->name() + node->name_size(), out);
@@ -321,7 +321,7 @@ namespace rapidxml
321321
{
322322
assert(node->type() == node_literal);
323323
if (!(flags & print_no_indenting))
324-
out = fill_chars(out, indent, Ch('\t'));
324+
out = fill_chars(out, indent, Ch(' '));
325325
out = copy_chars(node->value(), node->value() + node->value_size(), out);
326326
return out;
327327
}

include/exml.hrl

+2-1
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,8 @@
66
-ifndef(EXML_HEADER).
77
-define(EXML_HEADER, true).
88

9-
-record(xmlcdata, {content = [] :: iodata()}).
9+
-record(xmlcdata, {content = [] :: iodata(),
10+
style = escaped :: escaped | cdata}).
1011

1112
-record(xmlel, {name :: binary(),
1213
attrs = [] :: [exml:attr()],

rebar.config

+4-2
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,9 @@
5555
{doc, #{provider => ex_doc}}
5656
]}.
5757
{ex_doc, [
58-
{extras, [<<"README.md">>, <<"LICENSE">>]},
58+
{source_url, <<"https://github.com/esl/exml">>},
5959
{main, <<"readme">>},
60-
{source_url, <<"https://github.com/esl/exml">>}
60+
{extras, [{'README.md', #{title => <<"README">>}},
61+
{'LICENSE', #{title => <<"License">>}}
62+
]}
6163
]}.

src/exml.erl

+14-6
Original file line numberDiff line numberDiff line change
@@ -27,18 +27,26 @@
2727

2828
-type attr() :: {binary(), binary()}.
2929
-type cdata() :: #xmlcdata{}.
30+
%% CDATA record. When printed, its content is escaped character-wise by default.
31+
%%
32+
%% Escaping rules:
33+
%% <ul>
34+
%% <li>`escaped': escapes each special character using regular `&' entity escaping.</li>
35+
%% <li>`cdata': wraps the entire string into a `<![CDATA[]]>' section.</li>
36+
%% </ul>
3037
-type element() :: #xmlel{}.
3138
-type item() :: element() | attr() | cdata() | exml_stream:start() | exml_stream:stop().
3239
-type prettify() :: pretty | not_pretty.
40+
%% Printing indentation rule, see `to_iolist/2'.
3341

3442
%% @doc Calculate the length of the original XML payload
3543
-spec xml_size(item() | [item()]) -> non_neg_integer().
3644
xml_size([]) ->
3745
0;
3846
xml_size([Elem | Rest]) ->
3947
xml_size(Elem) + xml_size(Rest);
40-
xml_size(#xmlcdata{ content = Content }) ->
41-
iolist_size(exml_nif:escape_cdata(Content));
48+
xml_size(#xmlcdata{content = Content, style = Style}) ->
49+
iolist_size(exml_nif:escape_cdata(Content, Style));
4250
xml_size(#xmlel{ name = Name, attrs = Attrs, children = [] }) ->
4351
3 % Self-closing: </>
4452
+ byte_size(Name) + xml_size(Attrs);
@@ -56,7 +64,7 @@ xml_size({Key, Value}) when is_binary(Key) ->
5664
+ 4 % ="" and whitespace before
5765
+ byte_size(Value).
5866

59-
%% @doc Sort in ascending order a list of xml `t:item()'.
67+
%% @doc Sort in ascending order a list of xml `t:item/0'.
6068
%%
6169
%% Sorting is defined as calling `lists:sort/1' at:
6270
%% <ul>
@@ -109,7 +117,7 @@ to_iolist(Element) ->
109117
to_pretty_iolist(Element) ->
110118
to_iolist(Element, pretty).
111119

112-
%% @doc Parses a binary or a list of binaries into an XML `t:element()'.
120+
%% @doc Parses a binary or a list of binaries into an XML `t:element/0'.
113121
-spec parse(binary() | [binary()]) -> {ok, element()} | {error, any()}.
114122
parse(XML) ->
115123
exml_nif:parse(XML).
@@ -129,8 +137,8 @@ to_iolist(#xmlstreamstart{name = Name, attrs = Attrs}, _Pretty) ->
129137
[Front, $>];
130138
to_iolist(#xmlstreamend{name = Name}, _Pretty) ->
131139
[<<"</">>, Name, <<">">>];
132-
to_iolist(#xmlcdata{content = Content}, _Pretty) ->
133-
exml_nif:escape_cdata(Content);
140+
to_iolist(#xmlcdata{content = Content, style = Style}, _Pretty) ->
141+
exml_nif:escape_cdata(Content, Style);
134142
to_iolist([Element], Pretty) ->
135143
to_iolist(Element, Pretty);
136144
to_iolist([#xmlstreamstart{name = Name, attrs = Attrs} | Tail] = Elements, Pretty) ->

src/exml_nif.erl

+7-9
Original file line numberDiff line numberDiff line change
@@ -5,14 +5,12 @@
55

66
-module(exml_nif).
77

8-
-nifs([create/2, escape_cdata/1, to_binary/2, parse/1, parse_next/2, reset_parser/1]).
8+
-nifs([create/2, escape_cdata/2, to_binary/2, parse/1, parse_next/2, reset_parser/1]).
99

1010
-type parser() :: term().
11-
-type stream_element() :: exml:element() | exml_stream:start() | exml_stream:stop().
1211

13-
-export([create/2, parse/1, parse_next/2, escape_cdata/1,
12+
-export([create/2, parse/1, parse_next/2, escape_cdata/2,
1413
to_binary/2, reset_parser/1]).
15-
-export_type([parser/0, stream_element/0]).
1614

1715
-on_load(load/0).
1816

@@ -40,12 +38,12 @@ load() ->
4038
erlang:load_nif(filename:join(PrivDir, ?MODULE_STRING), none).
4139

4240
-spec create(MaxChildSize :: non_neg_integer(), InfiniteStream :: boolean()) ->
43-
{ok, parser()} | {error, Reason :: any()}.
41+
{ok, parser()} | {error, Reason :: any()}.
4442
create(_, _) ->
4543
erlang:nif_error(not_loaded).
4644

47-
-spec escape_cdata(Bin :: iodata()) -> binary().
48-
escape_cdata(_Bin) ->
45+
-spec escape_cdata(Bin :: iodata(), atom()) -> binary().
46+
escape_cdata(_Bin, _Style) ->
4947
erlang:nif_error(not_loaded).
5048

5149
-spec to_binary(Elem :: exml:element(), pretty | not_pretty) -> binary().
@@ -57,8 +55,8 @@ parse(_) ->
5755
erlang:nif_error(not_loaded).
5856

5957
-spec parse_next(parser(), Data :: binary() | [binary()]) ->
60-
{ok, stream_element() | undefined, non_neg_integer()} |
61-
{error, Reason :: any()}.
58+
{ok, exml_stream:element() | undefined, non_neg_integer()} |
59+
{error, Reason :: any()}.
6260
parse_next(_, _) ->
6361
erlang:nif_error(not_loaded).
6462

src/exml_stream.erl

+9-7
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@
2121
parser_opt/0]).
2222

2323
-record(parser, {
24-
event_parser :: exml_nif:parser(),
24+
event_parser :: term(),
2525
buffer :: [binary()]
2626
}).
2727

@@ -30,15 +30,15 @@
3030
-type stop() :: #xmlstreamend{}.
3131
%% `#xmlstreamend{}' record.
3232
-type parser() :: #parser{}.
33-
%% `#parser{}' record.
34-
-type element() :: exml_nif:stream_element().
35-
%% One of `t:start()', `t:stop()' or `t:exml:element()'.
33+
%% `#parser{}' record. Keeps track of unparsed buffers.
34+
-type element() :: exml:element() | exml_stream:start() | exml_stream:stop().
35+
%% One of `t:exml:element/0', `t:start/0', or `t:stop/0'.
3636

3737
-type parser_opt() :: {infinite_stream, boolean()} | {max_element_size, non_neg_integer()}.
3838
%% Parser options
3939
%%
4040
%% <ul>
41-
%% <li>`infinite_stream': No distinct `t:start()' or `t:stop()', only `#xmlel{}' will be returned.</li>
41+
%% <li>`infinite_stream': No distinct `t:start/0' or `t:stop/0', only `#xmlel{}' will be returned.</li>
4242
%% <li>`max_element_size': Specifies maximum byte size of any parsed XML element.
4343
%% The only exception is the "stream start" element,
4444
%% for which only the size of the opening tag is limited.</li>
@@ -53,7 +53,7 @@
5353
new_parser() ->
5454
new_parser([]).
5555

56-
%% @doc Creates a new parser
56+
%% @doc Creates a new parser. See `t:parser_opt/0' for configuration.
5757
-spec new_parser([parser_opt()]) -> {ok, parser()} | {error, any()}.
5858
new_parser(Opts)->
5959
MaxElementSize = proplists:get_value(max_element_size, Opts, 0),
@@ -65,7 +65,9 @@ new_parser(Opts)->
6565
Error
6666
end.
6767

68-
%% @doc Makes a parser parse input
68+
%% @doc Makes a parser parse input.
69+
%%
70+
%% If successful, returns parsed elements and a new parser with updated buffers.
6971
-spec parse(parser(), binary()) ->
7072
{ok, parser(), [exml_stream:element()]} | {error, Reason :: any()}.
7173
parse(Parser, Input) when is_binary(Input) ->

test/exml_tests.erl

+6
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,12 @@ size_of_escaped_characters_test() ->
2727
Raw = <<"<a>&amp;</a>">>,
2828
?assertEqual(iolist_size(Raw), exml:xml_size(parse(Raw))).
2929

30+
cdata_size_of_escaped_characters_test() ->
31+
Raw = <<"<a><![CDATA[some stuff]]></a>">>,
32+
CData = #xmlcdata{content = <<"some stuff">>, style = cdata},
33+
Final = #xmlel{name = <<"a">>, children = [CData]},
34+
?assertEqual(iolist_size(Raw), exml:xml_size(Final)).
35+
3036
size_of_exml_with_cdata_test() ->
3137
Raw = <<"<a><![CDATA[ Within this Character Data block I can
3238
use double dashes as much as I want (along with <, &, ', and \")]]></a>">>,

0 commit comments

Comments
 (0)