Orcus
sax_parser.hpp
1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2 /*
3  * This Source Code Form is subject to the terms of the Mozilla Public
4  * License, v. 2.0. If a copy of the MPL was not distributed with this
5  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
6  */
7 
8 #ifndef ORCUS_SAX_PARSER_HPP
9 #define ORCUS_SAX_PARSER_HPP
10 
11 #include "sax_parser_base.hpp"
12 
13 namespace orcus {
14 
/**
 * Default configuration type for sax_parser.
 */
struct sax_parser_default_config
{
    /**
     * Baseline XML version the parser enforces, encoded as an integer:
     * 10 stands for XML 1.0 and 11 for XML 1.1.  sax_parser::header()
     * insists on a leading '<?xml ...?>' declaration only when this value
     * is 11 or higher.
     */
    static const uint8_t baseline_version = 10;
};
24 
26 {
27 public:
34  {
35  (void)param;
36  }
37 
46  {
47  (void)decl;
48  }
49 
55  void end_declaration(const orcus::pstring& decl)
56  {
57  (void)decl;
58  }
59 
66  {
67  (void)elem;
68  }
69 
76  {
77  (void)elem;
78  }
79 
94  void characters(const orcus::pstring& val, bool transient)
95  {
96  (void)val; (void)transient;
97  }
98 
108  {
109  (void)attr;
110  }
111 };
112 
117 template<typename _Handler, typename _Config = sax_parser_default_config>
119 {
120 public:
121  typedef _Handler handler_type;
122  typedef _Config config_type;
123 
124  sax_parser(const char* content, const size_t size, handler_type& handler);
125  sax_parser(const char* content, const size_t size, bool transient_stream, handler_type& handler);
126  ~sax_parser();
127 
128  void parse();
129 
130 private:
131 
136  void header();
137  void body();
138  void element();
139  void element_open(std::ptrdiff_t begin_pos);
140  void element_close(std::ptrdiff_t begin_pos);
141  void special_tag();
142  void declaration(const char* name_check);
143  void cdata();
144  void doctype();
145  void characters();
146  void attribute();
147 
148 private:
149  handler_type& m_handler;
150 };
151 
152 template<typename _Handler, typename _Config>
154  const char* content, const size_t size, handler_type& handler) :
155  sax::parser_base(content, size, false),
156  m_handler(handler)
157 {
158 }
159 
160 template<typename _Handler, typename _Config>
161 sax_parser<_Handler,_Config>::sax_parser(
162  const char* content, const size_t size, bool transient_stream, handler_type& handler) :
163  sax::parser_base(content, size, transient_stream),
164  m_handler(handler)
165 {
166 }
167 
168 template<typename _Handler, typename _Config>
169 sax_parser<_Handler,_Config>::~sax_parser()
170 {
171 }
172 
173 template<typename _Handler, typename _Config>
174 void sax_parser<_Handler,_Config>::parse()
175 {
176  m_nest_level = 0;
177  mp_char = mp_begin;
178  header();
179  skip_space_and_control();
180  body();
181 
182  assert(m_buffer_pos == 0);
183 }
184 
185 template<typename _Handler, typename _Config>
186 void sax_parser<_Handler,_Config>::header()
187 {
188  // we don't handle multi byte encodings so we can just skip bom entry if exists.
189  skip_bom();
190  skip_space_and_control();
191  if (!has_char() || cur_char() != '<')
192  throw sax::malformed_xml_error("xml file must begin with '<'.", offset());
193 
194  if (config_type::baseline_version >= 11)
195  {
196  // XML version 1.1 requires a header declaration whereas in 1.0 it's
197  // optional.
198  if (next_char_checked() != '?')
199  throw sax::malformed_xml_error("xml file must begin with '<?'.", offset());
200 
201  declaration("xml");
202  }
203 }
204 
205 template<typename _Handler, typename _Config>
206 void sax_parser<_Handler,_Config>::body()
207 {
208  while (has_char())
209  {
210  if (cur_char() == '<')
211  {
212  element();
213  if (!m_root_elem_open)
214  // Root element closed. Stop parsing.
215  return;
216  }
217  else if (m_nest_level)
218  // Call characters only when in xml hierarchy.
219  characters();
220  else
221  next();
222  }
223 }
224 
225 template<typename _Handler, typename _Config>
226 void sax_parser<_Handler,_Config>::element()
227 {
228  assert(cur_char() == '<');
229  std::ptrdiff_t pos = offset();
230  char c = next_char_checked();
231  switch (c)
232  {
233  case '/':
234  element_close(pos);
235  break;
236  case '!':
237  special_tag();
238  break;
239  case '?':
240  declaration(nullptr);
241  break;
242  default:
243  if (!is_alpha(c) && c != '_')
244  throw sax::malformed_xml_error("expected an alphabet.", offset());
245  element_open(pos);
246  }
247 }
248 
249 template<typename _Handler, typename _Config>
250 void sax_parser<_Handler,_Config>::element_open(std::ptrdiff_t begin_pos)
251 {
252  assert(is_alpha(cur_char()) || cur_char() == '_');
253 
254  sax::parser_element elem;
255  element_name(elem, begin_pos);
256 
257  while (true)
258  {
259  skip_space_and_control();
260  char c = cur_char();
261  if (c == '/')
262  {
263  // Self-closing element: <element/>
264  if (next_and_char() != '>')
265  throw sax::malformed_xml_error("expected '/>' to self-close the element.", offset());
266  next();
267  elem.end_pos = offset();
268  m_handler.start_element(elem);
269  reset_buffer_pos();
270  m_handler.end_element(elem);
271  if (!m_nest_level)
272  m_root_elem_open = false;
273 #if ORCUS_DEBUG_SAX_PARSER
274  cout << "element_open: ns='" << elem.ns << "', name='" << elem.name << "' (self-closing)" << endl;
275 #endif
276  return;
277  }
278  else if (c == '>')
279  {
280  // End of opening element: <element>
281  next();
282  elem.end_pos = offset();
283  nest_up();
284  m_handler.start_element(elem);
285  reset_buffer_pos();
286 #if ORCUS_DEBUG_SAX_PARSER
287  cout << "element_open: ns='" << elem.ns << "', name='" << elem.name << "'" << endl;
288 #endif
289  return;
290  }
291  else
292  attribute();
293  }
294 }
295 
296 template<typename _Handler, typename _Config>
297 void sax_parser<_Handler,_Config>::element_close(std::ptrdiff_t begin_pos)
298 {
299  assert(cur_char() == '/');
300  nest_down();
301  next_check();
302  sax::parser_element elem;
303  element_name(elem, begin_pos);
304 
305  if (cur_char() != '>')
306  throw sax::malformed_xml_error("expected '>' to close the element.", offset());
307  next();
308  elem.end_pos = offset();
309 
310  m_handler.end_element(elem);
311 #if ORCUS_DEBUG_SAX_PARSER
312  cout << "element_close: ns='" << elem.ns << "', name='" << elem.name << "'" << endl;
313 #endif
314  if (!m_nest_level)
315  m_root_elem_open = false;
316 }
317 
318 template<typename _Handler, typename _Config>
319 void sax_parser<_Handler,_Config>::special_tag()
320 {
321  assert(cur_char() == '!');
322  // This can be either <![CDATA, <!--, or <!DOCTYPE.
323  size_t len = remains();
324  if (len < 2)
325  throw sax::malformed_xml_error("special tag too short.", offset());
326 
327  switch (next_and_char())
328  {
329  case '-':
330  {
331  // Possibly comment.
332  if (next_and_char() != '-')
333  throw sax::malformed_xml_error("comment expected.", offset());
334 
335  len -= 2;
336  if (len < 3)
337  throw sax::malformed_xml_error("malformed comment.", offset());
338 
339  next();
340  comment();
341  }
342  break;
343  case '[':
344  {
345  // Possibly a CDATA.
346  expects_next("CDATA[", 6);
347  if (has_char())
348  cdata();
349  }
350  break;
351  case 'D':
352  {
353  // check if this is a DOCTYPE.
354  expects_next("OCTYPE", 6);
355  skip_space_and_control();
356  if (has_char())
357  doctype();
358  }
359  break;
360  default:
361  throw sax::malformed_xml_error("failed to parse special tag.", offset());
362  }
363 }
364 
365 template<typename _Handler, typename _Config>
366 void sax_parser<_Handler,_Config>::declaration(const char* name_check)
367 {
368  assert(cur_char() == '?');
369  next_check();
370 
371  // Get the declaration name first.
372  pstring decl_name;
373  name(decl_name);
374 #if ORCUS_DEBUG_SAX_PARSER
375  cout << "sax_parser::declaration: start name='" << decl_name << "'" << endl;
376 #endif
377 
378  if (name_check && decl_name != name_check)
379  {
380  std::ostringstream os;
381  os << "declaration name of '" << name_check << "' was expected, but '" << decl_name << "' was found instead.";
382  throw sax::malformed_xml_error(os.str(), offset());
383  }
384 
385  m_handler.start_declaration(decl_name);
386  skip_space_and_control();
387 
388  // Parse the attributes.
389  while (cur_char_checked() != '?')
390  {
391  attribute();
392  skip_space_and_control();
393  }
394  if (next_char_checked() != '>')
395  throw sax::malformed_xml_error("declaration must end with '?>'.", offset());
396 
397  m_handler.end_declaration(decl_name);
398  reset_buffer_pos();
399  next();
400 #if ORCUS_DEBUG_SAX_PARSER
401  cout << "sax_parser::declaration: end name='" << decl_name << "'" << endl;
402 #endif
403 }
404 
405 template<typename _Handler, typename _Config>
406 void sax_parser<_Handler,_Config>::cdata()
407 {
408  size_t len = remains();
409  assert(len > 3);
410 
411  // Parse until we reach ']]>'.
412  const char* p0 = mp_char;
413  size_t i = 0, match = 0;
414  for (char c = cur_char(); i < len; ++i, c = next_and_char())
415  {
416  if (c == ']')
417  {
418  // Be aware that we may encounter a series of more than two ']'
419  // characters, in which case we'll only count the last two.
420 
421  if (match == 0)
422  // First ']'
423  ++match;
424  else if (match == 1)
425  // Second ']'
426  ++match;
427  }
428  else if (c == '>' && match == 2)
429  {
430  // Found ']]>'.
431  size_t cdata_len = i - 2;
432  m_handler.characters(pstring(p0, cdata_len), transient_stream());
433  next();
434  return;
435  }
436  else
437  match = 0;
438  }
439  throw sax::malformed_xml_error("malformed CDATA section.", offset());
440 }
441 
442 template<typename _Handler, typename _Config>
443 void sax_parser<_Handler,_Config>::doctype()
444 {
445  // Parse the root element first.
446  sax::doctype_declaration param;
447  name(param.root_element);
448  skip_space_and_control();
449 
450  // Either PUBLIC or SYSTEM.
451  size_t len = remains();
452  if (len < 6)
453  throw sax::malformed_xml_error("DOCTYPE section too short.", offset());
454 
455  param.keyword = sax::doctype_declaration::keyword_type::dtd_private;
456  char c = cur_char();
457  if (c == 'P')
458  {
459  if (next_and_char() != 'U' || next_and_char() != 'B' || next_and_char() != 'L' || next_and_char() != 'I' || next_and_char() != 'C')
460  throw sax::malformed_xml_error("malformed DOCTYPE section.", offset());
461 
462  param.keyword = sax::doctype_declaration::keyword_type::dtd_public;
463  }
464  else if (c == 'S')
465  {
466  if (next_and_char() != 'Y' || next_and_char() != 'S' || next_and_char() != 'T' || next_and_char() != 'E' || next_and_char() != 'M')
467  throw sax::malformed_xml_error("malformed DOCTYPE section.", offset());
468  }
469 
470  next_check();
471  skip_space_and_control();
472  has_char_throw("DOCTYPE section too short.");
473 
474  // Parse FPI.
475  value(param.fpi, false);
476 
477  has_char_throw("DOCTYPE section too short.");
478  skip_space_and_control();
479  has_char_throw("DOCTYPE section too short.");
480 
481  if (cur_char() == '>')
482  {
483  // Optional URI not given. Exit.
484 #if ORCUS_DEBUG_SAX_PARSER
485  cout << "sax_parser::doctype: root='" << param.root_element << "', fpi='" << param.fpi << "'" << endl;
486 #endif
487  m_handler.doctype(param);
488  next();
489  return;
490  }
491 
492  // Parse optional URI.
493  value(param.uri, false);
494 
495  has_char_throw("DOCTYPE section too short.");
496  skip_space_and_control();
497  has_char_throw("DOCTYPE section too short.");
498 
499  if (cur_char() != '>')
500  throw sax::malformed_xml_error("malformed DOCTYPE section - closing '>' expected but not found.", offset());
501 
502 #if ORCUS_DEBUG_SAX_PARSER
503  cout << "sax_parser::doctype: root='" << param.root_element << "', fpi='" << param.fpi << "' uri='" << param.uri << "'" << endl;
504 #endif
505  m_handler.doctype(param);
506  next();
507 }
508 
509 template<typename _Handler, typename _Config>
510 void sax_parser<_Handler,_Config>::characters()
511 {
512  const char* p0 = mp_char;
513  for (; has_char(); next())
514  {
515  if (cur_char() == '<')
516  break;
517 
518  if (cur_char() == '&')
519  {
520  // Text span with one or more encoded characters. Parse using cell buffer.
521  cell_buffer& buf = get_cell_buffer();
522  buf.reset();
523  buf.append(p0, mp_char-p0);
524  characters_with_encoded_char(buf);
525  if (buf.empty())
526  m_handler.characters(pstring(), transient_stream());
527  else
528  m_handler.characters(pstring(buf.get(), buf.size()), true);
529  return;
530  }
531  }
532 
533  if (mp_char > p0)
534  {
535  pstring val(p0, mp_char-p0);
536  m_handler.characters(val, transient_stream());
537  }
538 }
539 
540 template<typename _Handler, typename _Config>
541 void sax_parser<_Handler,_Config>::attribute()
542 {
543  sax::parser_attribute attr;
544  pstring attr_ns_name, attr_name, attr_value;
545  attribute_name(attr.ns, attr.name);
546 
547 #if ORCUS_DEBUG_SAX_PARSER
548  cout << "sax_parser::attribute: ns='" << attr.ns << "', name='" << attr.name << "'" << endl;
549 #endif
550 
551  skip_space_and_control();
552 
553  char c = cur_char();
554  if (c != '=')
555  {
556  std::ostringstream os;
557  os << "Attribute must begin with 'name=..'. (ns='" << attr.ns << "', name='" << attr.name << "')";
558  throw sax::malformed_xml_error(os.str(), offset());
559  }
560 
561  next_check(); // skip the '='.
562  skip_space_and_control();
563 
564  attr.transient = value(attr.value, true);
565  if (attr.transient)
566  // Value is stored in a temporary buffer. Push a new buffer.
567  inc_buffer_pos();
568 
569 #if ORCUS_DEBUG_SAX_PARSER
570  cout << "sax_parser::attribute: value='" << attr.value << "'" << endl;
571 #endif
572 
573  m_handler.attribute(attr);
574 }
575 
576 }
577 
578 #endif
579 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */
Definition: pstring.hpp:27
Definition: sax_parser.hpp:15
void start_declaration(const orcus::pstring &decl)
Definition: sax_parser.hpp:45
void end_element(const orcus::sax::parser_element &elem)
Definition: sax_parser.hpp:75
static const uint8_t baseline_version
Definition: sax_parser.hpp:22
void attribute(const orcus::sax::parser_attribute &attr)
Definition: sax_parser.hpp:107
Definition: sax_parser_base.hpp:100
Definition: sax_parser.hpp:25
void end_declaration(const orcus::pstring &decl)
Definition: sax_parser.hpp:55
Definition: parser_base.hpp:40
void start_element(const orcus::sax::parser_element &elem)
Definition: sax_parser.hpp:65
void doctype(const orcus::sax::doctype_declaration &param)
Definition: sax_parser.hpp:33
Definition: sax_parser_base.hpp:85
Definition: sax_parser_base.hpp:45
Definition: base64.hpp:15
Definition: sax_parser.hpp:118
Definition: sax_parser_base.hpp:108
void characters(const orcus::pstring &val, bool transient)
Definition: sax_parser.hpp:94