Orcus
csv_parser.hpp
1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2 /*
3  * This Source Code Form is subject to the terms of the Mozilla Public
4  * License, v. 2.0. If a copy of the MPL was not distributed with this
5  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
6  */
7 
8 #ifndef ORCUS_CSV_PARSER_HPP
9 #define ORCUS_CSV_PARSER_HPP
10 
11 #include "csv_parser_base.hpp"
12 
13 namespace orcus {
14 
15 template<typename _Handler>
17 {
18 public:
19  typedef _Handler handler_type;
20 
21  csv_parser(const char* p, size_t n, handler_type& hdl, const csv::parser_config& config);
22  void parse();
23 
24 private:
25 
26  // handlers
27  void row();
28  void cell();
29  void quoted_cell();
30 
31  void parse_cell_with_quote(const char* p0, size_t len0);
32 
36  void push_cell_value(const char* p, size_t n);
37 
38 private:
39  handler_type& m_handler;
40 };
41 
42 template<typename _Handler>
44  const char* p, size_t n, handler_type& hdl, const csv::parser_config& config) :
45  csv::parser_base(p, n, config), m_handler(hdl) {}
46 
47 template<typename _Handler>
49 {
50 #if ORCUS_DEBUG_CSV
51  for (const char* p = mp_begin; p < mp_end; ++p)
52  std::cout << *p;
53  std::cout << std::endl;
54 #endif
55 
56  m_handler.begin_parse();
57  while (has_char())
58  row();
59  m_handler.end_parse();
60 }
61 
62 template<typename _Handler>
64 {
65  m_handler.begin_row();
66  while (true)
67  {
68  if (is_text_qualifier(cur_char()))
69  quoted_cell();
70  else
71  cell();
72 
73  if (!has_char())
74  {
75  m_handler.end_row();
76  return;
77  }
78 
79  char c = cur_char();
80  if (c == '\n')
81  {
82  next();
83 #if ORCUS_DEBUG_CSV
84  cout << "(LF)" << endl;
85 #endif
86  m_handler.end_row();
87  return;
88  }
89 
90  if (!is_delim(c))
91  throw orcus::csv::parse_error("expected a delimiter");
92 
93  next();
94 
95  if (m_config.trim_cell_value)
96  skip_blanks();
97 
98  if (!has_char())
99  {
100  m_handler.end_row();
101  return;
102  }
103  }
104 }
105 
106 template<typename _Handler>
108 {
109  const char* p = mp_char;
110  size_t len = 0;
111  char c = cur_char();
112  while (c != '\n' && !is_delim(c))
113  {
114  ++len;
115  next();
116  if (!has_char())
117  break;
118  c = cur_char();
119  }
120 
121  if (!len)
122  p = nullptr;
123 
124  push_cell_value(p, len);
125 }
126 
127 template<typename _Handler>
129 {
130 #if ORCUS_DEBUG_CSV
131  cout << "--- quoted cell" << endl;
132 #endif
133  char c = cur_char();
134  assert(is_text_qualifier(c));
135  next(); // Skip the opening quote.
136  if (!has_char())
137  return;
138 
139  const char* p0 = mp_char;
140  size_t len = 1;
141  for (; has_char(); next(), ++len)
142  {
143  c = cur_char();
144 #if ORCUS_DEBUG_CSV
145  cout << "'" << c << "'" << endl;
146 #endif
147  if (!is_text_qualifier(c))
148  continue;
149 
150  // current char is a quote. Check if the next char is also a text
151  // qualifier.
152 
153  if (has_next() && is_text_qualifier(next_char()))
154  {
155  next();
156  parse_cell_with_quote(p0, len);
157  return;
158  }
159 
160  // Closing quote.
161  m_handler.cell(p0, len-1, false);
162  next();
163  skip_blanks();
164  return;
165  }
166 
167  // Stream ended prematurely. Handle it gracefully.
168  m_handler.cell(p0, len, false);
169 }
170 
171 template<typename _Handler>
172 void csv_parser<_Handler>::parse_cell_with_quote(const char* p0, size_t len0)
173 {
174 #if ORCUS_DEBUG_CSV
175  using namespace std;
176  cout << "--- parse cell with quote" << endl;
177 #endif
178  assert(is_text_qualifier(cur_char()));
179 
180  // Push the preceding chars to the temp buffer.
181  m_cell_buf.reset();
182  m_cell_buf.append(p0, len0);
183 
184  // Parse the rest, until the closing quote.
185  next();
186  const char* p_cur = mp_char;
187  size_t cur_len = 0;
188  for (; has_char(); next(), ++cur_len)
189  {
190  char c = cur_char();
191 #if ORCUS_DEBUG_CSV
192  cout << "'" << c << "'" << endl;
193 #endif
194  if (!is_text_qualifier(c))
195  continue;
196 
197  if (has_next() && is_text_qualifier(next_char()))
198  {
199  // double quotation. Copy the current segment to the cell buffer.
200  m_cell_buf.append(p_cur, cur_len);
201 
202  next(); // to the 2nd quote.
203  p_cur = mp_char;
204  cur_len = 0;
205  continue;
206  }
207 
208  // closing quote. Flush the current segment to the cell
209  // buffer, push the value to the handler, and exit normally.
210  m_cell_buf.append(p_cur, cur_len);
211 
212  m_handler.cell(m_cell_buf.get(), m_cell_buf.size(), true);
213  next();
214  skip_blanks();
215  return;
216  }
217 
218  // Stream ended prematurely.
219  throw csv::parse_error("stream ended prematurely while parsing quoted cell.");
220 }
221 
222 template<typename _Handler>
223 void csv_parser<_Handler>::push_cell_value(const char* p, size_t n)
224 {
225  size_t len = n;
226 
227  if (m_config.trim_cell_value)
228  {
229  // Trim any leading blanks.
230  for (size_t i = 0; i < n; ++i, --len, ++p)
231  {
232  if (!is_blank(*p))
233  break;
234  }
235 
236  // Trim any trailing blanks.
237  if (len)
238  {
239  const char* p_end = p + (len-1);
240  for (; p != p_end; --p_end, --len)
241  {
242  if (!is_blank(*p_end))
243  break;
244  }
245  }
246  }
247 
248  m_handler.cell(p, len, false);
249 #if ORCUS_DEBUG_CSV
250  if (len)
251  cout << "(cell:'" << std::string(p, len) << "')" << endl;
252  else
253  cout << "(cell:'')" << endl;
254 #endif
255 }
256 
257 }
258 
259 #endif
260 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */
Definition: csv_parser_base.hpp:66
bool trim_cell_value
Definition: csv_parser_base.hpp:52
Definition: csv_parser_base.hpp:57
Definition: csv_parser.hpp:16
Definition: config.hpp:18
bool is_blank(char c) const
Definition: base64.hpp:15
Definition: csv_parser_base.hpp:36