Turi Create  4.0
string_parser.hpp
1 /* Copyright © 2017 Apple Inc. All rights reserved.
2  *
3  * Use of this source code is governed by a BSD-3-clause license that can
4  * be found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause
5  */
6 #ifndef TURI_FLEXIBLE_TYPE_STRING_PARSER_HPP
7 #define TURI_FLEXIBLE_TYPE_STRING_PARSER_HPP
8 #include <boost/algorithm/string.hpp>
9 #include <boost/spirit/include/qi.hpp>
10 #include <core/data/flexible_type/flexible_type.hpp>
11 #include <core/data/flexible_type/string_escape.hpp>
12 
13 /*
14  * Most of this is obtained from
15  * http://boost-spirit.com/home/articles/qi-example/creating-your-own-parser-component-for-spirit-qi/
16  */
17 
18 namespace parser_impl {
19 
/**
 * \internal
 * The string parsing configuration.
 *
 * A value of this type is handed to string_parser, which copies it and
 * consults it while tokenizing a single string field.
 */
struct parser_config {
  /// If any of these characters occurs outside of a quoted string,
  /// the string will be terminated.
  std::string restrictions;
  /// If the delimiter string is seen anywhere outside of a quoted string,
  /// the string will be terminated.
  std::string delimiter;
  /// Whether escape char should be used.
  bool use_escape_char = true;
  /// The character to use for an escape character.
  char escape_char = '\\';
  /** Whether double quotes inside of a quote are treated as a single quote.
   * i.e. """hello""" => \"hello\"
   *
   * This is a flag, so it is stored as a bool; it is only ever tested for
   * truthiness and passed on as a boolean argument.
   */
  bool double_quote = true;

  /// Strings designated as NA (missing) values.
  std::unordered_set<std::string> na_val;
  /// Strings designated as "true"; string_parser substitutes the integer 1.
  std::unordered_set<std::string> true_val;
  /// Strings designated as "false"; string_parser substitutes the integer 0.
  std::unordered_set<std::string> false_val;

  /**
   * If this is set (defaults to false), then
   * the true/false/na substitutions are only permitted on raw
   * unparsed strings; that is strings before dequoting, de-escaping, etc.
   */
  bool only_raw_string_substitutions = false;
};
52 
// Declares the `restricted_string` Spirit terminal. The rest of this file
// refers to its tag type as parser_impl::tag::restricted_string, and the
// terminal is invoked as parser_impl::restricted_string(config) to obtain a
// string parser.
53 BOOST_SPIRIT_TERMINAL_EX(restricted_string);
54 
55 } // namespace parser_impl
56 
57 namespace boost {
58 namespace spirit {
59 
60 
61 template <typename T1>
62 struct use_terminal<qi::domain,
63  terminal_ex<parser_impl::tag::restricted_string, fusion::vector1<T1> > >
64  : mpl::true_ {};
65 
66 
67 } } // namespace spirit, boost
68 
69 namespace parser_impl {
70 
71 // anonymous namespace
72 namespace {
/**
 * A character accumulator that keeps the first STACK_BUF_SIZE characters in
 * an in-object array and spills over to a std::string once that array is
 * full.
 */
struct stack_buffer {
  constexpr static size_t STACK_BUF_SIZE = 128;
  /// Fixed-size storage used for the first STACK_BUF_SIZE characters.
  char buf[STACK_BUF_SIZE];
  /// Overflow storage; also used to hand the result back from get_string().
  std::string altbuf;
  /// Total number of characters added so far.
  size_t pos = 0;

  /// Appends one character to the buffer.
  inline void add_char(char c) {
    const size_t index = pos++;
    if (index < STACK_BUF_SIZE) {
      // still room in the fixed-size array
      buf[index] = c;
      return;
    }
    if (index == STACK_BUF_SIZE) {
      // first overflowing character: migrate the array contents
      altbuf.assign(buf, STACK_BUF_SIZE);
    }
    altbuf.push_back(c);
  }

  /**
   * Returns a reference to a string holding everything added so far.
   * While nothing has overflowed, the array contents are copied into
   * altbuf first.
   */
  std::string& get_string() {
    if (pos <= STACK_BUF_SIZE) altbuf.assign(buf, pos);
    return altbuf;
  }
};
102 } // anonynous namespace
103 
104 /*
105  * \internal
106  * This class defines a string parser which allows the parser writer to define
107  * a list of characters which are not permitted in unquoted strings. Quoted
108  * strings have no restrictions on what characters they can contain.
109  * Usage:
110  * \code
111  * parser_impl::parser_config config;
112  * config.[set stuff up]
113  * rule = parser_impl::restricted_string(config);
114  * \endcode
115  */
116 struct string_parser
117  : boost::spirit::qi::primitive_parser<string_parser> {
118  // Define the attribute type exposed by this parser component
119  template <typename Context, typename Iterator>
120  struct attribute {
121  typedef ::turi::flexible_type type;
122  };
123 
124  parser_config config;
125 
126  bool has_delimiter = false;
127  char delimiter_first_char;
128  bool delimiter_is_singlechar = false;
129  std::unordered_map<std::string, turi::flexible_type> map_vals; // handle na_val, true_val, false_val
130  bool only_raw_string_substitutions = false;
131 
132  string_parser(){}
133  string_parser(parser_config config):config(config) {
134  has_delimiter = config.delimiter.length() > 0;
135  delimiter_is_singlechar = config.delimiter.length() == 1;
136  if (has_delimiter) delimiter_first_char = config.delimiter[0];
137  for (auto s: config.na_val) {
139  }
140  for (auto s: config.true_val) {
141  map_vals[s] = 1;
142  }
143  for (auto s: config.false_val) {
144  map_vals[s] = 0;
145  }
146  only_raw_string_substitutions = config.only_raw_string_substitutions;
147  }
148 
149  enum class tokenizer_state {
150  START_FIELD, IN_FIELD, IN_QUOTED_FIELD,
151  };
152 
153  static inline bool test_is_delimiter(const char* c, const char* end,
154  const char* delimiter, const char* delimiter_end) {
155  // if I have more delimiter characters than the length of the string
156  // quit.
157  if (delimiter_end - delimiter > end - c) return false;
158  while (delimiter != delimiter_end) {
159  if ((*c) != (*delimiter)) return false;
160  ++c; ++delimiter;
161  }
162  return true;
163  }
164 #define PUSH_CHAR(c) ret.add_char(c); escape_sequence = config.use_escape_char && (c == config.escape_char);
165 
166 // insert a character into the field buffer. resizing it if necessary
167 
168  // This function is called during the actual parsing process
169  template <typename Iterator, typename Context, typename Skipper, typename Attribute>
170  bool parse(Iterator& first, Iterator const& last,
171  Context&, Skipper const& skipper, Attribute& attr) const {
172  boost::spirit::qi::skip_over(first, last, skipper);
173  Iterator cur = first;
174  stack_buffer ret;
175  const char* delimiter_begin = config.delimiter.c_str();
176  const char* delimiter_end = delimiter_begin + config.delimiter.length();
177 
178  tokenizer_state state = tokenizer_state::START_FIELD;
179  bool keep_parsing = true;
180  char quote_char = 0;
181  const char* raw_field_begin = nullptr;
182  // this is set to true for the character immediately after an escape character
183  // and false all other times
184  bool escape_sequence = false;
185  while(keep_parsing && cur != last) {
186  // since escape_sequence can only be true for one character after it is
187  // set to true. I need a flag here. if reset_escape_sequence is true, the
188  // at the end of the loop, I clear escape_sequence
189  bool reset_escape_sequence = escape_sequence;
190 
191  // Next character in file
192  char c = *cur;
193  if(state != tokenizer_state::IN_QUOTED_FIELD &&
194  config.restrictions.find(c) != std::string::npos) break;
195 
196  bool is_delimiter =
197  // current state is not in a quoted field since delimiters in quoted
198  // fields are fine.
199  (state != tokenizer_state::IN_QUOTED_FIELD) &&
200  // and there is a delimiter
201  has_delimiter &&
202  // and current character matches first character of delimiter
203  // and delimiter is either a single character, or we need to do a
204  // more expensive test.
205  delimiter_first_char == c &&
206  (delimiter_is_singlechar ||
207  test_is_delimiter(cur, last, delimiter_begin, delimiter_end));
208 
209  if (is_delimiter) break;
210 
211  ++cur;
212  switch(state) {
213  case tokenizer_state::START_FIELD:
214  raw_field_begin = cur-1; // -1 because cur has already been incremented
215  if (c == '\'' || c == '\"') {
216  quote_char = c;
217  state = tokenizer_state::IN_QUOTED_FIELD;
218  } else {
219  /* begin new unquoted field */
220  PUSH_CHAR(c);
221  state = tokenizer_state::IN_FIELD;
222  }
223  break;
224 
225  case tokenizer_state::IN_FIELD:
226  /* normal character - save in field */
227  PUSH_CHAR(c);
228  break;
229 
230  case tokenizer_state::IN_QUOTED_FIELD:
231  /* in quoted field */
232  if (c == quote_char && !escape_sequence) {
233  if (c == '\"' && config.double_quote) {
234  /* doublequote; " represented by "" */
235  // look ahead one character
236  if (cur + 1 < last && *cur == quote_char) {
237  PUSH_CHAR(c);
238  ++cur;
239  break;
240  }
241  }
242  // we are done.
243  keep_parsing = false;
244  }
245  else {
246  /* normal character - save in field */
247  PUSH_CHAR(c);
248  }
249  break;
250  }
251  if (reset_escape_sequence) escape_sequence = false;
252  }
253  if (cur == first) return false;
254  else {
255  first = cur;
256  if (only_raw_string_substitutions == true && raw_field_begin != nullptr) {
257  std::string raw_str = std::string(raw_field_begin, cur - raw_field_begin);
258  boost::algorithm::trim_right(raw_str);
259  auto map_val_iter = map_vals.find(raw_str);
260  if (map_val_iter != map_vals.end()) {
261  attr = map_val_iter->second;
262  return true;
263  }
264  }
265 
266  std::string final_str = std::move(ret.get_string());
267  if (!quote_char) boost::algorithm::trim_right(final_str);
268  else if (quote_char) {
269  // if was quoted field, we unescape the contents
270  turi::unescape_string(final_str, config.use_escape_char,
271  config.escape_char,
272  quote_char, config.double_quote);
273  }
274 
275  if (only_raw_string_substitutions == false) {
276  auto map_val_iter = map_vals.find(final_str);
277  if (map_val_iter != map_vals.end()) {
278  attr = map_val_iter->second;
279  return true;
280  }
281  }
282  attr = std::move(final_str);
283  return true;
284  }
285  return true;
286  }
287 
288 // This function is called during error handling to create
289 // a human readable string for the error context.
290  template <typename Context>
291  boost::spirit::info what(Context&) const {
292  return boost::spirit::info("string_parser");
293  }
294 };
295 } // namespace parser_impl
296 
297 namespace boost {
298 namespace spirit {
299 namespace qi {
300 
301 // This is the factory function object invoked in order to create
302 // an instance of our iter_pos_parser.
303 template <typename Modifiers, typename T1>
304 struct make_primitive<terminal_ex<parser_impl::tag::restricted_string, fusion::vector1<T1>>, Modifiers> {
305  typedef parser_impl::string_parser result_type;
306 
307  template <typename Terminal>
308  result_type operator()(const Terminal& term, unused_type) const {
309  return result_type(fusion::at_c<0>(term.args));
310  }
311 };
312 
313 }}} // namespace qi, spirit, boost
314 
315 #undef PUSH_CHAR
316 
317 #endif
char escape_char
The character to use for an escape character.
void unescape_string(std::string &cal, bool use_escape_char, char escape_char, char quote_char, bool double_quote)
bool use_escape_char
Whether escape char should be used.