Turi Create  4.0
csv_line_tokenizer.hpp
1 /* Copyright © 2017 Apple Inc. All rights reserved.
2  *
3  * Use of this source code is governed by a BSD-3-clause license that can
4  * be found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause
5  */
6 #ifndef TURI_UNITY_LIB_SFRAME_CSV_LINE_TOKENIZER_HPP
7 #define TURI_UNITY_LIB_SFRAME_CSV_LINE_TOKENIZER_HPP
8 #include <vector>
9 #include <string>
10 #include <cstdlib>
11 #include <functional>
12 #include <memory>
13 #include <core/data/flexible_type/flexible_type.hpp>
14 #include <core/parallel/mutex.hpp>
15 #include <iostream>
16 
17 namespace turi {
18 
19 class flexible_type_parser;
20 
21 /**
22  * \ingroup sframe_physical
23  * \addtogroup csv_utils CSV Parsing and Writing
24  * \{
25  */
26 
27 /**
28  * CSV Line Tokenizer.
29  *
30  * To use, simply set the appropriate options inside the struct, and use one of
31  * the tokenize_line functions to parse a line inside a CSV file.
32  *
33  * \note This parser at the moment only handles the case where each row of
34  * the CSV is on one line. It is in fact very possible that this is not the
35  * case. Pandas in particular permits line breaks inside of quoted strings,
36  * and vectors, and that is quite problematic.
37  */
39  /**
40  * If set to true, quotes inside a field will be preserved (Default false).
41  * i.e. if set to true, the 2nd entry in the following row will be read as
42  * ""hello world"" with the quote characters.
43  * \verbatim
44  * 1,"hello world",5
45  * \endverbatim
46  */
47  bool preserve_quoting = false;
48 
49  /**
50  * Whether escape_char is used (Default true).
51  */
52  bool use_escape_char = true;
53  /**
54  * The character to use to identify the beginning of a C escape sequence
55  * (Default '\'). i.e. "\n" will be converted to the '\n' character, "\\"
56  * will be converted to "\", etc. Note that only the single character
57  * escapes are converted. unicode (\Unnnn), octal (\nnn), hexadecimal (\xnn)
58  * are not interpreted.
59  */
60  char escape_char = '\\';
61 
62  /**
63  * If set to true, initial spaces before fields are ignored (Default true).
64  */
65  bool skip_initial_space = true;
66 
67 
68  /**
69  * The delimiter character to use to separate fields (Default ",")
70  */
71  std::string delimiter = ",";
72 
73 
74  /**
75  * The string to use to separate lines. Defaults to "\n".
76  * Setting the new line string to "\n" has special effects in that it
77  * causes "\r", "\r\n" and "\n" to be all interpreted as new lines.
78  */
79  std::string line_terminator = "\n";
80 
81  /**
82  * The character used to begin a comment (Default '#'). An occurrence of
83  * this character outside of quoted strings will cause the parser to
84  * ignore the remainder of the line.
85  * \verbatim
86  * # this is a
87  * # comment
88  * user,name,rating
89  * 123,hello,45
90  * 312,chu, 21
91  * 333,zzz, 3 # this is also a comment
92  * 444,aaa, 51
93  * \endverbatim
94  */
95  char comment_char = '#';
96 
97  /**
98  * Whether comment char is used
99  */
100  bool has_comment_char = true;
101 
102  /**
103  * If set to true, pairs of quote characters in a quoted string
104  * are interpreted as a single quote (Default false).
105  * For instance, if set to true, the 2nd field of the 2nd line is read as
106  * \b "hello "world""
107  * \verbatim
108  * user, message
109  * 123, "hello ""world"""
110  * \endverbatim
111  */
112  bool double_quote = false;
113 
114  /**
115  * The quote character to use (Default '\"')
116  */
117  char quote_char = '\"';
118 
119  /**
120  * The strings which will be parsed as missing values.
121  *
122  * (also see empty_string_in_na_values)
123  */
124  std::vector<std::string> na_values;
125 
126  /**
127  * string values which map to numeric 1
128  */
129  std::unordered_set<std::string> true_values;
130 
131  /**
132  * string values which map to numeric 0
133  */
134  std::unordered_set<std::string> false_values;
135 
136  /**
137  * If this is set (defaults to false), then
138  * the true/false/na substitutions are only permitted on raw
139  * unparsed strings; that is strings before dequoting, de-escaping, etc.
140  */
142 
143  /**
144  * Constructor. Does nothing but set up internal buffers.
145  */
147 
148  /**
149  * called before any parsing functions are used. Initializes the spirit parser.
150  */
151  void init();
152 
153  /**
154  * Tokenize a single CSV line into separate fields.
155  * The output vector will be cleared, and each field will be inserted into
156  * the output vector. Returns true on success and false on failure.
157  *
158  * \param str Pointer to string to tokenize.
159  * Contents of string may be modified.
160  * \param len Length of string to tokenize
161  * \param output Output vector which will contain the result
162  *
163  * \returns true on success, false on failure.
164  */
165  bool tokenize_line(const char* str, size_t len,
166  std::vector<std::string>& output);
167 
168  /**
169  * Tokenize a single CSV line into separate fields, calling a callback
170  * for each parsed token.
171  *
172  * The function is of the form:
173  * \code
174  * bool receive_token(std::string& buffer, size_t len) {
175  * // add the first len bytes of the buffer as the parsed token
176  * // return true on success and false on failure.
177  *
178  * // if this function returns false, the tokenize_line call will also
179  * // return false
180  *
181  * // The buffer may be modified
182  * }
183  * \endcode
184  *
185  * For instance, to insert the parsed tokens into an output vector, the
186  * following code could be used:
187  *
188  * \code
189  * return tokenize_line(str, len,
190  * [&](std::string& buf, size_t token_len)->bool {
191  * output.emplace_back(buf.data(), token_len);
192  * return true;
193  * });
194  * \endcode
195  *
196  * \param str Pointer to line to tokenize. Contents of string may be modified.
197  * \param len Length of line to tokenize
198  * \param fn Callback function which is called on every token
199  *
200  * \returns true on success, false on failure.
201  */
202  bool tokenize_line(const char* str, size_t len,
203  std::function<bool (std::string&, size_t)> fn);
204 
205  /**
206  * Tokenizes a line directly into array of flexible_type and type specifiers.
207  * This version of tokenize line is strict, requiring that the length of
208  * the output vector matches up exactly with the number of columns, and the
209  * types of the flexible_type be fully specified.
210  *
211  * For instance:
212  * If my input line is
213  * \verbatim
214  * 1, hello world, 2.0
215  * \endverbatim
216  *
217  * then output vector must have 3 elements.
218  *
219  * If the types of the 3 elements in the output vector are:
220  * [flex_type_enum::INTEGER, flex_type_enum::STRING, flex_type_enum::FLOAT]
221  * then, they will be parsed as such emitting an output of
222  * [1, "hello world", 2.0].
223  *
224  * However, if the types of the 3 elements in the output vector are:
225  * [flex_type_enum::STRING, flex_type_enum::STRING, flex_type_enum::STRING]
226  * then the output will be ["1", "hello world", "2.0"].
227  *
228  * Type interpretation failures will produce an error.
229  * For instance if the types are
230  * [flex_type_enum::STRING, flex_type_enum::INTEGER, flex_type_enum::STRING],
231  * since the second element cannot be correctly interpreted as an integer,
232  * the tokenization will fail.
233  *
234  * The types currently supported are:
235  * - flex_type_enum::INTEGER
236  * - flex_type_enum::FLOAT
237  * - flex_type_enum::STRING
238  * - flex_type_enum::VECTOR (a vector of numbers specified like [1 2 3]
239  * but allowing separators to be spaces, commas(,)
240  * or semicolons(;). The separator should not
241  * match the CSV separator since the parsers are
242  * independent)
243  *
244  * The tokenizer will not modify the types of the output vector. However,
245  * if permit_undefined is specified, the output type can be set to
246  * flex_type_enum::UNDEFINED for an empty non-string field. For instance:
247  *
248  *
249  * If my input line is
250  * \verbatim
251  * 1, , 2.0
252  * \endverbatim
253  * If I have type specifiers
254  * [flex_type_enum::INTEGER, flex_type_enum::STRING, flex_type_enum::FLOAT]
255  * This will be parsed as [1, "", 2.0] regardless of permit_undefined.
256  *
257  * However if I have type specifiers
258  * [flex_type_enum::INTEGER, flex_type_enum::INTEGER, flex_type_enum::FLOAT]
259  * and permit_undefined == false, This will be parsed as [1, 0, 2.0].
260  *
261  * And if I have type specifiers
262  * [flex_type_enum::INTEGER, flex_type_enum::INTEGER, flex_type_enum::FLOAT]
263  * and permit_undefined == true, This will be parsed as [1, UNDEFINED, 2.0].
264  *
265  * \param str Pointer to line to tokenize
266  * \param len Length of line to tokenize
267  * \param output The output vector which is of the same length as the number
268  * of columns, and has all the types specified.
269  * \param permit_undefined If true, empty non-string fields become flex_type_enum::UNDEFINED
270  * \param output_order a pointer to an array of the same length as the output.
271  * Essentially column 'i' will be written to output_order[i].
272  * if output_order[i] == -1, the column is ignored.
273  * If output_order == nullptr, this is equivalent to the
274  * having output_order[i] == i
275  *
276  * \returns the number of output entries filled.
277  */
278  size_t tokenize_line(char* str, size_t len,
279  std::vector<flexible_type>& output,
280  bool permit_undefined,
281  const std::vector<size_t>* output_order = nullptr);
282 
283 
284  /**
285  * Parse the buf content into flexible_type.
286  * The type of the flexible_type is determined by the out variable.
287  *
288  * If recursive_parse is set to true, things which parse to strings will
289  * attempt to be reparsed. This allows for instance
290  * the quoted element "123" to be parsed as an integer instead of a string.
291  *
292  * If recursive_parse is true, the contents of the buffer may be modified
293  * (the buffer itself is used to maintain the recursive parse state)
294  */
295  bool parse_as(char** buf, size_t len,
296  const char* raw, size_t rawlen,
297  flexible_type& out, bool recursive_parse=false);
298 
299  /**
300  * Returns a printable string describing the parse error.
301  * This is only filled when \ref tokenize_line fails.
302  * The string is *not* cleared when tokenize line succeeds so this should
303  * not be used for flagging parse errors.
304  */
305  const std::string& get_last_parse_error_diagnosis() const;
306 
307  private:
308  // internal buffer
309  std::string field_buffer;
310  // current length of internal buffer
311  size_t field_buffer_len = 0;
312 
313  // the printable string describing the parse error
314  std::string parse_error;
315  // internal error. filled when tokenizer fails. This is appended to parse_error
316  // when appropriate
317  std::string tokenizer_impl_error;
318  ssize_t tokenizer_impl_fail_pos = -1;
319 
320  // the state of the tokenizer state machine
321  enum class tokenizer_state {
322  START_FIELD, IN_FIELD, IN_QUOTED_FIELD
323  };
324 
325  /**
326  * \param str Pointer to line to tokenize
327  * \param len Length of line to tokenize
328  * \param add_token Callback function which is called on every successful token.
329  * This function is allowed to modify the input string.
330  * \param lookahead Callback function which is called to look ahead
331  * for the end of the token when bracketing [], {} is
332  * encountered. it is called with a (char**, len) and return
333  * true/false on success/failure. This function must not
334  * modify the input string.
335  * \param undotoken Callback function which is called to undo the previously
336  * parsed token. Only called when lookahead succeeds, but later
337  * parsing fails, thus requiring cancellation of the lookahead.
338  *
339  * \note Whether this function modifies the input string is dependent on
340  * whether add_token modifies the input string.
341  */
342  template <typename Fn, typename Fn2, typename Fn3>
343  bool tokenize_line_impl(char* str, size_t len,
344  Fn add_token,
345  Fn2 lookahead,
346  Fn3 undotoken);
347 
348  std::shared_ptr<flexible_type_parser> parser;
349 
350  // some precomputed information about the delimiter so we avoid excess
351  // string comparisons of the delimiter value
352  bool delimiter_is_new_line = false;
353  bool delimiter_is_space_but_not_tab = false;
354  char delimiter_first_character;
355  bool delimiter_is_singlechar = false;
356  bool delimiter_is_not_empty = true;
357  bool empty_string_in_na_values = false;
358  bool is_regular_line_terminator = true;
359 
360 
361 
362  /**
363  * Perform substitutions of true/false/na values
364  */
365  bool check_substitutions(const char* buf, size_t len, flexible_type& out);
366 };
367 /// \}
368 } // namespace turi
369 
370 namespace std {
371 static inline ostream& operator<<(ostream& os, const turi::csv_line_tokenizer& t) {
372  os << "Tokenizer("
373  << "preseve_quoting=" << t.preserve_quoting << ", "
374  << "use_escape_char='" << t.use_escape_char << "', "
375  << "escape_char='" << t.escape_char << "', "
376  << "skip_initial_space=" << t.skip_initial_space << ", "
377  << "delimiter=\"" << t.delimiter << "\", "
378  << "line_terminator=\"" << t.line_terminator << "\", "
379  << "comment_char=\'" << t.comment_char << "', "
380  << "has_comment_char=" << t.has_comment_char << ","
381  << "double_quote=" << t.double_quote << ","
382  << "quote_char=\'" << t.quote_char << "\'"
383  << "na_values=";
384 
385  for (size_t i = 0; i < t.na_values.size(); ++i) {
386  os << t.na_values[i];
387  if (i + 1 != t.na_values.size()) {
388  os << ",";
389  }
390  }
391  os << ")";
392 
393  return os;
394 }
395 } // namespace std
396 
397 #endif
std::unordered_set< std::string > false_values
STL namespace.
const std::string & get_last_parse_error_diagnosis() const
bool tokenize_line(const char *str, size_t len, std::vector< std::string > &output)
std::unordered_set< std::string > true_values
std::vector< std::string > na_values
bool parse_as(char **buf, size_t len, const char *raw, size_t rawlen, flexible_type &out, bool recursive_parse=false)