Turi Create  4.0
parallel_csv_parser.hpp
1 /* Copyright © 2017 Apple Inc. All rights reserved.
2  *
3  * Use of this source code is governed by a BSD-3-clause license that can
4  * be found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause
5  */
6 #ifndef TURI_UNITY_LIB_PARALLEL_CSV_PARSER_HPP
7 #define TURI_UNITY_LIB_PARALLEL_CSV_PARSER_HPP
8 #include <string>
9 #include <vector>
10 #include <map>
11 #include <core/data/flexible_type/flexible_type.hpp>
12 #include <core/storage/sframe_data/sframe.hpp>
13 #include <core/storage/sframe_data/csv_line_tokenizer.hpp>
14 #include <core/storage/sframe_data/sframe_constants.hpp>
15 namespace turi {
16 
17 
18 /**
19  * \ingroup sframe_physical
20  * \addtogroup csv_utils CSV Parsing and Writing
21  * \{
22  */
23 
24 /**
25  * std::getline replacement that correctly handles all \\r, \\n and \\r\\n
26  * line break characters.
27  */
28 std::istream& eol_safe_getline(std::istream& is, std::string& t);
29 
30 /**
31  * All the options pertaining to top level CSV file handling
32  */
33 struct csv_file_handling_options {
34  /// Whether the first (non-commented) line of the file is the column name header.
35  bool use_header = true;
36 
37  /// Whether we should just skip line errors.
38  bool continue_on_failure = false;
39 
40  /// Whether failed parses will be stored in an sarray of strings and returned.
41  bool store_errors = false;
42 
43  /// collection of column name->type. Every other column type will be parsed as a string
44  std::map<std::string, flex_type_enum> column_type_hints;
45 
46  /// Output column names
47  std::vector<std::string> output_columns;
48 
49  /// The number of rows to read. If 0, all lines are read
50  size_t row_limit = 0;
51 
52  /// Number of rows at the start of each file to ignore
53  size_t skip_rows = 0;
54 };
55 
56 /**
57  * Parses a CSV file / glob of CSV files to an SFrame.
58  *
59  * \param url Path or Glob to read files
60  * \param tokenizer CSV tokenization options
61  * \param options Other file handling options
62  * \param frame Returned sframe object. This should be an uninitialized sframe.
63  * \param frame_sidx_file Location to save the result. Optional. Defaults to cache.
64  *
65  * \returns a map of filename to sarray<flexible_type> of string type where each
66  * row contains a line of the file that failed to parse. This is only filled
67  * if options.store_errors = true
68  */
69 std::map<std::string, std::shared_ptr<sarray<flexible_type>>> parse_csvs_to_sframe(
70  const std::string& url,
71  csv_line_tokenizer& tokenizer,
72  csv_file_handling_options options,
73  sframe& frame,
74  std::string frame_sidx_file = "");
75 
76 /// \}
77 } // namespace turi
78 
79 #endif // TURI_UNITY_LIB_PARALLEL_CSV_PARSER_HPP
bool continue_on_failure
Whether we should just skip line errors.
std::map< std::string, flex_type_enum > column_type_hints
collection of column name->type. Every other column type will be parsed as a string.
bool use_header
Whether the first (non-commented) line of the file is the column name header.
std::map< std::string, std::shared_ptr< sarray< flexible_type > > > parse_csvs_to_sframe(const std::string &url, csv_line_tokenizer &tokenizer, csv_file_handling_options options, sframe &frame, std::string frame_sidx_file="")
std::vector< std::string > output_columns
Output column names.
size_t row_limit
The number of rows to read. If 0, all lines are read.
std::istream & eol_safe_getline(std::istream &is, std::string &t)
size_t skip_rows
Number of rows at the start of each file to ignore.
bool store_errors
Whether failed parses will be stored in an sarray of strings and returned.