Turi Create  4.0
dataframe.hpp
1 /* Copyright © 2017 Apple Inc. All rights reserved.
2  *
3  * Use of this source code is governed by a BSD-3-clause license that can
4  * be found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause
5  */
6 #ifndef TURI_UNITY_DATAFRAME_HPP
7 #define TURI_UNITY_DATAFRAME_HPP
8 #include <vector>
9 #include <iterator>
10 #include <core/data/flexible_type/flexible_type.hpp>
11 #include <core/storage/serialization/serialization_includes.hpp>
12 
13 namespace turi {
14 
15 class flexible_type_registry;
16 typedef uint32_t field_id_type;
17 
18 
19 /**
20  * \ingroup sframe_physical
21  * \addtogroup csv_utils CSV Parsing and Writing
22  * \{
23  */
24 
25 /**
26  * Type that represents a Pandas-like dataframe:
27  * An in-memory, column-wise representation of a table.
28  * The dataframe_t is simply a map from column name to a column of records,
29  * where every column is the same length, and all values within a column
30  * have the same type. This is also the type used for transferring between
31  * pandas dataframe objects and C++.
32  *
33  * Each cell in the dataframe is represented by a \ref flexible_type object,
34  * while this technically allows every cell to be an arbitrary type, we do not
35  * permit that behavior. We require and assume that every cell in a column be of
36  * the same type. This is with the exception of empty cells (NaNs in Pandas)
37  * which are of type UNDEFINED.
38  */
39 struct dataframe_t {
40  /// A vector storing the name of columns
41  std::vector<std::string> names;
42 
43  /// A map from the column name to the type of the column
44  std::map<std::string, flex_type_enum> types;
45 
46  /** A map from the column name to the values of the column.
47  * Every column must have the same length, and all values within a column
48  * must be of the same type. The UNDEFINED type is an exception to the rule
49  * and may be used anywhere to designate an empty entry.
50  */
51  std::map<std::string, std::vector<flexible_type> > values;
52 
53  /**
54  * Fill the dataframe with the content from a csv file.
55  *
56  * \param: path. Path to the csv file.
57  * \param: delimiter. User defined csv sepearator.
58  * \param: use_header. If true, the first line is treated as column header.
59  * Otherwise, X0, X1,... will be used.
60  *
61  * Note: This function will infer and unify the most proper type for each
62  * column.
63  */
64  void read_csv(const std::string& path, char delimiter, bool use_header);
65 
66  /**
67  * Returns the number of rows in the dataframe
68  */
69  inline size_t nrows() const {
70  return (values.begin() == values.end()) ? 0
71  : values.begin()->second.size();
72  }
73 
74  /**
75  * Returns true if the dataframe is empty.
76  */
77  inline bool empty() const {
78  return ncols() == 0 || nrows() == 0;
79  }
80 
81  /**
82  * Convert the values in the column into the specified type.
83  * Throws an exception if the column is not found, or the conversion
84  * cannot be made.
85  */
86  void set_type(std::string key, flex_type_enum type);
87 
88  /// Returns the number of columns in the dataframe
89  inline size_t ncols() const { return values.size(); }
90 
91  /**
92  * Returns true if the dataframe contains a column with the given name.
93  */
94  bool contains(std::string key) const{
95  return types.count(key);
96  }
97 
98  /**
99  * Returns true if the column contains undefined flexible_type value.
100  */
101  bool contains_nan(std::string key) const{
102  if (!contains(key)) {
103  log_and_throw(std::string("Column " + key + " does not exist."));
104  }
105  const std::vector<flexible_type>& col = values.at(key);
106  bool ret = false;
107  for (size_t i = 0; i < col.size(); ++i) {
108  if (col[i].get_type() == flex_type_enum::UNDEFINED) {
109  ret = true;
110  }
111  }
112  return ret;
113  }
114 
115  /**
116  * Column index operator. Can be used to extract a column from the dataframe.
117  * Returns a pair of (type, reference to column)
118  */
119  inline std::pair< flex_type_enum, std::vector<flexible_type>&>
120  operator[](std::string key) {
121  return {types.at(key), values.at(key)};
122  }
123 
124  /**
125  * Const column index operator. Can be used to extract a column from the
126  * dataframe. Returns a pair of (type, reference to column)
127  */
128  inline std::pair< flex_type_enum, const std::vector<flexible_type>&>
129  operator[](std::string key) const {
130  return {types.at(key), values.at(key)};
131  }
132 
133  /**
134  * Prints the contents of the dataframe to std::cerr
135  */
136  void print() const;
137 
138  /**
139  * Sets the value of a column of the dataframe.
140  */
141  void set_column(std::string key,
142  const std::vector<flexible_type>& val,
143  flex_type_enum type);
144 
145 
146  /**
147  * Sets the value of a column of the dataframe, consuming the vector value
148  */
149  void set_column(std::string key,
150  std::vector<flexible_type>&& val,
151  flex_type_enum type);
152 
153  /**
154  * Remove the column.
155  */
156  void remove_column(std::string key);
157 
158  /// Serializer
159  inline void save(oarchive& oarc) const {
160  oarc << names << types << values;
161  }
162 
163  /// Deserializer
164  inline void load(iarchive& iarc) {
165  iarc >> names >> types >> values;
166  }
167 
168  /// Clears the contents of the dataframe
169  inline void clear() {
170  names.clear();
171  types.clear();
172  values.clear();
173  }
174 };
175 
176 /**
177  * \ingroup unity
178  * The dataframe is a column-wise representation. This provides iteration over
179  * the dataframe in a row-wise representation. Incrementing the iterator
180  * advances the iterator element by element across rows.
181  *
182  * Usage:
183  * \code
184  * // Performs a row-wise iteration over the entries of the dataframe
185  * dataframe_row_iterator iter = dataframe_row_iterator::begin(df);
186  * dataframe_row_iterator end = dataframe_row_iterator::end(df);
187  * while (iter != end) {
188  * // do stuff with (*iter). It is a flexible_type
189  * ++iter;
190  * }
191  * \endcode
192  *
193  * \code
194  * // Alternatively:
195  * dataframe_row_iterator iter = dataframe_row_iterator::begin(df);
196  * for (size_t row = 0; row < iter.row_size(); ++row) {
197  * for (size_t col = 0; col < iter.column_size(); ++col) {
198  * // do stuff with (*iter). It is a flexible_type
199  * // pointing to the cell in column 'col' and row 'row'
200  * ++iter;
201  * }
202  * }
203  * \endcode
204  */
206  private:
207  /// The names of each column of the dataframe
208  std::vector<std::string> names;
209 
210  /// The types of each column of the dataframe
211  std::vector<flex_type_enum> types;
212 
213  /// The list of iterators over each column
214  std::vector<std::pair<std::vector<flexible_type>::const_iterator,
215  std::vector<flexible_type>::const_iterator> > iterators;
216  /// Number of rows in the dataframe
217  size_t num_rows;
218  /// Number of columns in the dataframe
219  size_t num_columns;
220  ///The current column pointed to
221  size_t current_column;
222  /// The current row pointed to
223  size_t current_row;
224  /// The total number of entries: num_rows * num_columns
225  size_t num_el;
226  /// The entry index pointed to.
227  size_t idx;
228  public:
229 
230  typedef flexible_type value_type;
231  typedef int difference_type;
232  typedef flexible_type* pointer;
233  typedef flexible_type& reference;
234  typedef std::forward_iterator_tag iterator_category;
235 
236  /// Creates a begin iterator to the dataframe
237  static dataframe_row_iterator begin(const dataframe_t& dt);
238 
239  /// Creates an end iterator to the dataframe
240  static dataframe_row_iterator end(const dataframe_t& dt);
241 
242  /**
243  * Changes the column iterator ordering by swapping the indices
244  * of columns 'a' and columns 'b'.
245  * Should only be done on begin and end iterators. Iterators in the midst
246  * of iterating are not safe to be swapped.
247  */
248  inline void swap_column_order(size_t a, size_t b) {
249  log_func_entry();
250  std::swap(iterators[a], iterators[b]);
251  std::swap(names[a], names[b]);
252  std::swap(types[a], types[b]);
253  }
254 
255  /// pre-increments to the next entry of the dataframe row-wise
257  ++iterators[current_column].first;
258  ++current_column;
259  if (current_column == num_columns) {
260  current_column = 0;
261  ++current_row;
262  }
263  ++idx;
264  return *this;
265  }
266 
267  /// post-increments to the next entry of the dataframe row-wise
269  dataframe_row_iterator ret = (*this);
270  ++ret;
271  return *this;
272  }
273 
274  /// Returns the index of the current row
275  inline size_t row() const {
276  return current_row;
277  }
278 
279  /// Returns the index of the current column
280  inline size_t column() const {
281  return current_column;
282  }
283 
284  /// Returns the number of columns
285  inline size_t column_size() const {
286  return num_columns;
287  }
288 
289  /// Returns the number of rows
290  inline size_t row_size() const {
291  return num_rows;
292  }
293 
294  /// Returns the name of the current column
295  inline const std::string& column_name() const {
296  return names[current_column];
297  }
298 
299  /// Returns the name of an arbitrary column
300  inline const std::string& column_name(size_t idx) const {
301  return names[idx];
302  }
303 
304  /// Returns the list of all column names
305  inline const std::vector<std::string>& column_names() const {
306  return names;
307  }
308 
309  /// Returns the type of the current column
310  inline flex_type_enum column_type() const {
311  return types[current_column];
312  }
313 
314  /// Returns the type of an arbitrary column
315  inline flex_type_enum column_type(size_t idx) const {
316  return types[idx];
317  }
318 
319  /// Returns the list of all column types
320  inline const std::vector<flex_type_enum>& column_types() const {
321  return types;
322  }
323 
324  /**
325  * Advances the iterator by this number of rows.
326  * Current column does not change. If the number of rows to skip causes
327  * the iterator to go past the end of the dataframe, the resultant
328  * iterator is equivalent to the end iterator of the dataframe.
329  */
330  void skip_rows(size_t num_rows_to_skip);
331 
332  /// Returns true if both iterators are equal
333  inline bool operator==(const dataframe_row_iterator& other) {
334  return num_el == other.num_el && idx < other.idx;
335  }
336 
337  /// Returns true if both iterators are not equal
338  inline bool operator!=(const dataframe_row_iterator& other) {
339  return !((*this) == other);
340  }
341 
342  /// Dereferences the iterator, returning a reference to the underlying flexible_type
343  inline const flexible_type& operator*() {
344  return *(iterators[current_column].first);
345  }
346 
347  /// Dereferences the iterator, returning a reference to the underlying flexible_type
348  inline const flexible_type& operator*() const {
349  return *(iterators[current_column].first);
350  }
351 
352 
353  /// Dereferences the iterator, returning a reference to the underlying flexible_type
354  inline const flexible_type* operator->() {
355  return &(*(iterators[current_column].first));
356  }
357 
358  /// Dereferences the iterator, returning a reference to the underlying flexible_type
359  inline const flexible_type* operator->() const {
360  return &(*(iterators[current_column].first));
361  }
362 };
363 
364 
365 
366 /**
367  * Cuts up the provided begin iterator to a dataframe into rows, calling the
368  * lambda with a new iterator and the range of rows it is meant to process.
369  */
371  std::function<void(dataframe_row_iterator& iter,
372  size_t startrow,
373  size_t endrow)> partialrowfn);
374 /// \}
375 } // namespace turi
376 
377 #endif
size_t row() const
Returns the index of the current row.
Definition: dataframe.hpp:275
std::pair< flex_type_enum, const std::vector< flexible_type > & > operator[](std::string key) const
Definition: dataframe.hpp:129
void clear()
Clears the contents of the dataframe.
Definition: dataframe.hpp:169
const flexible_type * operator->() const
Dereferences the iterator, returning a reference to the underlying flexible_type. ...
Definition: dataframe.hpp:359
const flexible_type & operator*() const
Dereferences the iterator, returning a reference to the underlying flexible_type. ...
Definition: dataframe.hpp:348
The serialization input archive object which, provided with a reference to an istream, will read from the istream, providing deserialization capabilities.
Definition: iarchive.hpp:60
const std::string & column_name() const
Returns the name of the current column.
Definition: dataframe.hpp:295
void remove_column(std::string key)
bool contains(std::string key) const
Definition: dataframe.hpp:94
size_t nrows() const
Definition: dataframe.hpp:69
std::pair< flex_type_enum, std::vector< flexible_type > & > operator[](std::string key)
Definition: dataframe.hpp:120
flex_type_enum column_type(size_t idx) const
Returns the type of an arbitrary column.
Definition: dataframe.hpp:315
size_t column() const
Returns the index of the current column.
Definition: dataframe.hpp:280
size_t column_size() const
Returns the number of columns.
Definition: dataframe.hpp:285
const std::string & column_name(size_t idx) const
Returns the name of an arbitrary column.
Definition: dataframe.hpp:300
void read_csv(const std::string &path, char delimiter, bool use_header)
bool empty() const
Definition: dataframe.hpp:77
std::vector< std::string > names
A vector storing the name of columns.
Definition: dataframe.hpp:41
bool operator!=(const dataframe_row_iterator &other)
Returns true if both iterators are not equal.
Definition: dataframe.hpp:338
void swap_column_order(size_t a, size_t b)
Definition: dataframe.hpp:248
bool operator==(const dataframe_row_iterator &other)
Returns true if both iterators are equal.
Definition: dataframe.hpp:333
flex_type_enum column_type() const
Returns the type of the current column.
Definition: dataframe.hpp:310
void parallel_dataframe_iterate(const dataframe_t &df, std::function< void(dataframe_row_iterator &iter, size_t startrow, size_t endrow)> partialrowfn)
std::map< std::string, std::vector< flexible_type > > values
Definition: dataframe.hpp:51
const std::vector< flex_type_enum > & column_types() const
Returns the list of all column types.
Definition: dataframe.hpp:320
void load(iarchive &iarc)
Deserializer.
Definition: dataframe.hpp:164
dataframe_row_iterator & operator++()
pre-increments to the next entry of the dataframe row-wise
Definition: dataframe.hpp:256
const std::vector< std::string > & column_names() const
Returns the list of all column names.
Definition: dataframe.hpp:305
std::map< std::string, flex_type_enum > types
A map from the column name to the type of the column.
Definition: dataframe.hpp:44
size_t ncols() const
Returns the number of columns in the dataframe.
Definition: dataframe.hpp:89
void print() const
const flexible_type & operator*()
Dereferences the iterator, returning a reference to the underlying flexible_type. ...
Definition: dataframe.hpp:343
const flexible_type * operator->()
Dereferences the iterator, returning a reference to the underlying flexible_type. ...
Definition: dataframe.hpp:354
size_t row_size() const
Returns the number of rows.
Definition: dataframe.hpp:290
void save(oarchive &oarc) const
Serializer.
Definition: dataframe.hpp:159
void set_column(std::string key, const std::vector< flexible_type > &val, flex_type_enum type)
The serialization output archive object which, provided with a reference to an ostream, will write to the ostream, providing serialization capabilities.
Definition: oarchive.hpp:80
bool contains_nan(std::string key) const
Definition: dataframe.hpp:101
dataframe_row_iterator & operator++(int)
post-increments to the next entry of the dataframe row-wise
Definition: dataframe.hpp:268
void set_type(std::string key, flex_type_enum type)