Turi Create  4.0
sframe.hpp
1 /* Copyright © 2017 Apple Inc. All rights reserved.
2  *
3  * Use of this source code is governed by a BSD-3-clause license that can
4  * be found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause
5  */
6 #ifndef TURI_UNITY_LIB_SFRAME_HPP
7 #define TURI_UNITY_LIB_SFRAME_HPP
8 #include <iostream>
9 #include <algorithm>
10 #include <memory>
11 #include <vector>
12 #include <core/logging/logger.hpp>
13 #include <core/data/flexible_type/flexible_type.hpp>
14 #include <core/storage/sframe_data/sarray.hpp>
15 #include <core/storage/sframe_data/dataframe.hpp>
16 #include <core/storage/sframe_data/sframe_index_file.hpp>
17 #include <core/storage/sframe_data/sframe_constants.hpp>
18 #include <core/storage/sframe_data/output_iterator.hpp>
19 
20 
21 namespace turi {
22 // forward declaration of the csv_line_tokenizer to avoid a
23 // circular dependency
24 struct csv_line_tokenizer;
25 class sframe_reader;
26 class csv_writer;
27 
29  std::vector<flexible_type>,
30  std::function<void(const std::vector<flexible_type>&)>,
31  std::function<void(std::vector<flexible_type>&&)>,
32  std::function<void(const sframe_rows&)> >
33  sframe_output_iterator;
34 
35 
36 /**
37  * \ingroup sframe_physical
38  * \addtogroup sframe_main Main SFrame Objects
39  * \{
40  */
41 
42 /**
43  * The SFrame is an immutable object that represents a table with rows
44  * and columns. Each column is an \ref sarray<flexible_type>, which is a
45  * sequence of an object T split into segments. The sframe writes an sarray
46  * for each column of data it is given to disk, each with a prefix that extends
47  * the prefix given to open. The SFrame is referenced on disk by a single
48  * ".frame_idx" file which then has a list of file names, one file for each
49  * column.
50  *
51  * The SFrame is \b write-once, \b read-many. The SFrame can be opened for
52  * writing \b once, after which it is read-only.
53  *
54  * Since each column of the SFrame is an independent sarray, as an independent
55  * shared_ptr<sarray<flexible_type> > object, columns can be added / removed
56  * to form new sframes without problems. As such, certain operations
57  * (such as the object returned by add_column) can be "ephemeral" in that
58  * there is no .frame_idx file on disk backing it. An "ephemeral" frame can be
59  * identified by checking the result of get_index_file(). If this is empty,
60  * it is an ephemeral frame.
61  *
62  * The interface for the SFrame pretty much matches that of the \ref sarray
63  * as in the SArray's stored type is std::vector<flexible_type>. The SFrame
64  * however, also provides a large number of other capabilities such as
65  * csv parsing, construction from sarrays, etc.
66  */
67 class sframe : public swriter_base<sframe_output_iterator> {
68  public:
69 
70  /// The reader type
72 
73  /// The iterator type which \ref get_output_iterator returns
74  typedef sframe_output_iterator iterator;
75 
76  /// The type contained in the sframe
77  typedef std::vector<flexible_type> value_type;
78 
79  /**************************************************************************/
80  /* */
81  /* Constructors */
82  /* */
83  /**************************************************************************/
84  /**
85  * default constructor; does nothing; use \ref open_for_read or
86  * \ref open_for_write after construction to read/create an sframe.
87  */
88  inline sframe() { }
89 
90  /**
91  * Copy constructor.
92  * If the source frame is opened for writing, this will throw
93  * an exception. Otherwise, this will create a frame opened for reading,
94  * which shares column arrays with the source frame.
95  */
96  sframe(const sframe& other);
97 
98 
99  /**
100  * Move constructor.
101  */
102  sframe(sframe&& other) : sframe() {
103  (*this) = std::move(other);
104  }
105 
106 
107  /**
108  * Assignment operator.
109  * If the source frame is opened for writing, this will throw
110  * an exception. Otherwise, this will create a frame opened for reading,
111  * which shares column arrays with the source frame.
112  */
113  sframe& operator=(const sframe& other);
114 
115 
116  /**
117  * Move Assignment operator.
118  * Moves other into this. Other will be cleared as if it is a newly
119  * constructed sframe object.
120  */
121  sframe& operator=(sframe&& other);
122 
123  /**
124  * Attempts to construct an sframe which reads from the given frame
125  * index file. This should be a .frame_idx file.
126  * If the index cannot be opened, an exception is thrown.
127  */
128  explicit inline sframe(std::string frame_idx_file) {
129  auto frame_index_info = read_sframe_index_file(frame_idx_file);
130  open_for_read(frame_index_info);
131  }
132 
133  /**
134  * Construct an sframe from sframe index information.
135  */
136  explicit inline sframe(sframe_index_file_information frame_index_info) {
137  open_for_read(frame_index_info);
138  };
139 
140  /**
141  * Constructs an SFrame from a vector of Sarrays.
142  *
143  * \param columns List of sarrays to form as columns
144  * \param column_names List of the name for each column, with the indices
145  * corresponding with the list of columns. If the length of the column_names
146  * vector does not match columns, the column gets a default name.
147  * For example, if four columns are given and column_names = {id, num},
148  * the columns will be named {"id, "num", "X3", "X4"}. Entries that are
149  * zero-length strings will also be given a default name.
150  * \param fail_on_column_names If true, will throw an exception if any column
151  * names are unique. If false, will automatically adjust column names so
152  * they are unique.
153  *
154  * Throws an exception if any column names are not unique (if
155  * fail_on_column_names is true), or if the number of segments, segment
156  * sizes, or total sizes of each sarray is not equal. The constructed SFrame
157  * is ephemeral, and is not backed by a disk index.
158  */
159  explicit inline sframe(
160  const std::vector<std::shared_ptr<sarray<flexible_type> > > &new_columns,
161  const std::vector<std::string>& column_names = {},
162  bool fail_on_column_names=true) {
163  open_for_read(new_columns, column_names, fail_on_column_names);
164  }
165 
166  /**
167  * Constructs an SFrame from a csv file.
168  *
169  * All columns will be parsed into flex_string unless the column type is
170  * specified in the column_type_hints.
171  *
172  * \param path The url to the csv file. The url can point to local
173  * filesystem, hdfs, or s3. \param tokenizer The tokenization rules to use
174  * \param use_header If true, the first line will be parsed as column
175  * headers. Otherwise, R-style column names, i.e. X1, X2, X3... will be used.
176  * \param continue_on_failure If true, lines with parsing errors will be skipped.
177  * \param column_type_hints A map from column name to the column type.
178  * \param output_columns The subset of column names to output
179  * \param row_limit If non-zero, the maximum number of rows to read
180  * \param skip_rows If non-zero, the number of lines to skip at the start
181  * of each file
182  *
183  * Throws an exception if IO error or csv parse failed.
184  */
185  std::map<std::string, std::shared_ptr<sarray<flexible_type>>> init_from_csvs(
186  const std::string& path,
187  csv_line_tokenizer& tokenizer,
188  bool use_header,
189  bool continue_on_failure,
190  bool store_errors,
191  std::map<std::string, flex_type_enum> column_type_hints,
192  std::vector<std::string> output_columns = std::vector<std::string>(),
193  size_t row_limit = 0,
194  size_t skip_rows = 0);
195 
196  /**
197  * Constructs an SFrame from dataframe_t.
198  *
199  * \note Throw an exception if the dataframe contains undefined values (e.g.
200  * in sparse rows),
201  */
202  sframe(const dataframe_t& data);
203 
204  ~sframe();
205 
206  /**************************************************************************/
207  /* */
208  /* Openers */
209  /* */
210  /**************************************************************************/
211  /**
212  * Initializes the SFrame with an index_information.
213  * If the SFrame is already inited, this will throw an exception
214  */
215  inline void open_for_read(sframe_index_file_information frame_index_info) {
216  Dlog_func_entry();
217  ASSERT_MSG(!inited, "Attempting to init an SFrame "
218  "which has already been inited.");
219  inited = true;
220  create_arrays_for_reading(frame_index_info);
221  }
222 
223  /**
224  * Initializes the SFrame with a collection of columns. If the SFrame is
225  * already inited, this will throw an exception. Will throw an exception if
226  * column_names are not unique and fail_on_column_names is true.
227  */
229  const std::vector<std::shared_ptr<sarray<flexible_type> > > &new_columns,
230  const std::vector<std::string>& column_names = {},
231  bool fail_on_column_names=true) {
232  Dlog_func_entry();
233  ASSERT_MSG(!inited, "Attempting to init an SFrame "
234  "which has already been inited.");
235  inited = true;
236  create_arrays_for_reading(new_columns, column_names, fail_on_column_names);
237  }
238 
239  /**
240  * Opens the SFrame with an arbitrary temporary file.
241  * The array must not already been inited.
242  *
243  * \param column_names The name for each column. If the vector is shorter
244  * than column_types, or empty values are given, names are handled with
245  * default names of "X<column id+1>". Each column name must be unique.
246  * This will let you write non-unique column names, but if you do that,
247  * the sframe will throw an exception while constructing the output of
248  * this class.
249  * \param column_types The type of each column expressed as a
250  * flexible_type. Currently this is required to tell how many columns
251  * are a part of the sframe. Throws an exception if this is an empty
252  * vector.
253  * \param nsegments The number of parallel output segments on each
254  * sarray. Throws an exception if this is 0.
255  * \param frame_sidx_file If not specified, an argitrary temporary
256  * file will be created. Otherwise, all frame
257  * files will be written to the same location
258  * as the frame_sidx_file. Must end in
259  * ".frame_idx"
260  * \param fail_on_column_names If true, will throw an exception if any column
261  * names are unique. If false, will
262  * automatically adjust column names so they are
263  * unique.
264  */
265  inline void open_for_write(const std::vector<std::string>& column_names,
266  const std::vector<flex_type_enum>& column_types,
267  const std::string& frame_sidx_file = "",
268  size_t nsegments = SFRAME_DEFAULT_NUM_SEGMENTS,
269  bool fail_on_column_names=true) {
270  Dlog_func_entry();
271  ASSERT_MSG(!inited, "Attempting to init an SFrame "
272  "which has already been inited.");
273  if (column_names.size() != column_types.size()) {
274  log_and_throw(std::string("Names and Types array length mismatch"));
275  }
276  inited = true;
277  create_arrays_for_writing(column_names, column_types,
278  nsegments, frame_sidx_file, fail_on_column_names);
279  }
280 
281 /**************************************************************************/
282 /* */
283 /* Basic Accessors */
284 /* */
285 /**************************************************************************/
286 
287  /**
288  * Returns true if the Array is opened for reading.
289  * i.e. get_reader() will succeed
290  */
291  inline bool is_opened_for_read() const {
292  return (inited && !writing);
293  }
294 
295 
296  /**
297  * Returns true if the Array is opened for writing.
298  * i.e. get_output_iterator() will succeed
299  */
300  inline bool is_opened_for_write() const {
301  return (inited && writing);
302  }
303 
304 
305  /**
306  * Return the index file of the sframe
307  */
308  inline const std::string& get_index_file() const {
309  ASSERT_TRUE(inited);
310  return index_file;
311  }
312 
313 
314  /**
315  * Reads the value of a key associated with the sframe
316  * Returns true on success, false on failure.
317  */
318  inline bool get_metadata(const std::string& key, std::string &val) const {
319  bool ret;
320  std::tie(ret, val) = get_metadata(key);
321  return ret;
322  }
323 
324 
325  /**
326  * Reads the value of a key associated with the sframe
327  * Returns a pair of (true, value) on success, and (false, empty_string)
328  * on failure.
329  */
330  inline std::pair<bool, std::string> get_metadata(const std::string& key) const {
331  ASSERT_MSG(inited, "Invalid SFrame");
332  if (index_info.metadata.count(key)) {
333  return std::pair<bool, std::string>(true, index_info.metadata.at(key));
334  } else {
335  return std::pair<bool, std::string>(false, "");
336  }
337  }
338 
339 
340  /// Returns the number of columns in the SFrame. Does not throw.
341  inline size_t num_columns() const {
342  return index_info.ncolumns;
343  }
344 
345  /// Returns the length of each sarray.
346  inline size_t num_rows() const {
347  return size();
348  }
349 
350 
351  /**
352  * Returns the number of elements in the sframe. If the sframe was not initialized, returns 0.
353  */
354  inline size_t size() const {
355  return inited ? index_info.nrows : 0;
356  }
357 
358  /**
359  * Returns the name of the given column. Throws an exception if the
360  * column id is out of range.
361  */
362  inline std::string column_name(size_t i) const {
363  if(i >= index_info.column_names.size()) {
364  log_and_throw("Column index out of range!");
365  }
366 
367  return index_info.column_names[i];
368  }
369 
370  /**
371  * Returns the type of the given column. Throws an exception if the
372  * column id is out of range.
373  */
374  inline flex_type_enum column_type(size_t i) const {
375  if (writing) {
376  if(i >= group_writer->get_index_info().columns.size()) {
377  log_and_throw("Column index out of range!");
378  }
379  return (flex_type_enum)
380  (atoi(group_writer->get_index_info().columns[i].metadata["__type__"].c_str()));
381  } else {
382  if(i >= columns.size()) {
383  log_and_throw("Column index out of range!");
384  }
385  return columns[i]->get_type();
386  }
387  }
388 
389  /**
390  * Returns the type of the given column. Throws an exception if the
391  * column id is out of range.
392  * \overload
393  */
394  inline flex_type_enum column_type(const std::string& column_name) const {
395  return columns[column_index(column_name)]->get_type();
396  }
397 
398 
399  /** Returns the column names as a single vector.
400  */
401  inline const std::vector<std::string>& column_names() const {
402  return index_info.column_names;
403  }
404 
405  /** Returns the column types as a single vector.
406  */
407  inline std::vector<flex_type_enum> column_types() const {
408  std::vector<flex_type_enum> tv(num_columns());
409  for(size_t i = 0; i < num_columns(); ++i)
410  tv[i] = column_type(i);
411 
412  return tv;
413  }
414 
415  /**
416  * Returns true if the sframe contains the given column.
417  */
418  inline bool contains_column(const std::string& column_name) const {
419  Dlog_func_entry();
420  auto iter = std::find(index_info.column_names.begin(),
421  index_info.column_names.end(),
422  column_name);
423  return iter != index_info.column_names.end();
424  }
425 
426  /**
427  * Returns the number of segments that this SFrame will be
428  * written with. Never fails.
429  */
430  inline size_t num_segments() const {
431  ASSERT_MSG(inited, "Invalid SFrame");
432  if (writing) {
433  return group_writer->num_segments();
434  } else {
435  if (index_info.ncolumns == 0) return 0;
436  return columns[0]->num_segments();
437  }
438  }
439 
440  /**
441  * Return the number of segments in the collection.
442  * Will throw an exception if the writer is invalid (there is an error
443  * opening/writing files)
444  */
445  inline size_t segment_length(size_t i) const {
446  DASSERT_MSG(inited, "Invalid SFrame");
447  if (index_info.ncolumns == 0) return 0;
448  else return columns[0]->segment_length(i);
449  }
450 
451 
452  /**
453  * Returns the column index of column_name.
454  *
455  * Throws an exception of the column_ does not exist.
456  */
457  inline size_t column_index(const std::string& column_name) const {
458  auto iter = std::find(index_info.column_names.begin(),
459  index_info.column_names.end(),
460  column_name);
461  if (iter != index_info.column_names.end()) {
462  return (iter) - index_info.column_names.begin();
463  } else {
464  log_and_throw(std::string("Column name " + column_name + " does not exist."));
465  }
466  __builtin_unreachable();
467  }
468 
469  /**
470  * Returns the current index info of the array.
471  */
473  return index_info;
474  }
475 
476  /**
477  * Merges another SFrame with the same schema with the current SFrame
478  * returning a new SFrame.
479  * Both SFrames can be empty, but cannot be opened for writing.
480  */
481  sframe append(const sframe& other) const;
482 
483 
484  /**
485  * Gets an sframe reader object with the segment layout of the first column.
486  */
487  std::unique_ptr<reader_type> get_reader() const;
488 
489 
490  /**
491  * Gets an sframe reader object with num_segments number of logical segments.
492  */
493  std::unique_ptr<reader_type> get_reader(size_t num_segments) const;
494 
495 
496  /**
497  * Gets an sframe reader object with a custom segment layout. segment_lengths
498  * must sum up to the same length as the original array.
499  */
500  std::unique_ptr<reader_type> get_reader(const std::vector<size_t>& segment_lengths) const;
501 
502 /**************************************************************************/
503 /* */
504 /* Other SFrame Unique Accessors */
505 /* */
506 /**************************************************************************/
507 
508  /**
509  * Converts the sframe into a dataframe_t. Will reset iterators before
510  * and after the operation.
511  */
513 
514  /**
515  * Returns an sarray of the specific column.
516  *
517  * Throws an exception if the column does not exist.
518  */
519  std::shared_ptr<sarray<flexible_type> > select_column(size_t column_id) const;
520 
521  /**
522  * Returns an sarray of the specific column by name.
523  *
524  * Throws an exception if the column does not exist.
525  */
526  std::shared_ptr<sarray<flexible_type> > select_column(const std::string &name) const;
527 
528  /**
529  * Returns new sframe containing only the chosen columns in the same order.
530  * The new sframe is "ephemeral" in that it is not backed by an index
531  * on disk.
532  *
533  * Throws an exception if the column name does not exist.
534  */
535  sframe select_columns(const std::vector<std::string>& names) const;
536 
537  /**
538  * Returns a new ephemeral SFrame with the new column added to the end.
539  * The new sframe is "ephemeral" in that it is not backed by an index
540  * on disk.
541  *
542  * \param sarr_ptr Shared pointer to the SArray
543  * \param column_name The name to give this column. If empty it will
544  * be given a default name (X<column index>)
545  *
546  */
547  sframe add_column(std::shared_ptr<sarray<flexible_type> > sarr_ptr,
548  const std::string& column_name=std::string("")) const;
549 
550 
551 
552  /**
553  * Set the ith column name to name. This can be done when the
554  * frame is open in either reading or writing mode. Changes are ephemeral,
555  * and do not affect what is stored on disk.
556  */
557  void set_column_name(size_t column_id, const std::string& name);
558 
559  /**
560  * Returns a new ephemeral SFrame with the column removed.
561  * The new sframe is "ephemeral" in that it is not backed by an index
562  * on disk.
563  *
564  * \param column_id The index of the column to remove.
565  *
566  */
567  sframe remove_column(size_t column_id) const;
568 
569 
570  /**
571  * Returns a new ephemeral SFrame with two columns swapped.
572  * The new sframe is "ephemeral" in that it is not backed by an index
573  * on disk.
574  *
575  * \param column_1 The index of the first column.
576  * \param column_2 The index of the second column.
577  *
578  */
579  sframe swap_columns(size_t column_1, size_t column_2) const;
580 
581  /**
582  * Replace the column of the given column name with a new sarray.
583  * Return the new sframe with old column_name sarray replaced by the new sarray.
584  */
585  sframe replace_column(std::shared_ptr<sarray<flexible_type>> sarr_ptr,
586  const std::string& column_name) const;
587 
588 /**************************************************************************/
589  /* */
590 /* Writing Functions */
591 /* */
592 /**************************************************************************/
593 // These functions are only valid when the array is opened for writing
594 
595  /**
596  * Sets the number of segments in the output.
597  * Frame must be first opened for writing.
598  * Once an output iterator has been obtained, the number of segments
599  * can no longer be changed. Returns true on success, false on failure.
600  */
601  bool set_num_segments(size_t numseg);
602 
603  /**
604  * Gets an output iterator for the given segment. This can be used to
605  * write data to the segment, and is currently the only supported way
606  * to do so.
607  *
608  * The iterator is invalid once the segment is closed (See \ref close).
609  * Accessing the iterator after the writer is destroyed is undefined
610  * behavior.
611  *
612  * Cannot be called until the sframe is open.
613  *
614  * Example:
615  * \code
616  * // example to write the same vector to 7 rows of segment 1
617  * // let's say the sframe has 5 columns of type FLEX_TYPE_ENUM::INTEGER
618  * // and sfw is the sframe.
619  * auto iter = sfw.get_output_iterator(1);
620  * std::vector<flexible_type> vals{1,2,3,4,5};
621  * for(int i = 0; i < 7; ++i) {
622  * *iter = vals;
623  * ++iter;
624  * }
625  * \endcode
626  */
627  iterator get_output_iterator(size_t segmentid);
628 
629  /**
630  * Closes the sframe. close() also implicitly closes all segments. After
631  * the writer is closed, no segments can be written.
632  * After the sframe is closed, it becomes read only and can be read
633  * with the get_reader() function.
634  */
635  void close();
636 
637  /**
638  * Flush writes for a particular segment
639  */
640  void flush_write_to_segment(size_t segment);
641 
642  /**
643  * Saves a copy of the current sframe as a CSV file.
644  * Does not modify the current sframe.
645  *
646  * \param csv_file target CSV file to save into
647  * \param writer The CSV writer configuration
648  */
649  void save_as_csv(std::string csv_file,
650  csv_writer& writer);
651 
652  /**
653  * Adds meta data to the frame.
654  * Frame must be first opened for writing.
655  */
656  bool set_metadata(const std::string& key, std::string val);
657 
658  /**
659  * Saves a copy of the current sframe into a different location.
660  * Does not modify the current sframe.
661  */
662  void save(std::string index_file) const;
663 
664  /**
665  * SFrame serializer. oarc must be associated with a directory.
666  * Saves into a prefix inside the directory.
667  */
668  void save(oarchive& oarc) const;
669 
670 
671  /**
672  * Attempts to compact if the number of segments in the SArray
673  * exceeds SFRAME_COMPACTION_THRESHOLD.
674  */
675  void try_compact();
676 
677  /**
678  * SFrame deserializer. iarc must be associated with a directory.
679  * Loads from the next prefix inside the directory.
680  */
681  void load(iarchive& iarc);
682 
683  bool delete_files_on_destruction();
684 
685  /**
686  * Internal API.
687  * Used to obtain the internal writer object.
688  */
689  inline
690  std::shared_ptr<sarray_group_format_writer<flexible_type> >
692  return group_writer;
693  }
694  private:
695 
696  /**
697  * Clears all internal structures. Used by \ref create_arrays_for_reading
698  * and \ref create_arrays_for_writing to clear all the index information
699  * and column information
700  */
701  void reset();
702 
703  /**
704  * Internal function that actually writes the values to each SArray's
705  * output iterator. Used by the sframe_output_iterator.
706  */
707  void write(size_t segmentid, const std::vector<flexible_type>& t);
708 
709 
710  /**
711  * Internal function that actually writes the values to each SArray's
712  * output iterator. Used by the sframe_output_iterator.
713  */
714  void write(size_t segmentid, std::vector<flexible_type>&& t);
715 
716 
717  /**
718  * Internal function that actually writes the values to each SArray's
719  * output iterator. Used by the sframe_output_iterator.
720  */
721  void write(size_t segmentid, const sframe_rows& t);
722 
723  /**
724  * Internal function. Given the index_information, this function
725  * initializes each of the sarrays for reading; filling up
726  * the columns array
727  */
728  void create_arrays_for_reading(sframe_index_file_information frame_index_info);
729 
730  /**
731  * Internal function. Given a collection of sarray columns, this function
732  * makes an sframe representing the combination of all the columns. This
733  * sframe does not have an index file (it is ephemeral), and get_index_file
734  * will return an empty file. Will throw an exception if column_names are not
735  * unique and fail_on_column_names is true.
736  */
737  void create_arrays_for_reading(
738  const std::vector<std::shared_ptr<sarray<flexible_type> > > &columns,
739  const std::vector<std::string>& column_names = {},
740  bool fail_on_column_names=true);
741  /**
742  * Internal function. Given the index_file, this function initializes each of
743  * the sarrays for writing; filling up the columns array. Will throw an
744  * exception if column_names are not unique and fail_on_column_names is true.
745  */
746  void create_arrays_for_writing(const std::vector<std::string>& column_names,
747  const std::vector<flex_type_enum>& column_types,
748  size_t nsegments,
749  const std::string& frame_sidx_file,
750  bool fail_on_column_names);
751 
752  void keep_array_file_ref();
753  /**
754  * Internal function. Resolve conflicts in column names.
755  */
756  std::string generate_valid_column_name(const std::string &column_name) const;
757 
759  std::string index_file;
760  std::vector<std::shared_ptr<fileio::file_ownership_handle> > index_file_handle;
761 
762  std::vector<std::shared_ptr<sarray<flexible_type> > > columns;
763  std::shared_ptr<sarray_group_format_writer<flexible_type> > group_writer;
764 
765  mutex lock;
766 
767  bool inited = false;
768  bool writing = false;
769  friend class sframe_reader;
770 
771 public:
772  /**
773  * For debug purpose, print the information about the sframe.
774  */
775  void debug_print();
776 };
777 
778 /// \}
779 } // end of namespace
780 #endif
781 
782 
783 #include <core/storage/sframe_data/sframe_reader.hpp>
size_t ncolumns
The number of columns in the frame.
void try_compact()
sframe_index_file_information read_sframe_index_file(std::string index_file)
sframe(sframe_index_file_information frame_index_info)
Definition: sframe.hpp:136
void set_column_name(size_t column_id, const std::string &name)
size_t nrows
The number of rows in the frame.
The serialization input archive object which, provided with a reference to an istream, will read from the istream, providing deserialization capabilities.
Definition: iarchive.hpp:60
void save(std::string index_file) const
std::pair< bool, std::string > get_metadata(const std::string &key) const
Definition: sframe.hpp:330
void debug_print()
void open_for_read(const std::vector< std::shared_ptr< sarray< flexible_type > > > &new_columns, const std::vector< std::string > &column_names={}, bool fail_on_column_names=true)
Definition: sframe.hpp:228
flex_type_enum column_type(size_t i) const
Definition: sframe.hpp:374
std::shared_ptr< sarray_group_format_writer< flexible_type > > get_internal_writer()
Definition: sframe.hpp:691
size_t column_index(const std::string &column_name) const
Definition: sframe.hpp:457
const std::vector< std::string > & column_names() const
Definition: sframe.hpp:401
sframe remove_column(size_t column_id) const
bool set_metadata(const std::string &key, std::string val)
sframe swap_columns(size_t column_1, size_t column_2) const
size_t size() const
Definition: sframe.hpp:354
void load(iarchive &iarc)
sframe(const std::vector< std::shared_ptr< sarray< flexible_type > > > &new_columns, const std::vector< std::string > &column_names={}, bool fail_on_column_names=true)
Definition: sframe.hpp:159
sframe add_column(std::shared_ptr< sarray< flexible_type > > sarr_ptr, const std::string &column_name=std::string("")) const
size_t num_rows() const
Returns the length of each sarray.
Definition: sframe.hpp:346
iterator get_output_iterator(size_t segmentid)
sframe_output_iterator iterator
The iterator type which get_output_iterator returns.
Definition: sframe.hpp:74
bool is_opened_for_read() const
Definition: sframe.hpp:291
const std::string & get_index_file() const
Definition: sframe.hpp:308
size_t SFRAME_DEFAULT_NUM_SEGMENTS
sframe(sframe &&other)
Definition: sframe.hpp:102
size_t segment_length(size_t i) const
Definition: sframe.hpp:445
bool contains_column(const std::string &column_name) const
Definition: sframe.hpp:418
sframe replace_column(std::shared_ptr< sarray< flexible_type >> sarr_ptr, const std::string &column_name) const
sframe & operator=(const sframe &other)
sframe select_columns(const std::vector< std::string > &names) const
const sframe_index_file_information get_index_info() const
Definition: sframe.hpp:472
std::shared_ptr< sarray< flexible_type > > select_column(size_t column_id) const
#define ASSERT_TRUE(cond)
Definition: assertions.hpp:309
size_t num_segments() const
Definition: sframe.hpp:430
std::map< std::string, std::shared_ptr< sarray< flexible_type > > > init_from_csvs(const std::string &path, csv_line_tokenizer &tokenizer, bool use_header, bool continue_on_failure, bool store_errors, std::map< std::string, flex_type_enum > column_type_hints, std::vector< std::string > output_columns=std::vector< std::string >(), size_t row_limit=0, size_t skip_rows=0)
void open_for_write(const std::vector< std::string > &column_names, const std::vector< flex_type_enum > &column_types, const std::string &frame_sidx_file="", size_t nsegments=SFRAME_DEFAULT_NUM_SEGMENTS, bool fail_on_column_names=true)
Definition: sframe.hpp:265
std::map< std::string, std::string > metadata
Any additional metadata stored with the frame.
std::string column_name(size_t i) const
Definition: sframe.hpp:362
sframe_reader reader_type
The reader type.
Definition: sframe.hpp:71
std::unique_ptr< reader_type > get_reader() const
bool get_metadata(const std::string &key, std::string &val) const
Definition: sframe.hpp:318
size_t num_columns() const
Returns the number of columns in the SFrame. Does not throw.
Definition: sframe.hpp:341
void open_for_read(sframe_index_file_information frame_index_info)
Definition: sframe.hpp:215
bool set_num_segments(size_t numseg)
std::vector< flex_type_enum > column_types() const
Definition: sframe.hpp:407
The serialization output archive object which, provided with a reference to an ostream, will write to the ostream, providing serialization capabilities.
Definition: oarchive.hpp:80
void flush_write_to_segment(size_t segment)
bool is_opened_for_write() const
Definition: sframe.hpp:300
dataframe_t to_dataframe()
sframe(std::string frame_idx_file)
Definition: sframe.hpp:128
std::vector< std::string > column_names
The names of each column. The length of this must match ncolumns.
void save_as_csv(std::string csv_file, csv_writer &writer)
std::vector< flexible_type > value_type
The type contained in the sframe.
Definition: sframe.hpp:77
sframe append(const sframe &other) const
flex_type_enum column_type(const std::string &column_name) const
Definition: sframe.hpp:394