Turi Create  4.0
gl_sframe.hpp
1 /* Copyright © 2017 Apple Inc. All rights reserved.
2  *
3  * Use of this source code is governed by a BSD-3-clause license that can
4  * be found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause
5  */
6 #ifndef TURI_UNITY_GL_SFRAME_HPP
7 #define TURI_UNITY_GL_SFRAME_HPP
8 #include <cmath>
9 #include <memory>
10 #include <cstddef>
11 #include <string>
12 #include <iostream>
13 #include <map>
14 #include <core/data/flexible_type/flexible_type.hpp>
15 #include <core/storage/sframe_data/group_aggregate_value.hpp>
16 #include <core/storage/sframe_data/sframe_rows.hpp>
17 #include "gl_sarray.hpp"
18 namespace turi {
19 class unity_sarray;
20 class unity_sframe;
21 class unity_sframe_base;
22 class gl_sarray;
23 class sframe;
24 class sframe_reader;
25 class sframe_reader_buffer;
26 
27 class gl_sframe_range;
28 class gl_sarray_reference;
29 class const_gl_sarray_reference;
30 
31 typedef std::map<std::string, flex_type_enum> str_flex_type_map;
32 typedef std::map<std::string, flexible_type> csv_parsing_config_map;
33 typedef std::map<std::string, std::string> string_map;
34 typedef std::map<std::string, std::shared_ptr<unity_sarray_base>> csv_parsing_errors;
35 
36 /**
37  * \ingroup group_glsdk
38  * \brief All the available groupby aggregators.
39  * See \ref gl_sframe::groupby for details.
40  */
41 namespace aggregate {
42 
43 /**
44  * Describing an aggregate operation on a set of columns.
45  *
46  * An object of groupby_descriptor_type can be constructed
47  * using functions such as \ref COUNT, \ref SUM, etc for builtin aggregations,
48  * or using \ref make_aggregator for customized aggregators.
49  */
51 
52  // default constructor
54 
55  // constructor for builtin operators
56  groupby_descriptor_type(const std::string& builtin_operator_name,
57  const std::vector<std::string>& group_columns);
58 
59  // constructor for custom operators
60  groupby_descriptor_type(std::shared_ptr<group_aggregate_value> aggregator,
61  const std::vector<std::string>& group_columns);
62 
63  /// columns as input into the aggregator
64  std::vector<std::string> m_group_columns;
65 
66  /// aggregator
67  std::shared_ptr<group_aggregate_value> m_aggregator;
68 };
69 
70 /**
71  * Create a groupby_descriptor_type of user defined groupby aggregator type T.
72  *
73  * \param group_columns A vector of column names expected by the groupby aggregator.
74  * \param const Args&... Extra argument to construct T
75  *
76  * \code
77  * class my_aggregator : public group_aggregate_value {
78  * // default constructible
79  * my_aggregator();
80  *
81  * ...
82  * };
83  *
84  * auto aggregator1 = make_aggregator<my_aggregator>({"col1"});
85  *
86  * class my_complicated_aggregator : public group_aggregate_value {
87  * // constructor requires extra arguments
88  * my_complicated_aggregator(const std::vector<double>& initial_values);
89  *
90  * ...
91  * };
92  *
93  * std::vector<double> initial_values {1,2,3};
94  * auto aggregator2 = make_aggregator<my_aggregator>({"col1", "col2"},
95  * initial_values);
96  * \endcode
97  */
98 template<typename T, typename... Args>
99 groupby_descriptor_type make_aggregator(const std::vector<std::string>& group_columns,
100  const Args&... args){
101  static_assert(std::is_base_of<group_aggregate_value, T>::value,
102  "T must inherit from group_aggregate_value");
103  auto aggregator = std::make_shared<T>(&args...);
104  return groupby_descriptor_type(aggregator, group_columns);
105 };
106 
107 /**
108  * Builtin sum aggregator for groupby
109  *
110  * Example: Get the sum of the rating column for each user.
111  * \code
112  * sf.groupby({"user"},
113  * {{"rating_sum",aggregate::SUM("rating")}});
114  * \endcode
115  *
116  * \see gl_sframe::groupby
117  */
118 groupby_descriptor_type SUM(const std::string& col);
119 
120 /**
121  * Builtin max aggregator for groupby.
122  *
123  * Example: Get the max of the rating column for each user.
124  * \code
125  * sf.groupby({"user"},
126  * {{"rating_max",aggregate::MAX("rating")}});
127  * \endcode
128  *
129  * \see gl_sframe::groupby
130  */
131 groupby_descriptor_type MAX(const std::string& col);
132 
133 /**
134  * Builtin min aggregator for groupby.
135  *
136  * Example: Get the min of the rating column for each user.
137  * \code
138  * sf.groupby({"user"},
139  * {{"rating_min",aggregate::MIN("rating")}});
140  * \endcode
141  *
142  * \see gl_sframe::groupby
143  */
144 groupby_descriptor_type MIN(const std::string& col);
145 
146 /**
147  * Builtin count aggregator for groupby.
148  *
149  * Example: Get the number of occurences of each user
150  * \code
151  * sf.groupby({"user"},
152  * {{"rating_count",aggregate::COUNT()}});
153  * \endcode
154  *
155  * \see gl_sframe::groupby
156  */
158 
159 
160 /**
161  * Builtin average aggregator for groupby.
162  *
163  * Synonym for \ref aggregate::AVG.
164  *
165  * Example: Get the average rating of each user.
166  * \code
167  * sf.groupby({"user"},
168  * {{"rating_avg",aggregate::AVG("rating")}});
169  * \endcode
170  *
171  * \see gl_sframe::groupby
172  */
173 groupby_descriptor_type MEAN(const std::string& col);
174 
175 /**
176  * Builtin average aggregator for groupby.
177  *
178  * Synonym for \ref aggregate::MEAN.
179  *
180  * Example: Get the average rating of each user.
181  * \code
182  * sf.groupby({"user"},
183  * {{"rating_avg",aggregate::AVG("rating")}});
184  * \endcode
185  *
186  * \see gl_sframe::groupby
187  */
188 groupby_descriptor_type AVG(const std::string& col);
189 
190 
191 /**
192  * Builtin variance aggregator for groupby.
193  *
194  * Synonym for \ref aggregate::VARIANCE
195  *
196  * Example: Get the rating variance of each user.
197  * \code
198  * sf.groupby({"user"},
199  * {{"rating_var",aggregate::VAR("rating")}});
200  * \endcode
201  *
202  * \see aggregate::VARIANCE
203  * \see aggregate::STD
204  * \see gl_sframe::groupby
205  */
206 groupby_descriptor_type VAR(const std::string& col);
207 
208 /**
209  * Builtin variance aggregator for groupby.
210  *
211  * Synonym for \ref aggregate::VAR.
212  *
213  * Example: Get the rating variance of each user.
214  * \code
215  * sf.groupby({"user"},
216  * {{"rating_var",aggregate::VARIANCE("rating")}});
217  * \endcode
218  *
219  * \see aggregate::VAR
220  * \see aggregate::STD
221  * \see gl_sframe::groupby
222  */
223 groupby_descriptor_type VARIANCE(const std::string& col);
224 
225 /**
226  * Builtin standard deviation aggregator for groupby.
227  *
228  * Synonym for \ref aggregate::STDV.
229  *
230  * Example: Get the rating standard deviation of each user.
231  * \code
232  * sf.groupby({"user"},
233  * {{"rating_std",aggregate::STD("rating")}});
234  * \endcode
235  *
236  * \see aggregate::STDV
237  * \see aggregate::VAR
238  * \see gl_sframe::groupby
239  */
240 groupby_descriptor_type STD(const std::string& col);
241 
242 
243 /**
244  * Builtin standard deviation aggregator for groupby.
245  *
246  * Synonym for \ref aggregate::STD.
247  *
248  * Example: Get the rating standard deviation of each user.
249  * \code
250  * sf.groupby({"user"},
251  * {{"rating_std",aggregate::STDV("rating")}});
252  * \endcode
253  *
254  * \see turi::aggregate::STD
255  * \see turi::aggregate::VAR
256  * \see gl_sframe::groupby
257  */
258 groupby_descriptor_type STDV(const std::string& col);
259 
260 /**
261  * Builtin aggregator for groupby which selects one row in the group.
262  *
263  * Example: Get one rating row from a user.
264  * \code
265  * sf.groupby({"user"},
266  * {{"rating",aggregate::SELECT_ONE("rating")}});
267  * \endcode
268  *
269  * If multiple columns are selected, they are guaranteed to come from the
270  * same row. For instance:
271  * \code
272  * sf.groupby({"user"},
273  * {{"rating", aggregate::SELECT_ONE("rating")},
274  * {"item", aggregate::SELECT_ONE("item")}});
275  * \endcode
276  *
277  * The selected "rating" and "item" value for each user will come from the
278  * same row in the \ref gl_sframe.
279  */
280 groupby_descriptor_type SELECT_ONE(const std::string& col);
281 
282 /**
283  * Builtin distinct count aggregator for groupby.
284  *
285  * Example: Get the number of unique movies rated by each user.
286  * \code
287  * sf.groupby({"user"},
288  * {{"num_unique_movies", aggregate::COUNT_DISTINCT("movie")}});
289  * \endcode
290  */
291 groupby_descriptor_type COUNT_DISTINCT(const std::string& col);
292 
293 ///@{
294 /**
295  * Builtin aggregator that combines values from one or two columns in one group
296  * into either a dictionary value, list value or array value.
297  *
298  * For example, to combine values from two columns that belong to one group into
299  * one dictionary value:
300  * \code
301  * sf.groupby({"document"},
302  * {{"word_count", aggregate::CONCAT("word", "count")}});
303  * \endcode
304  *
305  * To combine values from one column that belong to one group into a list value:
306  * \code
307  * sf.groupby({"user"},
308  * {{"friends", aggregate::CONCAT("friend")}});
309  * \endcode
310  */
311 groupby_descriptor_type CONCAT(const std::string& col);
312 groupby_descriptor_type CONCAT(const std::string& key, const std::string& value);
313 ///@}
314 
315 ///@{
316 /**
317  * Builtin approximate quantile aggregator for groupby.
318  *
319  * Accepts either a single quantile or a list of quantiles to query.
320  *
321  * To extract the median:
322  * \code
323  * sf.groupby({"user"},
324  * {{"rating_quantiles", aggregate::QUANTILE("rating", 0.5)}});
325  * \endcode
326  *
327  * To extract a few quantiles:
328  * \code
329  * sf.groupby({"user"},
330  * {{"rating_quantiles", aggregate::QUANTILE("rating", {0.25,0.5,0.75})}});
331  * \endcode
332  *
333  * Or equivalently
334  * \code
335  * sf.groupby({"user"},
336  * {{"rating_quantiles", aggregate::QUANTILE("rating", std::vector<double>{0.25,0.5,0.75})}});
337  * \endcode
338  *
339  * The returned quantiles are guaranteed to have 0.5% accuracy. That is to say,
340  * if the requested quantile is 0.50, the resultant quantile value may be
341  * between 0.495 and 0.505 of the true quantile.
342  */
343 groupby_descriptor_type QUANTILE(const std::string& col, double quantile);
344 groupby_descriptor_type QUANTILE(const std::string& col, const std::vector<double>& quantiles);
345 ///@}
346 
347 /**
348  * Builtin arg maximum aggregator for groupby.
349  *
350  * Example: Get the movie with maximum rating per user.
351  * \code
352  * sf.groupby({"user"},
353  * {{"best_movie", aggregate::ARGMAX("rating","movie")}});
354  * \endcode
355  *
356  */
357 groupby_descriptor_type ARGMAX(const std::string& agg, const std::string& out);
358 
359 /**
360  * Builtin arg minimum aggregator for groupby.
361  *
362  * Example: Get the movie with minimum rating per user.
363  * \code
364  * sf.groupby({"user"},
365  * {{"worst_movie", aggregate::ARGMIN("rating","movie")}});
366  * \endcode
367  */
368 groupby_descriptor_type ARGMIN(const std::string& agg, const std::string& out);
369 
370 } // aggregate
371 
372 /**
373  * \ingroup group_glsdk
374  * A tabular, column-mutable dataframe object that can scale to big data.
375  *
376  * The data in \ref gl_sframe is stored column-wise on persistent
377  * storage (e.g. disk) to avoid being
378  * constrained by memory size. Each column in an \ref gl_sframe is an
379  * immutable \ref gl_sarray, but \ref gl_sframe objects
380  * are mutable in that columns can be added and subtracted with ease.
381  * An \ref gl_sframe essentially acts as an ordered dictionary of \ref
382  * gl_sarray objects.
383  * Usage:
384  *
385  * ### Usage
386  *
387  * The gl_sframe API is designed to very closely mimic the Python SFrame API
388  * and supports much of the Python-like capabilities, but in C++.
389  *
390  * Column Creation And Referencing
391  * \code
392  * gl_sframe sf;
393  * sf["a"] = gl_sarray{1,2,3,4,5};
394  * gl_sarray a_5_element_sarray{1,1,1,1,1};
395  * sf["b"] = a_5_element_sarray;
396  * gl_sarray some_other_sarray{2,2,2,2,2};
397  * sf["c"] = sf["a"] / sf["b"] + some_other_sarray;
398  * \endcode
399  *
400  * Logical Filter:
401  * \code
402  * gl_sframe sf{{"a", {1,2,3,4,5}},
403  * {"b", {"1","2","3","4","5"}}};
404  * gl_sframe t = sf[sf["a"] < 3];
405  * // t now has 2 columns. a: [1,2] b:["1","2"]
406  * \endcode
407  *
408  * Python Range Slicing:
409  * \code
410  * gl_sframe sf{{"a", {1,2,3,4,5}},
411  * {"b", {"1","2","3","4","5"}}};
412  * t = sf[{0,3}];
413  * // t is the first 3 rows of sf
414  * \endcode
415  *
416  * And many others.
417  *
418  * The gl_sframe can be read \b inefficiently using operator[]
419  * \code
420  * gl_sframe sf{{"a", {1,2,3,4,5}},
421  * {"b", {"1","2","3","4","5"}}};
422  * std::vector<flexible_type> val = sf[2];
423  * // val[0] == 3, val[1] == "3"
424  * \endcode
425  *
426  * Or iterated efficiently using the \ref range_iterator
427  * \code
428  * for (const auto& i: sf.range_iterator()) {
429  * ...
430  * }
431  * \endcode
432  *
433  * Note that using "auto" above is more efficient than using vector<flexible_type>
434  * \code
435  * for (const std::vector<flexible_type> & i: sf.range_iterator()) {
436  * \endcode
437  *
438  * The range_iterator materializes the SFrame if not already materialized, but
439  * \ref materialize_to_callback can be used to read the SFrame without
440  * materialization.
441  *
442  * The gl_sframe can be constructed in a variety of ways:
443  * - If the data to be written is already in memory, it can be created
444  * using the
445  * \ref gl_sframe::gl_sframe(const std::map<std::string, std::vector<flexible_type> >& data)
446  * "gl_sframe constructor"
447  * - Otherwise, the \ref gl_sframe_writer can be used which provides a simple
448  * write interface.
449  *
450  * ### Python Binding
451  *
452  * When used as an input argument in an SDK function, it permits a Python SFrame
453  * to be passed as an argument. When used in an output argument, it will return
454  * a Python SFrame.
455  *
456  * For instance:
457  * \code
458  * //
459  * // Compiled as example.so
460  * //
461  * gl_sframe add_ones_column(const gl_sframe& data) {
462  * gl_sframe sf = data;
463  * sf["ones"] = 1;
464  * return sf;
465  * }
466  * BEGIN_FUNCTION_REGISTRATION
467  * REGISTER_FUNCTION(add_ones_column, "data");
468  * END_FUNCTION_REGISTRATION
469  * \endcode
470  *
471  * Will allow this to be done in Python:
472  * \code{.py}
473  * import turicreate as gl
474  * import example
475  * sa = gl.SFrame({"a":[1,2,3,4,5]})
476  * ret = example.add_ones_column(sa)
477  * # ret now has two columns. "a":[1,2,3,4,5] and "ones":[1,1,1,1,1]
478  * \endcode
479  *
480  * ### Details
481  *
482  * The gl_sframe is also lazy evaluated behind the scenes to minimize disk
483  * access. This may have the unfortunate effect of hiding errors until
484  * materialization is forced to occur. i.e. it might be some time much later in
485  * your code that errors will trigger.
486  *
487  * However, not all operations are lazy and certain operations will force
488  * materialization, and that is a constant target for optimization.
489  *
490  * If you want to force materialization yourself, use \ref materialize()
491  */
492 class gl_sframe {
493  public:
494  /// Constructs an empty gl_sframe.
495  gl_sframe();
496  /// Copy Constructor
497  gl_sframe(const gl_sframe&);
498  /// Move Constructor
499  gl_sframe(gl_sframe&&);
500 
501  /**
502  * Constructs a gl_sframe from a binary SFrame saved previously with
503  * \ref save().
504  *
505  * \see save()
506  */
507  explicit gl_sframe(const std::string& directory);
508 
509  void construct_from_sframe_index(const std::string& directory);
510 
511  /**
512  * Constructs a gl_sframe from a csv file
513  */
514  void construct_from_csvs(std::string csv_file, csv_parsing_config_map csv_config,
515  str_flex_type_map column_type_hints);
516 
517  /// Copy assignment
518  gl_sframe& operator=(const gl_sframe&);
519  /// Move assignment
520  gl_sframe& operator=(gl_sframe&&);
521 
522  /**
523  * Show a visualization of the SFrame.
524  */
525  void show(const std::string& path_to_client) const;
526 
527  /**
528  * Return a plot object of the SFrame (same visualization as `show`)
529  */
530  std::shared_ptr<model_base> plot() const;
531 
532  /**
533  * Constructs a gl_sframe from an in-memory map of values
534  * \code
535  * std::vector<flexible_type> a{1,2,3,4,5};
536  * std::vector<flexible_type> a_str{"1","2","3","4","5"};
537  * std::map<std::string, std::vector<flexible_type>> cols;
538  * cols["a"] = a;
539  * cols["a_str"] = a_str;
540  * gl_sframe sf(cols);
541  * \endcode
542  *
543  * Or, more compactly using C++11 initializer lists:
544  * \code
545  * gl_sframe sf({{"a", a},{"a_str", a_str}});
546  * \endcode
547  */
548  gl_sframe(const std::map<std::string, std::vector<flexible_type> >& data);
549 
550  void construct_from_dataframe(const std::map<std::string, std::vector<flexible_type> >& data);
551 
552  /**
553  * Constructs a gl_sframe from a collection of gl_sarrays.
554  *
555  * \code
556  * gl_sarray a{1,2,3,4,5};
557  * gl_sarray a_str{"1","2","3","4","5"}
558  * std::map<std::string, gl_sarray> cols;
559  * cols["a"] = a;
560  * cols["a_str"] = a_str;
561  * gl_sframe sf(cols);
562  * \endcode
563  *
564  * Or, more compactly using C++11 initializer lists:
565  * \code
566  * gl_sframe sf({{"a", a},{"a_str", a_str}});
567  * \endcode
568  */
569  gl_sframe(const std::map<std::string, gl_sarray>& data);
570 
571  /**
572  * Constructs a gl_sframe from an initializer list of columns.
573  *
574  * \code
575  * gl_sarray a{1,2,3,4,5};
576  * gl_sarray a_str{"1","2","3","4","5"}
577  * gl_sframe sf{{"a", a},{"a_str", a_str}};
578  * \endcode
579  */
580  gl_sframe(std::initializer_list<std::pair<std::string, gl_sarray>>);
581 
582  /// \cond TURI_INTERNAL
583  /**
584  * Implicit conversion from backend unity_sframe object
585  */
586  gl_sframe(std::shared_ptr<unity_sframe> sframe);
587  /**
588  * Implicit conversion from backend unity_sframe_base object
589  */
590  gl_sframe(std::shared_ptr<unity_sframe_base> sframe);
591  /**
592  * Implicit conversion from backend sframe object
593  */
594  gl_sframe(const sframe& sframe);
595 
596  /**
597  * Implicit conversion to backend sframe object
598  */
599  operator std::shared_ptr<unity_sframe>() const;
600  /**
601  * Implicit conversion to backend sframe object
602  */
603  operator std::shared_ptr<unity_sframe_base>() const;
604 
605  /**
606  * Conversion to materialized backend sframe object.
607  */
608  sframe materialize_to_sframe() const;
609  /// \endcond
610 
611 
612  ///@{
613  /**
614  * Returns the row at a particular index; generally inefficient.
615  *
616  * This returns the values of the row at a particular index. Will raise
617  * an exception if the index is out of bounds. This operation is generally
618  * inefficient: the range_iterator() is preferred.
619  */
620  std::vector<flexible_type> operator[](int64_t i);
621  std::vector<flexible_type> operator[](int64_t i) const;
622  ///@}
623 
624  ///@{
625  /**
626  * Performs a slice Python style.
627  *
628  * \param slice A list of 2 or 3 values. If 2 values, this is interpreted as
629  * {start, end} indices, with an implicit value of step = 1.
630  * If 3 values, this is interpreted as {start, step, end}.
631  * Values at the positions [start, start+step, start+2*step, ...] are returned
632  * until end (exclusive) is reached. Negative start and end values are
633  * interpreted as offsets from the end of the array.
634  *
635  * Given a gl_sframe
636  * \code
637  * gl_sarray a{1,2,3,4,5,6,7,8,9,10};
638  * gl_sframe sf{{"a", a}}
639  * \endcode
640  *
641  * Slicing a consecutive range:
642  * \code
643  * auto ret = sf[{1,4}]; // start at index 1, end at index 4
644  * // ret is a gl_sframe with one column a: [2,3,4]
645  * \endcode
646  *
647  * Slicing a range with a step:
648  * \code
649  * auto ret = sf[{1,2,8}]; // start at index 1, end at index 8 with step size 2
650  * // ret is a gl_sframe with one column a: [2,4,6,8]
651  * \endcode
652  *
653  * Using negative indexing:
654  * \code
655  * auto ret = sf[{-3,-1}]; // start at end - 3, end at index end - 1
656  * // ret a gl_sframe with one column a: [8,9]
657  * \endcode
658  */
659  gl_sframe operator[](const std::initializer_list<int64_t>& slice);
660  gl_sframe operator[](const std::initializer_list<int64_t>& slice) const;
661  ///@}
662 
663  /**
664  * Performs a logical filter.
665  *
666  * This function performs a logical filter: i.e. it subselects all the
667  * elements in this array where the corresponding value in the other array
668  * evaluates to true.
669  * \code
670  * gl_sframe sf{{"a", {1,2,3,4,5}},
671  * {"b", {"1","2","3","4","5"}},
672  * {"c", {1.0,2.0,3.0,4.0,5.0}}};
673  * auto ret = sf[sf["a"] > 1 && sf["a"] <= 4];
674  *
675  * // ret is now the sframe with 3 columns:
676  * // a: [2,3,4]
677  * // b: ["2","3","4"]
678  * // c: [2.0,3.0,4.0]
679  * \endcode
680  */
681  gl_sframe operator[](const gl_sarray& logical_filter) const;
682 
683  friend class const_gl_sarray_reference;
684  friend class gl_sarray_reference;
685 
686  /**
687  * \name Column Indexing
688  * \anchor column_indexing
689  *
690  * \brief Selects a single column of the SFrame.
691  *
692  * This returns an internal array reference object that can be used exactly
693  * like a \ref gl_sarray. The design is quite similar to the reference object
694  * used by std::vector<bool> for indexing.
695  *
696  * For instance:
697  *
698  * \code
699  * gl_sframe sf{{"a", {1,2,3,4,5}},
700  * {"b", {"1","2","3","4","5"}},
701  * {"c", {1.0,2.0,3.0,4.0,5.0}}};
702  * gl_sarray t = sf["a"]; // takes out column "a"
703  * \endcode
704  *
705  * However, this operator can also be used for modifying existing columns,
706  * or creating new columns. For instance:
707  *
708  * \code
709  * gl_sframe sf{{"a", {1,2,3,4,5}},
710  * {"b", {"1","2","3","4","5"}},
711  * {"c", {1.0,2.0,3.0,4.0,5.0}}};
712  * sf["a"] = sf["a"] + 1; // sf["a"] is now {2,3,4,5,6}
713  * sf["d"] = sf["c"] - 1; // sf["d"] is now {0.0,1.0,2.0,3.0,4.0}
714  * \endcode
715  *
716  * Entire constant columns can also be created the same way:
717  * \code
718  * gl_sframe sf{{"a", {1,2,3,4,5}},
719  * {"b", {"1","2","3","4","5"}},
720  * {"c", {1.0,2.0,3.0,4.0,5.0}}};
721  * sf["ones"] = 1;
722  * \endcode
723  *
724  * Since the returned object is meant to be a short-lived reference, the
725  * following is not permitted:
726  * \code
727  * gl_sframe sf{{"a", {1,2,3,4,5}},
728  * {"b", {"1","2","3","4","5"}},
729  * {"c", {1.0,2.0,3.0,4.0,5.0}}};
730  * auto a_col = sf["a"];
731  * \endcode
732  * since "auto" resolves to gl_sarray_reference, which is intentionally not
733  * copy-constructible.
734  *
735  * For functional alternatives, See
736  * \ref replace_add_column,
737  * \ref add_column(const flexible_type&, const std::string&) "add_column",
738  * \ref add_column(const gl_sarray&, const std::string&) "add_column overload".
739  */
740  ///@{
741  const_gl_sarray_reference operator[](const std::string& column) const;
742  gl_sarray_reference operator[](const std::string& column);
743  ///@}
744 
745  //@{
746  /**
747  * \name Multi-Column Indexing
748  * \anchor multi_column_indexing
749  * Subselects a subset of columns, returning an SFrame containing only
750  * those columns.
751  *
752  * \code
753  * gl_sframe sf{{"a", {1,2,3,4,5}},
754  * {"b", {"1","2","3","4","5"}},
755  * {"c", {1.0,2.0,3.0,4.0,5.0}}};
756  * gl_sframe ret = sf[{"a", "b"}]
757  * // ret has 2 columns "a" and "b"
758  * \endcode
759  */
760  gl_sframe operator[](const std::vector<std::string>& columns) const;
761  gl_sframe operator[](const std::initializer_list<std::string>& columns);
762  gl_sframe operator[](const std::initializer_list<std::string>& columns) const;
763  //@}
764 
765  friend class gl_sframe_range;
766 
767 
768 
769  /**
770  * Calls a callback function passing each row of the SFrame.
771  *
772  * This does not materialize the array if not necessary.
773  *
774  * The callback may be called in parallel in which case the argument provides
775  * a thread number. The callback should normally return false, but may return
776  * true at any time to quit the iteration process. It may also throw exceptions
777  * which will be forwarded to the caller of this function.
778  *
779  * Each call to the callback passes:
780  * - a thread id,
781  * - a shared_ptr to an sframe_rows object
782  *
783  * The sframe_rows object looks like a vector<vector<flexible_type>>.
784  * i.e. to look at all the rows, you need to write:
785  *
786  * \code
787  * sf.materialize_to_callback([&](size_t, const std::shared_ptr<sframe_rows>& rows) {
788  * for(const auto& row: *rows) {
789  * // each row looks like an std::vector<flexible_type>
790  * // and can be cast to a vector<flexible_type> if necessary
791  * }
792  * });
793  * \endcode
794  *
795  * \param callback The callback to call
796  * \param nthreads Number of threads. If not specified, #cpus is used
797  */
798  void materialize_to_callback(
799  std::function<bool(size_t, const std::shared_ptr<sframe_rows>&)> callback,
800  size_t nthreads = (size_t)(-1));
801 
802  /**
803  * Returns a one pass range object with begin() and end() iterators.
804  *
805  * This will materialize the array.
806  *
807  * See \ref materialize_to_callback for a lazy version.
808  *
809  * \param start The starting index of the range
810  * \param end The ending index of the range
811  *
812  * \code
813  * // create an SFrame
814  * gl_sframe sf{{"a", {1,2,3,4,5}},
815  * {"b", {"1","2","3","4","5"}},
816  * {"c", {1.0,2.0,3.0,4.0,5.0}}};
817  *
818  * // get a range over the entire frame
819  * auto ra = sf.range_iterator();
820  * auto iter = ra.begin();
821  * for (; iter != ra.end(); ++iter) {
822  * std::vector<flexible_type> val = *iter;
823  * // do something to val
824  * }
825  * \endcode
826  *
827  * Or more compactly with C++11 syntax:
828  * \code
829  * for(const auto& val: sf.range_iterator()) {
830  * std::cout << val[0] << " " << val[1] << " " << val[2] << "\n";
831  * }
832  * \endcode
833  *
834  * The range returned only supports one pass. The outcome of a second call to
835  * begin() is undefined after any iterator is advanced.
836  *
837  * When iterating over a gl_sframe with many columns, if only a small number
838  * of columns are needed, there is a performance benefit to subselecting just
839  * those columns first before iterating.
840  *
841  * i.e. if I only need columns "a" and "b" from the SFrame above:
842  * \code
843  *
844  * for(const auto& val: sf[{"a","b"}].range_iterator()) {
845  * std::cout << val[0] << " " << val[1] << "\n";
846  * }
847  * \endcode
848  *
849  * \see gl_sframe_range
850  */
851  gl_sframe_range range_iterator(size_t start=0, size_t end=(size_t)(-1)) const;
852 
853  /**
854  * Returns the number of rows of the SFrame.
855  *
856  * This may trigger materialization in situations in which the size of the
857  * SFrame is not known. For instance after a logical filter.
858  *
859  * \see has_size
860  */
861  virtual size_t size() const;
862 
863  /**
864  * True if size() == 0.
865  */
866  bool empty() const;
867 
868  /**
869  * Returns whether or not the SFrame has been materialized.
870  *
871  * \see materialize
872  */
873  bool is_materialized() const;
874 
875  /**
876  * Returns true if the size of the SFrame is known. If it is not known,
877  * calling size() may trigger materialization.
878  */
879  bool has_size() const;
880 
881  /**
882  * For a SFrame that is lazily evaluated, force persist this sframe to disk,
883  * committing all lazy evaluated operations.
884  *
885  * \see is_materialized
886  */
887  void materialize();
888 
889  /**
890  *
891  * Saves the SFrame to file.
892  *
893  * When format is "binary", the saved SFrame will be in a directory
894  * named with the `path` parameter. When format is "text" or "csv",
895  * it is saved as a single human readable text file.
896  *
897  * \param path A local path or a remote URL. If format is 'text', it
898  * will be saved as a text file. If format is 'binary', a directory will be
899  * created at the location which will contain the SFrame.
900  *
901  * \param format Optional. Either "binary", "csv" or "". Defaults to "".
902  * Format in which to save the SFrame. Binary saved SFrames can be
903  * loaded much faster and without any format conversion losses.
904  * If "csv", Each row will be written as a single line in an output text
905  * file. If format is an empty string (default), we will try to infer the
906  * format from filename given. If file name ends with "csv", or
907  * ".csv.gz", then the gl_sframe is saved as "csv" format, otherwise the
908  * gl_sframe is saved as 'binary' format.
909  */
910  void save(const std::string& path, const std::string& format="") const;
911 
912 
913  /**
914  * Performs an incomplete save of an existing SFrame into a directory.
915  * This saved SFrame may reference SFrames in other locations *in the same
916  * filesystem* for certain columns/segments/etc.
917  *
918  * Does not modify the current sframe.
919  */
920  void save_reference(const std::string& path) const;
921 
922  /**
923  * Returns an array of types of each column.
924  */
925  virtual std::vector<flex_type_enum> column_types() const;
926 
927  /**
928  * Returns the number of columns of the SFrame.
929  */
930  virtual size_t num_columns() const;
931 
932  /**
933  * Returns the columns names of the SFrame.
934  */
935  virtual std::vector<std::string> column_names() const;
936 
937  /**
938  * Returns true if the column is present in the sframe, and false
939  * otherwise.
940  */
941  bool contains_column(const std::string& col_name) const;
942 
943  /**
944  * Returns a gl_sframe which contains the first n rows of this gl_sframe.
945  *
946  * \param n The number of rows to fetch.
947  */
948  gl_sframe head(size_t n) const;
949 
950  /**
951  * Returns a gl_sframe which contains the last n rows of this gl_sframe.
952  *
953  * \param n The number of rows to fetch.
954  */
955  gl_sframe tail(size_t n) const;
956 
957  /**
958  * Maps each row of the \ref gl_sframe by a given function to a single value.
959  * The result \ref gl_sarray is of type "dtype". "fn" should be a function
960  * that returns exactly one value which can be cast into the type specified
961  * by "dtype".
962  *
963  * \param fn The function to transform each element. Must return exactly one
964  * value which can be cast into the type specified by "dtype".
965  *
966  * \param dtype The data type of the new \ref gl_sarray.
967  *
968  * Example:
969  * \code
970  * gl_sframe sf{{"a", {1,2,3,4,5}},
971  * {"c", {1.0,2.0,3.0,4.0,5.0}}};
972  * std::cout << sf.apply([](const sframe_rows::row& x) {
973  * return x[0] * x[1];
974  * }, flex_type_enum::FLOAT);
975  * \endcode
976  *
977  * Produces output:
978  * \code{.txt}
979  * dtype: float
980  * Rows: 5
981  * [1.0, 4.0, 9.0, 16.0, 25.0]
982  * \endcode
983  *
984  * \see gl_sarray::apply
985  */
986  gl_sarray apply(std::function<flexible_type(const sframe_rows::row&)> fn,
987  flex_type_enum dtype) const;
988  /**
989  * Create a \ref gl_sframe which contains a subsample of the current
990  * \ref gl_sframe.
991  *
992  * \param fraction The fraction of the rows to fetch. Must be between 0 and 1.
993  *
994  * Example:
995  * \code
996  * gl_sframe sf{{"a", {1,2,3,4,5}},
997  * {"b", {1.0,2.0,3.0,4.0,5.0}}};
998  * std::cout << sf.sample(.3);
999  * \endcode
1000  *
1001  * Produces output:
1002  * \code{.txt}
1003  * Columns:
1004  * a integer
1005  * b float
1006  * Rows: ?
1007  * Data:
1008  * +----------------+----------------+
1009  * | a | b |
1010  * +----------------+----------------+
1011  * | 4 | 4 |
1012  * | 5 | 5 |
1013  * +----------------+----------------+
1014  * [? rows x 2 columns]
1015  * \endcode
1016  */
1017  gl_sframe sample(double fraction) const;
1018 
1019 
1020  /**
1021  * Create a \ref gl_sframe which contains a subsample of the current
1022  * \ref gl_sframe.
1023  *
1024  * \param fraction The fraction of the rows to fetch. Must be between 0 and 1.
1025  *
1026  * \param seed The random seed for the random number generator.
1027  * Deterministic output is obtained if this is set to a constant.
1028  *
      * \param exact Optional. Defaults to false. NOTE(review): the effect of
      * this flag is not described in this header; presumably it requests that
      * the result contain exactly (rather than approximately) the requested
      * fraction of rows. Confirm against the implementation.
      *
1029  * Example:
1030  * \code
1031  * gl_sframe sf{{"a", {1,2,3,4,5}},
1032  * {"b", {1.0,2.0,3.0,4.0,5.0}}};
1033  * std::cout << sf.sample(.3, 12345);
1034  * \endcode
1035  *
1036  * Produces output:
1037  * \code{.txt}
1038  * Columns:
1039  * a integer
1040  * b float
1041  * Rows: ?
1042  * Data:
1043  * +----------------+----------------+
1044  * | a | b |
1045  * +----------------+----------------+
1046  * | 4 | 4 |
1047  * | 5 | 5 |
1048  * +----------------+----------------+
1049  * [? rows x 2 columns]
1050  * \endcode
1051  */
1052  gl_sframe sample(double fraction, size_t seed, bool exact=false) const;
1053 
1054 
1055  /**
1056  * Randomly split the rows of a \ref gl_sframe into two \ref gl_sframe
1057  * objects. The first \ref gl_sframe contains \b M rows, sampled uniformly
1058  * (without replacement) from the original \ref gl_sframe. \b M is
1059  * approximately the fraction times the original number of rows. The second
1060  * \ref gl_sframe contains the remaining rows of the original \ref gl_sframe.
1061  *
1062  * \param fraction Approximate fraction of the rows to fetch for the first returned
1063  * \ref gl_sframe. Must be between 0 and 1.
1064  *
1065  * \note This overload takes no seed. Use the overload with a "seed" argument for deterministic output.
1066  *
1067  * Example:
1068  * \code
1069  * auto sf = gl_sframe({{"id", gl_sarray::from_sequence(0, 1024)}});
1070  * gl_sframe sf_train, sf_test;
1071  * std::tie(sf_train, sf_test) = sf.random_split(.95);
1072  * std::cout << sf_test.size() << " " << sf_train.size() << "\n";
1073  * \endcode
1074  *
1075  * Produces output such as (the split is random, so the counts are illustrative only):
1076  * \code{.txt}
1077  * 102 922
1078  * \endcode
1079  */
1080  std::pair<gl_sframe, gl_sframe> random_split(double fraction) const;
1081 
1082  /**
1083  * Randomly split the rows of a \ref gl_sframe into two \ref gl_sframe
1084  * objects. The first \ref gl_sframe contains \b M rows, sampled uniformly
1085  * (without replacement) from the original \ref gl_sframe. \b M is
1086  * approximately the fraction times the original number of rows. The second
1087  * \ref gl_sframe contains the remaining rows of the original \ref gl_sframe.
1088  *
1089  * \param fraction Approximate fraction of the rows to fetch for the first
1090  * returned \ref gl_sframe. Must be between 0 and 1.
1091  *
1092  * \param seed The random seed for the random number generator.
1093  * Deterministic output is obtained if this is set to a constant.
1094  *
      * \param exact Optional. Defaults to false. NOTE(review): the effect of
      * this flag is not described in this header; presumably it requests that
      * the first \ref gl_sframe contain exactly (rather than approximately) the
      * requested fraction of rows. Confirm against the implementation.
      *
1095  * Example:
1096  * \code
1097  * auto sf = gl_sframe({{"id", gl_sarray::from_sequence(0, 1024)}});
1098  * gl_sframe sf_train, sf_test;
1099  * std::tie(sf_train, sf_test) = sf.random_split(.95, 12345);
1100  * std::cout << sf_test.size() << " " << sf_train.size() << "\n";
1101  * \endcode
1102  *
1103  * Produces output:
1104  * \code{.txt}
1105  * 44 980
1106  * \endcode
1107  */
1108  std::pair<gl_sframe, gl_sframe> random_split(double fraction, size_t seed, bool exact=false) const;
1109 
1110  /**
1111  * Get top k rows according to the given column. Result is according to and
1112  * sorted by "column_name" in the given order (default is descending).
1113  * When "k" is small, "topk" is more efficient than "sort".
1114  *
1115  * \param column_name The column to sort on.
1116  *
1117  * \param k Optional. Defaults to 10. The number of rows to return.
1118  *
1119  * \param reverse Optional. Defaults to false. If true, return the top k rows
1120  * in ascending order, otherwise, in descending order.
1121  *
1122  * Example:
1123  * \code
1124  * auto sf = gl_sframe({{"id", gl_sarray::from_sequence(1000)}});
1125  * sf["value"] = 0 - sf["id"];
1126  * std::cout << sf.topk("id", 3);
1127  * \endcode
1128  *
1129  * Produces output:
1130  * \code{.txt}
1131  * +--------+--------+
1132  * | id | value |
1133  * +--------+--------+
1134  * | 999 | -999 |
1135  * | 998 | -998 |
1136  * | 997 | -997 |
1137  * +--------+--------+
1138  * [3 rows x 2 columns]
1139  *
1140  * \endcode
1141  *
1142  * Example:
1143  * \code
1144  * std::cout << sf.topk("value", 3);
1145  * \endcode
1146  *
1147  * Produces output:
1148  * \code{.txt}
1149  * +--------+--------+
1150  * | id | value |
1151  * +--------+--------+
1152  * | 1 | -1 |
1153  * | 2 | -2 |
1154  * | 3 | -3 |
1155  * +--------+--------+
1156  * [3 rows x 2 columns]
1157  * \endcode
1158  *
1159  * \see sort
1160  */
1161  gl_sframe topk(const std::string& column_name, size_t k=10, bool reverse=false) const;
1162 
1163  /** Returns the index of the column named `column_name` within this \ref gl_sframe.
1164  */
1165  size_t column_index(const std::string &column_name) const;
1166 
1167  /** Returns the name of the column at position `index` of this \ref gl_sframe.
1168  */
1169  const std::string& column_name(size_t index) const;
1170 
1171 
1172  /**
1173  * Extracts the column named "colname" of the \ref gl_sframe as a \ref gl_sarray.
1174  *
1175  * This is equivalent to using \ref column_indexing "operator[]" for column
1176  * indexing.
1177  *
1178  * Equivalent to:
1179  * \code
1180  * sf[colname];
1181  * \endcode
1182  *
1183  * \see select_columns
1184  *
1185  */
1186  gl_sarray select_column(const std::string& colname) const;
1187 
1188  /**
1189  * Extracts the columns named in "colnames" into a new \ref gl_sframe.
1190  *
1191  * This is equivalent to using \ref multi_column_indexing "operator[]" for
1192  * selecting multiple columns:
1193  * \code
1194  * sf[colnames];
1195  * \endcode
1196  *
1197  * \see select_column
1198  */
1199  gl_sframe select_columns(const std::vector<std::string>& colnames) const;
1200 
1201 
1202  /**
1203  * Add a column to this \ref gl_sframe, replacing the column with the same
1204  * name if one already exists. The number of elements in the data given
1205  * must match the length of every other column of the \ref gl_sframe. This
1206  * operation modifies the current \ref gl_sframe in place.
1207  * If no name is given, a default name is chosen.
1208  *
1209  * \param data The column of data to add.
1210  *
1211  * \param name Optional. The name of the column. If no name is given, a
1212  * default name is chosen.
1213  *
1214  * This is equivalent to using \ref column_indexing "operator[]" for column
1215  * assignment.
1216  * \code
1217  * sf[name] = data;
1218  * \endcode
1219  *
1220  * Example:
1221  * \code
1222  * auto sf = gl_sframe({{"id", {1, 2, 3}},
1223  * {"val", {"A", "B", "C"}}});
1224  * auto sa = gl_sarray({"cat", "dog", "fossa"});
1225  * sf.replace_add_column(sa, "species");
1226  * std::cout << sf;
1227  * \endcode
1228  *
1229  * Produces output:
1230  * \code{.txt}
1231  * +----+-----+---------+
1232  * | id | val | species |
1233  * +----+-----+---------+
1234  * | 1 | A | cat |
1235  * | 2 | B | dog |
1236  * | 3 | C | fossa |
1237  * +----+-----+---------+
1238  * [3 rows x 3 columns]
1239  * \endcode
1240  *
1241  * \see add_column(const gl_sarray&, const std::string&)
1242  */
1243  virtual void replace_add_column(const gl_sarray& data, const std::string& name="");
1244 
1245  /**
1246  * Add a column of identical values to this \ref gl_sframe, raising an
1247  * exception if a column with the same name already exists.
1248  * This operation modifies the current \ref gl_sframe in place. If no name
1249  * is given, a default name is chosen.
1250  *
1251  * \param data The value to assign to each entry in the new column.
1252  *
1253  * \param name Optional. The name of the column. If no name is given, a
1254  * default name is chosen.
1255  *
1256  * This is almost equivalent to using \ref column_indexing "operator[]" for
1257  * column assignment, but raises an exception if overwriting a column with
1258  * the same name.
1259  * \code
1260  * sf[name] = data;
1261  * \endcode
1262  *
1263  * Example:
1264  * \code
1265  * auto sf = gl_sframe({{"id", {1, 2, 3}},
1266  * {"val", {"A", "B", "C"}}});
1267  * auto sa = gl_sarray({"cat", "dog", "fossa"});
1268  * sf.add_column("fish", "species");
1269  * std::cout << sf;
1270  * \endcode
1271  *
1272  * Produces output:
1273  * \code{.txt}
1274  * +----+-----+---------+
1275  * | id | val | species |
1276  * +----+-----+---------+
1277  * | 1 | A | fish |
1278  * | 2 | B | fish |
1279  * | 3 | C | fish |
1280  * +----+-----+---------+
1281  * [3 rows x 3 columns]
1282  * \endcode
1283  *
1284  * \see replace_add_column
1285  * \see add_column(const gl_sarray&, const std::string&)
1286  */
1287  virtual void add_column(const flexible_type& data, const std::string& name="");
1288 
1289  /**
1290  * Add a column to this \ref gl_sframe, raising an exception if a column with
1291  * the same name already exists. The number of elements in the data given
1292  * must match the length of every other column of the \ref gl_sframe. This
1293  * operation modifies the current \ref gl_sframe in place.
1294  * If no name is given, a default name is chosen.
1295  *
1296  * \param data The column of data to add.
1297  *
1298  * \param name Optional. The name of the column. If no name is given, a
1299  * default name is chosen.
1300  *
1301  * This is almost equivalent to using \ref column_indexing "operator[]" for
1302  * column assignment, but raises an exception if overwriting a column with
1303  * the same name.
1304  * \code
1305  * sf[name] = data;
1306  * \endcode
1307  *
1308  * Example:
1309  * \code
1310  * auto sf = gl_sframe({{"id", {1, 2, 3}},
1311  * {"val", {"A", "B", "C"}}});
1312  * auto sa = gl_sarray({"cat", "dog", "fossa"});
1313  * sf.add_column(sa, "species");
1314  * std::cout << sf;
1315  * \endcode
1316  *
1317  * Produces output:
1318  * \code{.txt}
1319  * +----+-----+---------+
1320  * | id | val | species |
1321  * +----+-----+---------+
1322  * | 1 | A | cat |
1323  * | 2 | B | dog |
1324  * | 3 | C | fossa |
1325  * +----+-----+---------+
1326  * [3 rows x 3 columns]
1327  * \endcode
1328  *
1329  * \see replace_add_column
1330  * \see add_column(const flexible_type&, const std::string&)
1331  */
1332  virtual void add_column(const gl_sarray& data, const std::string& name="");
1333 
1334  /**
1335  * Adds multiple columns to this \ref gl_sframe. The number of elements in
1336  * all columns must match the length of every other column of the \ref
1337  * gl_sframe. This operation modifies the current \ref gl_sframe in place.
1338  *
1339  * \param data The columns to add.
1340  *
1341  * Example:
1342  * \code
1343  * auto sf = gl_sframe({{"id", {1, 2, 3}},
1344  * {"val", {"A", "B", "C"}}});
1345  * auto sf2 = gl_sframe({{"species", {"cat", "dog", "fossa"}},
1346  * {"age", {3, 5, 9}}});
1347  * sf.add_columns(sf2); // returns void; sf itself is modified
1348  * std::cout << sf;
1349  * \endcode
1350  *
1351  * Produces output:
1352  * \code{.txt}
1353  * +----+-----+-----+---------+
1354  * | id | val | age | species |
1355  * +----+-----+-----+---------+
1356  * | 1 | A | 3 | cat |
1357  * | 2 | B | 5 | dog |
1358  * | 3 | C | 9 | fossa |
1359  * +----+-----+-----+---------+
1360  * [3 rows x 4 columns]
1361  * \endcode
1362  *
1363  * \see add_column
1364  */
1365  virtual void add_columns(const gl_sframe& data);
1366 
1367  /**
1368  * Remove a column from this \ref gl_sframe. This operation modifies the
1369  * current \ref gl_sframe in place. Raises an exception if the column
1370  * does not exist.
1371  *
1372  * \param name The name of the column to remove.
1373  *
1374  * Example:
1375  * \code
1376  * auto sf = gl_sframe({{"id", {1, 2, 3}},
1377  * {"val", {"A", "B", "C"}}});
1378  * sf.remove_column("val");
1379  * std::cout << sf;
1380  * \endcode
1381  *
1382  * Produces output:
1383  * \code{.txt}
1384  * +----+
1385  * | id |
1386  * +----+
1387  * | 1 |
1388  * | 2 |
1389  * | 3 |
1390  * +----+
1391  * [3 rows x 1 columns]
1392  * \endcode
1393  *
1394  */
1395  virtual void remove_column(const std::string& name);
1396 
1397  /**
1398  * Swap the columns with the given names. This operation modifies the
1399  * current \ref gl_sframe in place. Raises an exception if the columns do not
1400  * exist.
1401  *
1402  * \param column_1 Name of column to swap.
1403  *
1404  * \param column_2 Name of other column to swap.
1405  *
1406  * Example:
1407  * \code
1408  * auto sf = gl_sframe({{"id", {1, 2, 3}},
1409  * { "val", {"A", "B", "C"}}});
1410  * sf.swap_columns("id", "val");
1411  * std::cout << sf;
1412  * \endcode
1413  *
1414  * Produces output:
1415  * \code{.txt}
1416  * +-----+-----+
1417  * | val | id |
1418  * +-----+-----+
1419  * | A | 1 |
1420  * | B | 2 |
1421  * | C | 3 |
1422  * +-----+-----+
1423  * [3 rows x 2 columns]
1424  * \endcode
1425  *
1426  */
1427  virtual void swap_columns(const std::string& column_1, const std::string& column_2);
1428 
1429 
1430  /**
1431  * Rename the given columns. "old_to_new_names" is expected to be a map from
1432  * old names to new names. This changes the names of the columns given as
1433  * the keys and replaces them with the names given as the values. This
1434  * operation modifies the current \ref gl_sframe in place.
1435  *
1436  * \param old_to_new_names A map of {old-name, new-name} pairs.
1437  *
1438  * Example:
1439  * \code
1440  * auto sf = gl_sframe({{"X1", {"Alice","Bob"}},
1441  * {"X2", {"123 Fake Street","456 Fake Street"}}});
1442  * sf.rename({{"X1", "name"},{ "X2","address"}});
1443  * std::cout << sf;
1444  * \endcode
1445  *
1446  * Produces output:
1447  * \code{.txt}
1448  * +-------+-----------------+
1449  * | name | address |
1450  * +-------+-----------------+
1451  * | Alice | 123 Fake Street |
1452  * | Bob | 456 Fake Street |
1453  * +-------+-----------------+
1454  * [2 rows x 2 columns]
1455  * \endcode
1456  *
1457  * \see column_names
1458  */
1459  virtual void rename(const std::map<std::string, std::string>& old_to_new_names);
1460 
1461 
1462  /**
1463  * Returns a new \ref gl_sframe made of the rows of this \ref gl_sframe
1464  * followed by the rows of "other". Both \ref gl_sframe objects must have the
1465  * same set of columns with the same column names and column types.
1466  *
1467  * \param other Another \ref gl_sframe whose rows are appended to
1468  * the rows of the current \ref gl_sframe in the returned result.
1469  *
1470  * Example:
1471  * \code
1472  * auto sf = gl_sframe({{"id", {4, 6, 8}},
1473  * {"val", {"D", "F", "H"}}});
1474  * auto sf2 = gl_sframe({{"id", {1, 2, 3}},
1475  * {"val", {"A", "B", "C"}}});
1476  * sf = sf.append(sf2);
1477  * std::cout << sf;
1478  * \endcode
1479  *
1480  * Produces output:
1481  * \code{.txt}
1482  * +----+-----+
1483  * | id | val |
1484  * +----+-----+
1485  * | 4 | D |
1486  * | 6 | F |
1487  * | 8 | H |
1488  * | 1 | A |
1489  * | 2 | B |
1490  * | 3 | C |
1491  * +----+-----+
1492  * [6 rows x 2 columns]
1493  * \endcode
1494  */
1495  gl_sframe append(const gl_sframe& other) const;
1496 
1497  /**
1498  * Perform a group on the "groupkeys" columns followed by aggregations on the
1499  * columns listed in "operators". The "operators" parameter is a map that
1500  * indicates which aggregation operators to use and which columns to use them
1501  * on. The available operators are SUM, MAX, MIN, COUNT, AVG, VAR, STDV,
1502  * CONCAT, SELECT_ONE, ARGMIN, ARGMAX, and QUANTILE. For convenience,
1503  * aggregators MEAN, STD, and VARIANCE are available as synonyms for AVG,
1504  * STDV, and VAR. See turi::aggregate for more detail on the
1505  * aggregators.
1506  *
1507  * \param groupkeys Columns to group on. Type of key columns can be of any
1508  * type other than dictionary.
1509  *
1510  * \param operators Optional. Defaults to an empty map. Map of output columns to
1511  * aggregation operations: each key is an output column name and each value is an aggregator.
1512  *
1513  * Suppose we have an SFrame (sf) with movie ratings by many users.
1514  * \code{.txt}
1515  * +---------+----------+--------+
1516  * | user_id | movie_id | rating |
1517  * +---------+----------+--------+
1518  * | 25904 | 1663 | 3 |
1519  * | 25907 | 1663 | 3 |
1520  * | 25923 | 1663 | 3 |
1521  * | 25924 | 1663 | 3 |
1522  * | 25928 | 1663 | 2 |
1523  * | 25933 | 1663 | 4 |
1524  * | 25934 | 1663 | 4 |
1525  * | 25935 | 1663 | 4 |
1526  * | 25936 | 1663 | 5 |
1527  * | 25937 | 1663 | 2 |
1528  * | ... | ... | ... |
1529  * +---------+----------+--------+
1530  * [10000 rows x 3 columns]
1531  * \endcode
1532  *
1533  * Compute the number of occurrences of each user.
1534  * \code
1535  * auto user_count = sf.groupby({"user_id"},
1536  * {{"count", aggregate::COUNT()}});
1537  * std::cout << user_count;
1538  * \endcode
1539  * \code{.txt}
1540  * +---------+-------+
1541  * | user_id | count |
1542  * +---------+-------+
1543  * | 62361 | 1 |
1544  * | 30727 | 1 |
1545  * | 40111 | 1 |
1546  * | 50513 | 1 |
1547  * | 35140 | 1 |
1548  * | 42352 | 1 |
1549  * | 29667 | 1 |
1550  * | 46242 | 1 |
1551  * | 58310 | 1 |
1552  * | 64614 | 1 |
1553  * | ... | ... |
1554  * +---------+-------+
1555  * [9852 rows x 2 columns]
1556  * \endcode
1557  *
1558  * Compute the mean and standard deviation of ratings per user.
1559  * \code
1560  * auto user_rating_stats = sf.groupby({"user_id"},
1561  * {{ "mean_rating", aggregate::MEAN("rating")},
1562  * {"std_rating", aggregate::STD("rating")}});
1563  * std::cout << user_rating_stats;
1564  * \endcode
1565  * \code{.txt}
1566  * +---------+-------------+------------+
1567  * | user_id | mean_rating | std_rating |
1568  * +---------+-------------+------------+
1569  * | 62361 | 5.0 | 0.0 |
1570  * | 30727 | 4.0 | 0.0 |
1571  * | 40111 | 2.0 | 0.0 |
1572  * | 50513 | 4.0 | 0.0 |
1573  * | 35140 | 4.0 | 0.0 |
1574  * | 42352 | 5.0 | 0.0 |
1575  * | 29667 | 4.0 | 0.0 |
1576  * | 46242 | 5.0 | 0.0 |
1577  * | 58310 | 2.0 | 0.0 |
1578  * | 64614 | 2.0 | 0.0 |
1579  * | ... | ... | ... |
1580  * +---------+-------------+------------+
1581  * [9852 rows x 3 columns]
1582  * \endcode
1583  *
1584  * Compute the movie with the minimum rating per user.
1585  * \code
1586  * auto chosen_movies = sf.groupby({"user_id"},
1587  * {{ "worst_movies", aggregate::ARGMIN("rating","movie_id")}});
1588  * std::cout << chosen_movies;
1589  * \endcode
1590  * \code{.txt}
1591  * +---------+--------------+
1592  * | user_id | worst_movies |
1593  * +---------+--------------+
1594  * | 62361 | 1663 |
1595  * | 30727 | 1663 |
1596  * | 40111 | 1663 |
1597  * | 50513 | 1663 |
1598  * | 35140 | 1663 |
1599  * | 42352 | 1663 |
1600  * | 29667 | 1663 |
1601  * | 46242 | 1663 |
1602  * | 58310 | 1663 |
1603  * | 64614 | 1663 |
1604  * | ... | ... |
1605  * +---------+--------------+
1606  * [9852 rows x 2 columns]
1607  * \endcode
1608  *
1609  * Compute the count, mean, and standard deviation of ratings per (user,
1610  * time), naming each output column explicitly.
1611  * \code
1612  * // make up some time column which is a combination of user id and movie id
1613  * sf["time"] = sf.apply([](const sframe_rows::row& x) {
1614  * return (x[0] + x[1]) % 11 + 2000;
1615  * }, flex_type_enum::INTEGER);
1616  * auto user_rating_stats = sf.groupby({"user_id", "time"},
1617  * {{"Count", aggregate::COUNT()},
1618  * {"Avg of rating", aggregate::AVG("rating")},
1619  * {"Stdv of rating", aggregate::STDV("rating")}});
1620  * std::cout << user_rating_stats;
1621  * \endcode
1622  * \code{.txt}
1623  * +------+---------+-------+---------------+----------------+
1624  * | time | user_id | Count | Avg of rating | Stdv of rating |
1625  * +------+---------+-------+---------------+----------------+
1626  * | 2006 | 61285 | 1 | 4.0 | 0.0 |
1627  * | 2000 | 36078 | 1 | 4.0 | 0.0 |
1628  * | 2003 | 47158 | 1 | 3.0 | 0.0 |
1629  * | 2007 | 34446 | 1 | 3.0 | 0.0 |
1630  * | 2010 | 47990 | 1 | 3.0 | 0.0 |
1631  * | 2003 | 42120 | 1 | 5.0 | 0.0 |
1632  * | 2007 | 44940 | 1 | 4.0 | 0.0 |
1633  * | 2008 | 58240 | 1 | 4.0 | 0.0 |
1634  * | 2002 | 102 | 1 | 1.0 | 0.0 |
1635  * | 2009 | 52708 | 1 | 3.0 | 0.0 |
1636  * | ... | ... | ... | ... | ... |
1637  * +------+---------+-------+---------------+----------------+
1638  * [10000 rows x 5 columns]
1639  * \endcode
1640  *
1641  * The "operators" map can hold any number of aggregation
1642  * specifiers so if we want the count and the 0.25 and 0.75 quantiles of
1643  * ratings:
1644  * \code
1645  * auto user_rating_stats = sf.groupby({"user_id", "time"},
1646  * {{"Count", aggregate::COUNT()},
1647  * {"rating_quantiles", aggregate::QUANTILE("rating", {0.25, 0.75}) }});
1648  * std::cout << user_rating_stats;
1649  * \endcode
1650  * \code{.txt}
1651  * +------+---------+-------+------------------------+
1652  * | time | user_id | Count | rating_quantiles |
1653  * +------+---------+-------+------------------------+
1654  * | 2006 | 61285 | 1 | [4.0, 4.0] |
1655  * | 2000 | 36078 | 1 | [4.0, 4.0] |
1656  * | 2003 | 47158 | 1 | [3.0, 3.0] |
1657  * | 2007 | 34446 | 1 | [3.0, 3.0] |
1658  * | 2010 | 47990 | 1 | [3.0, 3.0] |
1659  * | 2003 | 42120 | 1 | [5.0, 5.0] |
1660  * | 2007 | 44940 | 1 | [4.0, 4.0] |
1661  * | 2008 | 58240 | 1 | [4.0, 4.0] |
1662  * | 2002 | 102 | 1 | [1.0, 1.0] |
1663  * | 2009 | 52708 | 1 | [3.0, 3.0] |
1664  * | ... | ... | ... | ... |
1665  * +------+---------+-------+------------------------+
1666  * [10000 rows x 4 columns]
1667  * \endcode
1668  *
1669  * To put all items a user rated into one list value by their star rating:
1670  * \code
1671  * auto user_rating_stats = sf.groupby({"user_id", "rating"},
1672  * {{"rated_movie_ids",aggregate::CONCAT("movie_id")}});
1673  * std::cout << user_rating_stats;
1674  * \endcode
1675  * \code{.txt}
1676  * +--------+---------+----------------------+
1677  * | rating | user_id | rated_movie_ids |
1678  * +--------+---------+----------------------+
1679  * | 3 | 31434 | array("d", [1663.0]) |
1680  * | 5 | 25944 | array("d", [1663.0]) |
1681  * | 4 | 38827 | array("d", [1663.0]) |
1682  * | 4 | 51437 | array("d", [1663.0]) |
1683  * | 4 | 42549 | array("d", [1663.0]) |
1684  * | 4 | 49532 | array("d", [1663.0]) |
1685  * | 3 | 26124 | array("d", [1663.0]) |
1686  * | 4 | 46336 | array("d", [1663.0]) |
1687  * | 4 | 52133 | array("d", [1663.0]) |
1688  * | 5 | 62361 | array("d", [1663.0]) |
1689  * | ... | ... | ... |
1690  * +--------+---------+----------------------+
1691  * [9952 rows x 3 columns]
1692  * \endcode
1693  *
1694  * To put all items and rating of a given user together into a dictionary
1695  * value:
1696  * \code
1697  * auto user_rating_stats = sf.groupby({"user_id"},
1698  * {{"movie_rating",aggregate::CONCAT("movie_id", "rating")}});
1699  * std::cout << user_rating_stats;
1700  * \endcode
1701  * \code{.txt}
1702  * +---------+--------------+
1703  * | user_id | movie_rating |
1704  * +---------+--------------+
1705  * | 62361 | {1663: 5} |
1706  * | 30727 | {1663: 4} |
1707  * | 40111 | {1663: 2} |
1708  * | 50513 | {1663: 4} |
1709  * | 35140 | {1663: 4} |
1710  * | 42352 | {1663: 5} |
1711  * | 29667 | {1663: 4} |
1712  * | 46242 | {1663: 5} |
1713  * | 58310 | {1663: 2} |
1714  * | 64614 | {1663: 2} |
1715  * | ... | ... |
1716  * +---------+--------------+
1717  * [9852 rows x 2 columns]
1718  * \endcode
1719  *
1720  * \see aggregate
1721  */
1722  gl_sframe groupby(const std::vector<std::string>& groupkeys,
1723  const std::map<std::string, aggregate::groupby_descriptor_type>& operators
1724  = std::map<std::string, aggregate::groupby_descriptor_type>()) const;
1725 
1726  /**
1727  * Joins two \ref gl_sframe objects. Merges the current (left) \ref
1728  * gl_sframe with the given (right) \ref gl_sframe using a SQL-style
1729  * equi-join operation by columns.
1730  *
1731  * \param right The \ref gl_sframe to join.
1732  *
1733  * \param joinkeys The column name(s) representing the set of join keys. Each row that
1734  * has the same value in this set of columns will be merged together.
1735  *
1736  * \param how Optional. The type of join to perform. "inner" is default.
1737  * - \b "inner" : Equivalent to a SQL inner join. Result consists of the
1738  * rows from the two frames whose join key values match exactly,
1739  * merged together into one \ref gl_sframe.
1740  * - \b "left" : Equivalent to a SQL left outer join. Result is the union
1741  * between the result of an inner join and the rest of the rows from
1742  * the left \ref gl_sframe, merged with missing values.
1743  * - \b "right" : Equivalent to a SQL right outer join. Result is the union
1744  * between the result of an inner join and the rest of the rows from
1745  * the right \ref gl_sframe, merged with missing values.
1746  * - \b "outer" : Equivalent to a SQL full outer join. Result is
1747  * the union between the result of a left outer join and a right
1748  * outer join.
1749  *
1750  * Example:
1751  * \code
1752  * auto animals = gl_sframe({{"id", {1, 2, 3, 4}},
1753  * {"name", {"dog", "cat", "sheep", "cow"}}});
1754  * auto sounds = gl_sframe({{"id", {1, 3, 4, 5}},
1755  * {"sound", {"woof", "baa", "moo", "oink"}}});
1756  * std::cout << animals.join(sounds, {"id"});
1757  * std::cout << animals.join(sounds, {"id"}, "left");
1758  * std::cout << animals.join(sounds, {"id"}, "right");
1759  * std::cout << animals.join(sounds, {"id"}, "outer");
1760  * \endcode
1761  *
1762  * Produces output:
1763  * \code{.txt}
1764  * +----+-------+-------+
1765  * | id | name | sound |
1766  * +----+-------+-------+
1767  * | 1 | dog | woof |
1768  * | 3 | sheep | baa |
1769  * | 4 | cow | moo |
1770  * +----+-------+-------+
1771  * [3 rows x 3 columns]
1772  *
1773  * +----+-------+-------+
1774  * | id | name | sound |
1775  * +----+-------+-------+
1776  * | 1 | dog | woof |
1777  * | 3 | sheep | baa |
1778  * | 4 | cow | moo |
1779  * | 2 | cat | None |
1780  * +----+-------+-------+
1781  * [4 rows x 3 columns]
1782  *
1783  * +----+-------+-------+
1784  * | id | name | sound |
1785  * +----+-------+-------+
1786  * | 1 | dog | woof |
1787  * | 3 | sheep | baa |
1788  * | 4 | cow | moo |
1789  * | 5 | None | oink |
1790  * +----+-------+-------+
1791  * [4 rows x 3 columns]
1792  *
1793  * +----+-------+-------+
1794  * | id | name | sound |
1795  * +----+-------+-------+
1796  * | 1 | dog | woof |
1797  * | 3 | sheep | baa |
1798  * | 4 | cow | moo |
1799  * | 5 | None | oink |
1800  * | 2 | cat | None |
1801  * +----+-------+-------+
1802  * [5 rows x 3 columns]
1803  * \endcode
1804  */
1805  gl_sframe join(const gl_sframe& right,
1806  const std::vector<std::string>& joinkeys,
1807  const std::string& how="inner") const;
1808 
1809 
1810  /**
1811  * Joins two \ref gl_sframe objects. Merges the current (left) \ref
1812  * gl_sframe with the given (right) \ref gl_sframe using a SQL-style
1813  * equi-join operation by columns.
1814  *
1815  * \param right The \ref gl_sframe to join.
1816  *
1817  * \param joinkeys A map of join keys from left
1818  * to right. Each key is taken as a column name on the left gl_sframe
1819  * and each value is taken as the column name in the right gl_sframe.
1820  *
1821  * \param how Optional. The type of join to perform. "inner" is default.
1822  * - \b "inner" : Equivalent to a SQL inner join. Result consists of the
1823  * rows from the two frames whose join key values match exactly,
1824  * merged together into one \ref gl_sframe.
1825  * - \b "left" : Equivalent to a SQL left outer join. Result is the union
1826  * between the result of an inner join and the rest of the rows from
1827  * the left \ref gl_sframe, merged with missing values.
1828  * - \b "right" : Equivalent to a SQL right outer join. Result is the union
1829  * between the result of an inner join and the rest of the rows from
1830  * the right \ref gl_sframe, merged with missing values.
1831  * - \b "outer" : Equivalent to a SQL full outer join. Result is
1832  * the union between the result of a left outer join and a right
1833  * outer join.
1834  *
1835  * Example:
1836  * \code
1837  * auto animals = gl_sframe({{"id", {1, 2, 3, 4}},
1838  * {"name", {"dog", "cat", "sheep", "cow"}}});
1839  * auto sounds = gl_sframe({{"id", {1, 3, 4, 5}},
1840  * {"sound", {"woof", "baa", "moo", "oink"}}});
1841  * std::cout << animals.join(sounds, std::map<std::string, std::string>{{"id", "id"}});
1842  * \endcode
1843  *
1844  * Produces output:
1845  * \code{.txt}
1846  * +----+-------+-------+
1847  * | id | name | sound |
1848  * +----+-------+-------+
1849  * | 1 | dog | woof |
1850  * | 3 | sheep | baa |
1851  * | 4 | cow | moo |
1852  * +----+-------+-------+
1853  * [3 rows x 3 columns]
1854  * \endcode
1855  */
1856  gl_sframe join(const gl_sframe& right,
1857  const std::map<std::string, std::string>& joinkeys,
1858  const std::string& how="inner") const;
1859 
1860  /**
1861  * Filter a \ref gl_sframe by the values in a \ref gl_sarray. Result is a
1862  * \ref gl_sframe that only includes (or excludes) the rows that have a
1863  * column with the given "column_name" which holds one of the values in the
1864  * given "values" \ref gl_sarray.
1865  *
1866  * \param values The values to use to filter the \ref gl_sframe. The
1867  * resulting \ref gl_sframe will only include rows that have one of these
1868  * values in the given column.
1869  *
1870  * \param column_name The column of the \ref gl_sframe to match with the
1871  * given "values".
1872  *
1873  * \param exclude Optional. Defaults to false. If true, the result \ref
1874  * gl_sframe will contain all rows except those that have one of "values" in
1875  * "column_name".
1876  *
1877  * Example:
1878  * \code
1879  * auto sf = gl_sframe({{"id", {1, 2, 3, 4}},
1880  * {"animal_type", {"dog", "cat", "cow", "horse"}},
1881  * {"name", {"bob", "jim", "jimbob", "bobjim"}}});
1882  * auto household_pets = gl_sarray({"cat", "hamster", "dog", "fish", "bird", "snake"});
1883  * std::cout << sf.filter_by(household_pets, "animal_type");
1884  * std::cout << sf.filter_by(household_pets, "animal_type", true);
1885  * \endcode
1886  *
1887  * Produces output:
1888  * \code{.txt}
1889  * +-------------+----+------+
1890  * | animal_type | id | name |
1891  * +-------------+----+------+
1892  * | dog | 1 | bob |
1893  * | cat | 2 | jim |
1894  * +-------------+----+------+
1895  * [2 rows x 3 columns]
1896  * +-------------+----+--------+
1897  * | animal_type | id | name |
1898  * +-------------+----+--------+
1899  * | horse | 4 | bobjim |
1900  * | cow | 3 | jimbob |
1901  * +-------------+----+--------+
1902  * [2 rows x 3 columns]
1903  * \endcode
1904  */
1905  gl_sframe filter_by(const gl_sarray& values, const std::string& column_name, bool exclude=false) const;
1906 
1907  /**
1908  * \overload
1909  * Pack two or more columns of the current \ref gl_sframe into one single
1910  * column. The result is a new \ref gl_sframe with the unaffected columns
1911  * from the original \ref gl_sframe plus the newly created column.
1912  *
1913  * The type of the resulting column is decided by the "dtype" parameter.
1914  * Allowed values for "dtype" are flex_type_enum::DICT,
1915  * flex_type_enum::VECTOR or flex_type_enum::LIST
1916  *
1917  * - \ref flex_type_enum::DICT : pack to a dictionary \ref gl_sarray where column name becomes
1918  * dictionary key and column value becomes dictionary value
1919  *
1920  * - \ref flex_type_enum::VECTOR : pack all values from the packing columns into an array
1921  *
1922  * - \ref flex_type_enum::LIST : pack all values from the packing columns into a list.
1923  *
1924  * \param columns A list of column names to be packed. There must
1925  * be at least two columns to pack.
1926  *
1927  * \param new_column_name Packed column name.
1928  *
1929  * \param dtype Optional. The resulting packed column type.
1930  * If not provided, dtype is list.
1931  *
1932  * \param fill_na Optional. Value to fill into packed column if missing value
1933  * is encountered. If packing to dictionary, "fill_na" is only applicable to
1934  * dictionary values; missing keys are not replaced.
1935  *
1936  * Example:
1937  * Suppose 'sf' is an SFrame that maintains business category information.
1938  * \code{.cpp}
1939  * auto sf = gl_sframe({{"business", {1,2,3,4}},
1940  * {"category.retail", {1, FLEX_UNDEFINED, 1, FLEX_UNDEFINED}},
1941  * {"category.food", {1, 1, FLEX_UNDEFINED, 1}},
1942  * {"category.service", {FLEX_UNDEFINED, 1, 1, FLEX_UNDEFINED}},
1943  * {"category.shop", {1, 1, FLEX_UNDEFINED, 1}}});
1944  * std::cout << sf;
1945  * \endcode
1946  * \code{.txt}
1947  * +----------+-----------------+---------------+------------------+---------------+
1948  * | business | category.retail | category.food | category.service | category.shop |
1949  * +----------+-----------------+---------------+------------------+---------------+
1950  * | 1 | 1 | 1 | None | 1 |
1951  * | 2 | None | 1 | 1 | 1 |
1952  * | 3 | 1 | None | 1 | None |
1953  * | 4 | None | 1 | None | 1 |
1954  * +----------+-----------------+---------------+------------------+---------------+
1955  * [4 rows x 5 columns]
1956  * \endcode
1957  *
1958  * To pack all category columns into a list:
1959  * \code{.cpp}
1960  * std::cout << sf.pack_columns({"category.retail", "category.food",
1961  * "category.service", "category.shop"},
1962  * "category");
1963  * \endcode
1964  * \code{.txt}
1965  * +----------+--------------------+
1966  * | business | category |
1967  * +----------+--------------------+
1968  * | 1 | [1, 1, None, 1] |
1969  * | 2 | [None, 1, 1, 1] |
1970  * | 3 | [1, None, 1, None] |
1971  * | 4 | [None, 1, None, 1] |
1972  * +----------+--------------------+
1973  * [4 rows x 2 columns]
1974  * \endcode
1975  *
1976  * To pack all category columns into a dictionary:
1977  * \code{.cpp}
1978  * std::cout << sf.pack_columns({"category.retail", "category.food",
1979  * "category.service", "category.shop"},
1980  * "category",
1981  * flex_type_enum::DICT);
1982  *
1983  * \endcode
1984  * \code{.txt}
1985  * +----------+--------------------------------+
1986  * | business | category |
1987  * +----------+--------------------------------+
1988  * | 1 | {'category.retail': 1, 'ca ... |
1989  * | 2 | {'category.food': 1, 'cate ... |
1990  * | 3 | {'category.retail': 1, 'ca ... |
1991  * | 4 | {'category.food': 1, 'cate ... |
1992  * +----------+--------------------------------+
1993  * [4 rows x 2 columns]
1994  * \endcode
1995  * \see gl_sframe::unpack
1996  */
  // NOTE(review): the "dtype" parameter documented above does not appear in
  // the declaration as shown here, although the examples pass
  // flex_type_enum::DICT as the third argument. Confirm that a
  // `flex_type_enum dtype` parameter has not been dropped from this listing.
1997  gl_sframe pack_columns(const std::vector<std::string>& columns,
1998  const std::string& new_column_name,
2000  flexible_type fill_na = FLEX_UNDEFINED) const;
2001 /**
2002  * Pack two or more columns of the current \ref gl_sframe with a common
2003  * column name prefix into one single column. The result is a new \ref
2004  * gl_sframe with the unaffected columns from the original \ref gl_sframe
2005  * plus the newly created column.
2006  *
2007  * The type of the resulting column is decided by the "dtype" parameter.
2008  * Allowed values for "dtype" are flex_type_enum::DICT,
2009  * flex_type_enum::VECTOR or flex_type_enum::LIST
2010  *
2011  * - \ref flex_type_enum::DICT : pack to a dictionary \ref gl_sarray where column name becomes
2012  * dictionary key and column value becomes dictionary value
2013  *
2014  * - \ref flex_type_enum::VECTOR : pack all values from the packing columns into an array
2015  *
2016  * - \ref flex_type_enum::LIST : pack all values from the packing columns into a list.
2017  *
2018  * \param column_prefix Packs all columns with the given prefix.
2019  *
2020  * \param new_column_name Packed column name.
2021  *
2022  * \param dtype Optional. The resulting packed column type.
2023  * If not provided, dtype is list.
2024  *
2025  * \param fill_na Optional. Value to fill into packed column if missing value
2026  * is encountered. If packing to dictionary, "fill_na" is only applicable to
2027  * dictionary values; missing keys are not replaced.
2028  *
2029  * Example:
2030  * Suppose 'sf' is an SFrame that maintains business category information.
2031  * \code{.cpp}
2032  * auto sf = gl_sframe({{"business", {1,2,3,4}},
2033  * {"category.retail", {1, FLEX_UNDEFINED, 1, FLEX_UNDEFINED}},
2034  * {"category.food", {1, 1, FLEX_UNDEFINED, 1}},
2035  * {"category.service", {FLEX_UNDEFINED, 1, 1, FLEX_UNDEFINED}},
2036  * {"category.shop", {1, 1, FLEX_UNDEFINED, 1}}});
2037  * std::cout << sf;
2038  * \endcode
2039  * \code{.txt}
2040  * +----------+-----------------+---------------+------------------+---------------+
2041  * | business | category.retail | category.food | category.service | category.shop |
2042  * +----------+-----------------+---------------+------------------+---------------+
2043  * | 1 | 1 | 1 | None | 1 |
2044  * | 2 | None | 1 | 1 | 1 |
2045  * | 3 | 1 | None | 1 | None |
2046  * | 4 | None | 1 | None | 1 |
2047  * +----------+-----------------+---------------+------------------+---------------+
2048  * [4 rows x 5 columns]
2049  * \endcode
2050  *
2051  * To pack all category columns into a list:
2052  * \code{.cpp}
2053  * std::cout << sf.pack_columns("category", "category");
2054  * \endcode
2055  * \code{.txt}
2056  * +----------------+----------------+
2057  * | business | category |
2058  * +----------------+----------------+
2059  * | 1 | [1,1,,1] |
2060  * | 2 | [,1,1,1] |
2061  * | 3 | [1,,1,] |
2062  * | 4 | [,1,,1] |
2063  * +----------------+----------------+
2064  * [4 rows x 2 columns]
2065  * \endcode
2066  *
2067  * To pack all category columns into a dictionary:
2068  * \code{.cpp}
2069  * std::cout << sf.pack_columns("category",
2070  * "category",
2071  * flex_type_enum::DICT);
2072  *
2073  * \endcode
2074  * \code{.txt}
2075  * +----------+--------------------------------+
2076  * | business | category |
2077  * +----------+--------------------------------+
2078  * | 1 | {'category.retail': 1, 'ca ... |
2079  * | 2 | {'category.food': 1, 'cate ... |
2080  * | 3 | {'category.retail': 1, 'ca ... |
2081  * | 4 | {'category.food': 1, 'cate ... |
2082  * +----------+--------------------------------+
2083  * [4 rows x 2 columns]
2084  * \endcode
2085  *
2086  * \see gl_sframe::unpack
2087  */
  // NOTE(review): the "dtype" parameter documented above does not appear in
  // the declaration as shown here, although the examples pass
  // flex_type_enum::DICT as the third argument. Confirm that a
  // `flex_type_enum dtype` parameter has not been dropped from this listing.
2088  gl_sframe pack_columns(const std::string& column_prefix,
2089  const std::string& new_column_name,
2091  flexible_type fill_na = FLEX_UNDEFINED) const;
2092 
2093  /**
2094  * Splits a datetime column of \ref gl_sframe to multiple columns, with each
2095  * value in a separate column. Returns a new \ref gl_sframe with the
2096  * column replaced with a list of new columns. The expanded column
2097  * must be of datetime type. For more details regarding name generation and
2098  * other behavior, refer to \ref gl_sarray::split_datetime
2099  *
2100  * This function is a convenience function which is equivalent to calling
2101  * \ref gl_sarray::split_datetime on the column, deleting the column and
2102  * adding the expanded columns back to the sframe.
2103  *
2104  * \param expand_column Name of the column to expand.
2105  *
2106  * \param column_name_prefix Optional. If provided, expanded column names
2107  * would start with the given prefix. Defaults to "X" (see the declaration);
2108  * the example below passes "" and gets unprefixed column names.
2109  *
2110  * \param limit Optional. Limits the set of datetime elements to expand.
2111  * Elements are 'year','month','day','hour','minute',
2112  * and 'second'.
2113  *
2114  * \param tzone Optional. A boolean parameter that determines whether to
2115  * show the timezone column or not. Defaults to false.
2116  *
2117  * Example:
2118  * \code{.cpp}
2119  * auto sa = gl_sarray({"20-Oct-2011", "10-Jan-2012"});
2120  * gl_sframe sf;
2121  * sf["date"] = sa.str_to_datetime("%d-%b-%Y");
2122  * auto split_sf = sf.split_datetime("date", "", {"day","year"});
2123  * std::cout << split_sf;
2124  * \endcode
2125  *
2126  * Produces output:
2127  * \code{.txt}
2128  * Columns:
2129  * day integer
2130  * year integer
2131  * +----------------+----------------+
2132  * | day | year |
2133  * +----------------+----------------+
2134  * | 20 | 2011 |
2135  * | 10 | 2012 |
2136  * +----------------+----------------+
2137  * [2 rows x 2 columns]
2138  * \endcode
2139  */
2140  gl_sframe split_datetime(const std::string& expand_column,
2141  const std::string& column_name_prefix = "X",
2142  const std::vector<std::string>& limit = std::vector<std::string>(),
2143  bool tzone=false) const;
2144 
2145  /**
2146  * Expand one column of this \ref gl_sframe to multiple columns with each value in
2147  * a separate column. Returns a new \ref gl_sframe with the unpacked column
2148  * replaced with a list of new columns. The column must be of
2149  * list/array/dict type.
2150  * For more details regarding name generation, missing value handling and
2151  * other behavior, refer to \ref gl_sarray::unpack
2152  *
2153  * \param unpack_column Name of the unpacked column
2154  *
2155  * \param column_name_prefix Optional. If provided, unpacked column
2156  * names would start with the given prefix. Defaults to "X". If the empty
2157  * string is used, no prefix is used.
2158  *
2159  * \param column_types Optional. Column types for the unpacked columns. If
2160  * not provided, column types are automatically inferred from first 100 rows.
2161  * Defaults to an empty vector.
2162  *
2163  * \param na_value Optional. Convert all values that are equal to "na_value"
2164  * to missing value if specified. Defaults to FLEX_UNDEFINED.
2165  *
2166  * \param limit Optional. Limits the set of list/vector/dict keys to unpack.
2167  * For list/vector gl_sarrays, "limit" must contain integer indices.
2168  * For dict gl_sarrays, "limit" must contain dictionary keys.
2169  *
2170  * Example:
2171  * \code{.cpp}
2172  * auto sf = gl_sframe({{"id", {1,2,3}},
2173  * {"wc", {flex_dict{{"a", 1}},
2174  * flex_dict{{"b", 2}},
2175  * flex_dict{{"a", 1},{"b", 2}}
2176  * }
2177  * }});
2178  * std::cout << sf;
2179  * \endcode
2180  * \code{.txt}
2181  * +----+------------------+
2182  * | id | wc |
2183  * +----+------------------+
2184  * | 1 | {'a': 1} |
2185  * | 2 | {'b': 2} |
2186  * | 3 | {'a': 1, 'b': 2} |
2187  * +----+------------------+
2188  * [3 rows x 2 columns]
2189  * \endcode
2190  *
2191  * To unpack, using the column's own name as the prefix:
2192  * \code{.cpp}
2193  * std::cout << sf.unpack("wc", "wc");
2194  * \endcode
2195  * \code{.txt}
2196  * +----+------+------+
2197  * | id | wc.a | wc.b |
2198  * +----+------+------+
2199  * | 1 | 1 | None |
2200  * | 2 | None | 2 |
2201  * | 3 | 1 | 2 |
2202  * +----+------+------+
2203  * [3 rows x 3 columns]
2204  * \endcode
2205  *
2206  * To not have prefix in the generated column name:
2207  * \code{.cpp}
2208  * std::cout << sf.unpack("wc", "");
2209  * \endcode
2210  * \code{.txt}
2211  * +----+------+------+
2212  * | id | a | b |
2213  * +----+------+------+
2214  * | 1 | 1 | None |
2215  * | 2 | None | 2 |
2216  * | 3 | 1 | 2 |
2217  * +----+------+------+
2218  * [3 rows x 3 columns]
2219  * \endcode
2220  *
2221  * To limit subset of keys to unpack:
2222  * \code{.cpp}
2223  * std::cout << sf.unpack("wc", "", {}, FLEX_UNDEFINED, {"b"});
2224  * \endcode
2225  * \code{.txt}
2226  * +----+------+
2227  * | id | b |
2228  * +----+------+
2229  * | 1 | None |
2230  * | 2 | 2 |
2231  * | 3 | 2 |
2232  * +----+------+
2233  * [3 rows x 2 columns]
2234  * \endcode
2235  *
2236  * \see gl_sframe::pack_columns
2237  * \see gl_sarray::unpack
2238  */
2239  gl_sframe unpack(const std::string& unpack_column,
2240  const std::string& column_name_prefix = "X",
2241  const std::vector<flex_type_enum>& column_types = std::vector<flex_type_enum>(),
2242  const flexible_type& na_value = FLEX_UNDEFINED,
2243  const std::vector<flexible_type>& limit = std::vector<flexible_type>()) const;
2244 
2245 /**
2246  * Convert a "wide" column of an \ref gl_sframe to one or two "tall" columns by
2247  * stacking all values.
2248  *
2249  * The stack works only for columns of list, or array type (for the dict type,
2250  * see the
2251  * \ref stack(const std::string&, const std::vector<std::string>&, bool)const
2252  * "overload"). One new column is created as a result of stacking, where each
2253  * row holds one element of the array or list value, and the rest columns
2254  * from the same original row repeated.
2255  *
2256  * The new \ref gl_sframe includes the newly created column and all columns other
2257  * than the one that is stacked.
2258  *
2259  * \param column_name The column to stack. This column must be of
2260  * list/array type
2261  *
2262  * \param new_column_names Optional. The new column name.
2263  * If not given, column names are generated automatically.
2264  *
2265  * \param drop_na Optional. Defaults to false. If true, missing values and
2266  * empty list/array/dict are all dropped from the resulting column(s). If
2267  * false, missing values are maintained in stacked column(s).
2268  *
2269  * Suppose 'sf' is an SFrame that contains a user and his/her friends,
2270  * where 'friends' columns is an array type. Stack on 'friends' column
2271  * would create a user/friend list for each user/friend pair:
2272  * \code
2273  * auto sf = gl_sframe({{"user",{1,2,3}},
2274  * {"friends",{{2,3,4}, {5,6}, {4,5,10,FLEX_UNDEFINED}}}
2275  * });
2276  * std::cout << sf;
2277  * std::cout << sf.stack("friends", "friend");
2278  * \endcode
2279  *
2280  * Produces output:
2281  * \code{.txt}
2282  * +------+------------------+
2283  * | user | friends |
2284  * +------+------------------+
2285  * | 1 | [2, 3, 4] |
2286  * | 2 | [5, 6] |
2287  * | 3 | [4, 5, 10, None] |
2288  * +------+------------------+
2289  * [3 rows x 2 columns]
2290  *
2291  * +------+--------+
2292  * | user | friend |
2293  * +------+--------+
2294  * | 1 | 2 |
2295  * | 1 | 3 |
2296  * | 1 | 4 |
2297  * | 2 | 5 |
2298  * | 2 | 6 |
2299  * | 3 | 4 |
2300  * | 3 | 5 |
2301  * | 3 | 10 |
2302  * | 3 | None |
2303  * +------+--------+
2304  * [9 rows x 2 columns]
2305  * \endcode
2306  *
2307  * \see unstack(const std::string&, const std::string&) const
2308  * \see stack(const std::string&, const std::vector<std::string>&, bool)const
2309  */
2310  gl_sframe stack(const std::string& column_name,
2311  const std::string& new_column_names,
2312  bool drop_na = false) const;
2313 /**
2314  * Convert a "wide" column of an \ref gl_sframe to one or two "tall" columns by
2315  * stacking all values.
2316  *
2317  * The stack works only for columns of dictionary type (for the list or array types,
2318  * see the
2319  * \ref stack(const std::string&, const std::string&, bool)const
2320  * "overload"). Two new columns are created as a result of
2321  * stacking: one column holds the key and another column holds the value.
2322  * The rest of the columns are repeated for each key/value pair.
2323  *
2324  * The new \ref gl_sframe includes the newly created columns and all columns
2325  * other than the one that is stacked.
2326  *
2327  * \param column_name The column to stack. This column must be of
2328  * dict type
2329  *
2330  * \param new_column_names Optional. The new column names. Must be a vector of
2331  * 2 values corresponding to the "key" column and the "value" column.
2332  * If not given, column names are generated automatically.
2333  *
2334  * \param drop_na Optional. Defaults to false. If true, missing values and
2335  * empty list/array/dict are all dropped from the resulting column(s). If
2336  * false, missing values are maintained in stacked column(s).
2337  *
2338  * Suppose 'sf' is an SFrame that contains a column of dict type.
2339  * Stack would stack all keys in one column and all values in another
2340  * column:
2341  * \code
2342  * auto sf = gl_sframe({{"topic",{1,2,3,4}},
2343  * {"words", {flex_dict{{"a",3},{"cat",2}},
2344  * flex_dict{{"a",1},{"the",2}},
2345  * flex_dict{{"the",1},{"dog",3}},
2346  * flex_dict()}
2347  * }});
 * std::cout << sf;
2348  * std::cout << sf.stack("words", std::vector<std::string>{"word", "count"});
2349  * \endcode
2350  *
2351  * Produces output:
2352  * \code{.txt}
2353  * +-------+----------------------+
2354  * | topic | words |
2355  * +-------+----------------------+
2356  * | 1 | {'a': 3, 'cat': 2} |
2357  * | 2 | {'a': 1, 'the': 2} |
2358  * | 3 | {'the': 1, 'dog': 3} |
2359  * | 4 | {} |
2360  * +-------+----------------------+
2361  * [4 rows x 2 columns]
2362  *
2363  * +-------+------+-------+
2364  * | topic | word | count |
2365  * +-------+------+-------+
2366  * | 1 | a | 3 |
2367  * | 1 | cat | 2 |
2368  * | 2 | a | 1 |
2369  * | 2 | the | 2 |
2370  * | 3 | the | 1 |
2371  * | 3 | dog | 3 |
2372  * | 4 | None | None |
2373  * +-------+------+-------+
2374  * [7 rows x 3 columns]
2375  * \endcode
2376  *
2377  * Observe that since topic 4 had no words, an empty row is inserted.
2378  * To drop that row, set "drop_na" to true in the parameters to stack.
2379  *
2380  * \see unstack(const std::vector<std::string>&, const std::string&) const
2381  * \see stack(const std::string&, const std::string&, bool)const
2382  */
2383  gl_sframe stack(const std::string& column_name,
2384  const std::vector<std::string>& new_column_names,
2385  bool drop_na = false) const;
2386 
2387  /**
2388  * Concatenate values from one column into one column, grouping by
2389  * all other columns. The resulting column could be of type list or array.
2390  * If "columns" is a numeric column, the result will be of vector type.
2391  * If "columns" is a non-numeric column, the new column will be of list type.
2392  *
2393  * \param columns The name of the column that is to be concatenated.
2394  * The collapsed column type is either array or list.
2395  *
2396  * \param new_column_name Optional. New column name. If not given, a name is
2397  * generated automatically.
2398  *
2399  * Example:
2400  * \code
2401  * auto sf = gl_sframe({{"friend", {2, 3, 4, 5, 6, 4, 5, 2, 3}},
2402  * {"user", {1, 1, 1, 2, 2, 2, 3, 4, 4}}});
2403  * std::cout << sf.unstack("friend", "friends");
2404  * \endcode
2405  *
2406  * Produces output:
2407  * \code{.txt}
2408  * +------+-----------------------------+
2409  * | user | friends |
2410  * +------+-----------------------------+
2411  * | 3 | [5.0] |
2412  * | 1 | [2.0, 4.0, 3.0] |
2413  * | 2 | [5.0, 6.0, 4.0] |
2414  * | 4 | [2.0, 3.0] |
2415  * +------+-----------------------------+
2416  * [4 rows x 2 columns]
2417  * \endcode
2418  *
2419  * \see stack(const std::string&, const std::string&, bool)const
2420  * \see groupby
2421  */
2422  gl_sframe unstack(const std::string& columns,
2423  const std::string& new_column_name = "") const;
2424 
2425  /**
2426  * Concatenate values from two columns into one column, grouping by
2427  * all other columns. The new column will be of dict type where the keys are
2428  * taken from the first column in the list, and the values taken from the
2429  * second column in the list.
2430  *
2431  * \param columns The columns that are to be concatenated.
2432  *
2433  * \param new_column_name Optional.
2434  * New column name. If not given, a name is generated automatically.
2435  *
2436  * Example:
2437  * \code
2438  * auto sf = gl_sframe({{"count",{4, 2, 1, 1, 2, FLEX_UNDEFINED}},
2439  * {"topic",{"cat", "cat", "dog", "elephant", "elephant", "fish"}},
2440  * {"word", {"a", "c", "c", "a", "b", FLEX_UNDEFINED}}});
2441  * std::cout << sf.unstack(std::vector<std::string>{"word", "count"}, "words");
2442  * \endcode
2443  *
2444  * Produces output:
2445  * \code{.txt}
2446  * +----------+------------------+
2447  * | topic | words |
2448  * +----------+------------------+
2449  * | elephant | {'a': 1, 'b': 2} |
2450  * | dog | {'c': 1} |
2451  * | cat | {'a': 4, 'c': 2} |
2452  * | fish | None |
2453  * +----------+------------------+
2454  * [4 rows x 2 columns]
2455  * \endcode
2456  *
2457  * \see stack
2458  * \see groupby
2459  */
2460  gl_sframe unstack(const std::vector<std::string>& columns,
2461  const std::string& new_column_name = "") const;
2462 
2463  /**
2464  * Remove duplicate rows of the \ref gl_sframe. Will not necessarily preserve the
2465  * order of the given \ref gl_sframe in the new \ref gl_sframe.
2466  *
2467  * Example:
2468  * \code
2469  * gl_sframe sf{ {"id", {1,2,3,3,4}},
2470  * {"value", {1,2,3,3,4}} };
2471  * std::cout << sf.unique() << std::endl;
2472  * \endcode
2473  *
2474  * Produces output:
2475  * \code{.txt}
2476  * +----+-------+
2477  * | id | value |
2478  * +----+-------+
2479  * | 2 | 2 |
2480  * | 4 | 4 |
2481  * | 3 | 3 |
2482  * | 1 | 1 |
2483  * +----+-------+
2484  * [4 rows x 2 columns]
2485  * \endcode
2486  *
2487  * \see gl_sarray::unique
2488  */
2489  gl_sframe unique() const;
2490 
2491  /**
2492  * Sort current \ref gl_sframe by a single column, using the given sort order.
2493  *
2494  * Only columns of type str, int and float can be sorted.
2495  *
2496  * \param column The name of the column to be sorted.
2497  *
2498  * \param ascending Optional. Defaults to true. If true, sort the column in
 * ascending order; if false, in descending order.
2499  *
2500  * Example:
2501  * \code
2502  * gl_sframe sf{ {"a", {1,3,2,1}},
2503  * {"b", {"a","c","b","b"}},
2504  * {"c", {"x","y","z","y"}} };
2505  * std::cout << sf.sort("a") << std::endl;
2506  * \endcode
2507  *
2508  * Produces output:
2509  * \code{.txt}
2510  * +---+---+---+
2511  * | a | b | c |
2512  * +---+---+---+
2513  * | 1 | a | x |
2514  * | 1 | b | y |
2515  * | 2 | b | z |
2516  * | 3 | c | y |
2517  * +---+---+---+
2518  * [4 rows x 3 columns]
2519  *
2520  * \endcode
2521  *
2522  * Example:
2523  * \code
2524  * // To sort by column "a", descending
2525  * std::cout << sf.sort("a", false) << std::endl;
2526  * \endcode
2527  *
2528  * Produces output:
2529  * \code{.txt}
2530  * +---+---+---+
2531  * | a | b | c |
2532  * +---+---+---+
2533  * | 3 | c | y |
2534  * | 2 | b | z |
2535  * | 1 | a | x |
2536  * | 1 | b | y |
2537  * +---+---+---+
2538  * [4 rows x 3 columns]
2539  * \endcode
2540  *
2541  * \see topk
2542  */
2543  gl_sframe sort(const std::string& column, bool ascending = true) const;
2544 
2545  /**
2546  * \overload
2547  *
2548  * Sort current \ref gl_sframe by multiple columns, using the given sort order.
2549  *
2550  * \param columns The names of the columns to be sorted.
2551  *
2552  * \param ascending Optional. Defaults to true. Sort all columns in the
 * given order.
2553  *
2554  * The result will be sorted first by
2555  * first column, followed by second column, and so on. All columns will
2556  * be sorted in the same order as governed by the "ascending"
2557  * parameter.
2558  *
2559  * Example:
2560  * \code
2561  * // To sort by column "a" and "b", all ascending
2562  * std::cout << sf.sort({"a", "b"}) << std::endl;
2563  * \endcode
2564  *
2565  * Produces output:
2566  * \code{.txt}
2567  * +---+---+---+
2568  * | a | b | c |
2569  * +---+---+---+
2570  * | 1 | a | x |
2571  * | 1 | b | y |
2572  * | 2 | b | z |
2573  * | 3 | c | y |
2574  * +---+---+---+
2575  * [4 rows x 3 columns]
2576  *
2577  * \endcode
2578  *
2579  * \see topk
2580  */
2581  gl_sframe sort(const std::vector<std::string>& columns, bool ascending = true) const;
2582 
2583  /**
2584  * \overload
 *
 * Takes the column names as a std::initializer_list, so the names can be
 * written as a braced list at the call site. Presumably equivalent to the
 * std::vector<std::string> overload; confirm against the implementation.
 *
 * \param columns The names of the columns to be sorted.
 *
 * \param ascending Optional. Defaults to true.
2585  */
2586  gl_sframe sort(const std::initializer_list<std::string>& columns, bool ascending = true) const;
2587 
2588  /**
2589  * \overload
2590  * Sort current \ref gl_sframe by multiple columns, using a different sort order for each column.
2591  *
2592  * \param column_and_ascending A list of (column name, ascending) pairs.
2593  *
2594  * To control the sort ordering for each column
2595  * individually, "column_and_ascending" holds one (std::string, bool) pair
2596  * per column.
2597  * In each pair, the first value is the column name and the second
2598  * value is a boolean indicating whether the sort order is ascending.
2599  *
2600  * Example:
2601  * \code
2602  * // To sort by column "a" ascending, and then by column "c" descending
2603  * std::cout << sf.sort({{"a", true}, {"c", false}}) << std::endl;
2604  * \endcode
2605  *
2606  * Produces output:
2607  * \code{.txt}
2608  * +---+---+---+
2609  * | a | b | c |
2610  * +---+---+---+
2611  * | 1 | b | y |
2612  * | 1 | a | x |
2613  * | 2 | b | z |
2614  * | 3 | c | y |
2615  * +---+---+---+
2616  * [4 rows x 3 columns]
2617  * \endcode
2618  */
2619  gl_sframe sort(const std::vector<std::pair<std::string, bool>>& column_and_ascending) const;
2620 
2621  /**
2622  * Remove missing values from an \ref gl_sframe. A missing value is either "FLEX_UNDEFINED"
2623  * or "NaN". If "how" is "any", a row will be removed if any of the
2624  * columns in the "columns" parameter contains at least one missing
2625  * value. If "how" is "all", a row will be removed if all of the columns
2626  * in the "columns" parameter are missing values.
2627  * If the "columns" parameter is not specified, the default is to
2628  * consider all columns when searching for missing values.
2629  *
2630  * \param columns Optional. The columns to use when looking for missing values.
2631  * By default, all columns are used.
2632  *
2633  * \param how Optional. Specifies whether a row should be dropped if at least one column
2634  * has missing values, or if all columns have missing values. "any" is
2635  * default.
2636  *
 * \param recursive Optional. Defaults to false. Presumably, as documented for
 * dropna_split, controls whether nested values (e.g. lists, dictionaries)
 * are checked recursively for missing values; confirm against the
 * implementation.
 *
2637  * For instance
2638  * \code
2639  * gl_sframe sf { {"a", {1, FLEX_UNDEFINED, FLEX_UNDEFINED}},
2640  * {"b", {"a", "b", FLEX_UNDEFINED}} };
2641  *
2642  * std::cout << sf.dropna() << std::endl;
2643  * \endcode
2644  *
2645  * Produces output:
2646  *
2647  * \code{.txt}
2648  * +---+---+
2649  * | a | b |
2650  * +---+---+
2651  * | 1 | a |
2652  * +---+---+
2653  * [1 rows x 2 columns]
2654  * \endcode
2655  *
2656  * \code
2657  * // Drop when all values are missing.
2658  * std::cout << sf.dropna({}, "all") << std::endl;
2659  * \endcode
2660  *
2661  * Produces output:
2662  * \code{.txt}
2663  * +------+---+
2664  * | a | b |
2665  * +------+---+
2666  * | 1 | a |
2667  * | None | b |
2668  * +------+---+
2669  * [2 rows x 2 columns]
2670  *
2671  * \endcode
2672  * Example:
2673  * \code
2674  * // Drop rows where column "a" has a missing value.
2675  * std::cout << sf.dropna({"a"}) << std::endl;
2676  * \endcode
2677  *
2678  * Produces output:
2679  * \code{.txt}
2680  * +---+---+
2681  * | a | b |
2682  * +---+---+
2683  * | 1 | a |
2684  * +---+---+
2685  * [1 rows x 2 columns]
2686  * \endcode
2687  *
2688  * \see dropna_split
2689  */
2690  gl_sframe dropna(const std::vector<std::string>& columns = std::vector<std::string>(),
2691  std::string how = "any", bool recursive = false) const;
2692 
2693  /**
2694  * Split rows with missing values from this \ref gl_sframe. This function has
2695  * the same functionality as dropna, but returns a pair of two \ref
2696  * gl_sframe objects. The first item is the expected output from dropna, and
2697  * the second item contains all the rows filtered out by the "dropna"
2698  * algorithm.
2699  *
2700  * \param columns Optional. The columns to use when looking for missing values.
2701  * By default, all columns are used.
2702  *
2703  * \param how Optional. Specifies whether a row should be dropped if at least
2704  * one column has missing values, or if all columns have missing values.
2705  * "any" is default.
2706  *
2707  * \param recursive Optional. Defaults to false. If true, it will recursively
 * check whether a cell contains
2708  * nan or not. This is handy for nested data structures like list, dictionary.
2709  * For instance, {{FLEX_UNDEFINED, 1}, {1}} will be treated as nan and will be removed
2710  * if recursive is set to be true. Otherwise it won't be treated as nan-value.
2711  *
2712  * Example:
2713  * \code
2714  * gl_sframe sf { {"a", {1, FLEX_UNDEFINED, FLEX_UNDEFINED}},
2715  * {"b", {"a", "b", FLEX_UNDEFINED}} };
2716  * gl_sframe good, bad;
2717  * std::tie(good, bad) = sf.dropna_split();
2718  * std::cout << good << std::endl;
2719  * \endcode
2720  *
2721  * Produces output:
2722  * \code{.txt}
2723  * +---+---+
2724  * | a | b |
2725  * +---+---+
2726  * | 1 | a |
2727  * +---+---+
2728  * [1 rows x 2 columns]
2729  *
2730  * \endcode
2731  *
2732  * Example:
2733  * \code
2734  * std::cout << bad << std::endl;
2735  * \endcode
2736  *
2737  * Produces output:
2738  * \code{.txt}
2739  * +------+------+
2740  * | a | b |
2741  * +------+------+
2742  * | None | b |
2743  * | None | None |
2744  * +------+------+
2745  * [2 rows x 2 columns]
2746  * \endcode
2747  *
2748  * \see dropna
2749  */
2750  std::pair<gl_sframe, gl_sframe> dropna_split(
2751  const std::vector<std::string>& columns=std::vector<std::string>(),
2752  std::string how = "any", bool recursive = false) const;
2753 
2754  /**
2755  * Fill all missing values with a given value in a given column. If the
2756  * "value" is not the same type as the values in "column", this method
2757  * attempts to convert the value to the original column's type. If this
2758  * fails, an error is raised.
2759  *
2760  * \param column The name of the column to modify.
2761  *
2762  * \param value The value used to replace all missing values.
2763  *
2769  * Example:
2770  * \code
2771  * gl_sframe sf {{"a", {1, FLEX_UNDEFINED, FLEX_UNDEFINED}},
2772  * {"b", {"13.1", "17.2", FLEX_UNDEFINED}}};
2773  * sf = sf.fillna("a", 0);
2774  * std::cout << sf << std::endl;
2775  * \endcode
2776  *
2777  * Produces output:
2778  * \code{.txt}
2779  * +---+------+
2780  * | a | b |
2781  * +---+------+
2782  * | 1 | 13.1 |
2783  * | 0 | 17.2 |
2784  * | 0 | None |
2785  * +---+------+
2786  * [3 rows x 2 columns]
2787  * \endcode
2788  *
2789  * \see dropna
2790  */
2791  gl_sframe fillna(const std::string& column, flexible_type value) const;
2792 
2793  /**
2794  * Returns a new \ref gl_sframe with a new column that numbers each row
2795  * sequentially. By default the count starts at 0, but a different starting
2796  * value can be given via "start" (an unsigned size_t). The new column will be named with
2797  * the given column name. An error will be raised if the given column
2798  * name already exists in the \ref gl_sframe.
2799  *
2800  * \param column_name Optional. The name of the new column that will hold the
2801  * row numbers. Defaults to "id".
2802  *
2803  * \param start Optional. The number used to start the row number count.
 * Defaults to 0.
2804  *
2805  * Example:
2806  * \code
2807  * gl_sframe sf{ {"a", {1, FLEX_UNDEFINED, FLEX_UNDEFINED}},
2808  * {"b", {"a", "b", FLEX_UNDEFINED}} };
2809  * std::cout << sf.add_row_number() << std::endl;
2810  * \endcode
2811  *
2812  * Produces output:
2813  * \code{.txt}
2814  * +----+------+------+
2815  * | id | a | b |
2816  * +----+------+------+
2817  * | 0 | 1 | a |
2818  * | 1 | None | b |
2819  * | 2 | None | None |
2820  * +----+------+------+
2821  * [3 rows x 3 columns]
2822  * \endcode
2823  */
2824  gl_sframe add_row_number(const std::string& column_name = "id", size_t start = 0) const;
2825 
  // Friend declaration: gives the stream-output operator access to the
  // non-public members of gl_sframe.
2826  friend std::ostream& operator<<(std::ostream& out, const gl_sframe& other);
2827 
  // Returns a shared pointer to a unity_sframe. Presumably this is the
  // underlying m_sframe proxy; confirm against the implementation.
2828  virtual std::shared_ptr<unity_sframe> get_proxy() const;
2829 
2830 
2831  private:
  // NOTE(review): presumably (re)creates the underlying m_sframe; the
  // definition is not visible here, so confirm before relying on it.
2832  void instantiate_new();
2833 
  // Shared handle to the unity_sframe that backs this gl_sframe.
2834  std::shared_ptr<unity_sframe> m_sframe;
2835 
  // Returns a shared sframe_reader; gl_sframe_range takes one in its
  // constructor, so this is presumably its source (TODO confirm).
2836  std::shared_ptr<sframe_reader> get_sframe_reader() const;
2837 };
2838 
2839 /**
2840  * Provides printing of the gl_sframe.
 *
 * \param out The output stream to write to.
 * \param other The gl_sframe to print.
 * \return A reference to a std::ostream (per the declaration); presumably
 * "out", to allow chaining.
2841  */
2842 std::ostream& operator<<(std::ostream& out, const gl_sframe& other);
2843 
2844 
2845 /**
2846  * A range object providing one pass iterators over part or all of a gl_sframe.
2847  *See \ref gl_sframe::range_iterator for usage examples.
2848  *
2849  * \see gl_sframe::range_iterator
2850  */
2852  public:
2853  typedef sframe_rows::row type;
2854 
2855  gl_sframe_range(std::shared_ptr<sframe_reader> m_sframe_reader,
2856  size_t start, size_t end);
2857  gl_sframe_range(const gl_sframe_range&) = default;
2858  gl_sframe_range(gl_sframe_range&&) = default;
2859  gl_sframe_range& operator=(const gl_sframe_range&) = default;
2860  gl_sframe_range& operator=(gl_sframe_range&&) = default;
2861 
2862  /// Iterator type. Single-pass (boost::single_pass_traversal_tag); dereferences to a const sframe_rows::row&.
2863  struct iterator:
2864  public boost::iterator_facade<iterator,
2865  const sframe_rows::row&, boost::single_pass_traversal_tag> {
2866  public:
2867  iterator() = default;
2868  iterator(const iterator&) = default;
2869  iterator(iterator&&) = default;
2870  iterator& operator=(const iterator&) = default;
2871  iterator& operator=(iterator&&) = default;
2872 
2873  iterator(gl_sframe_range& range, bool is_start);  // NOTE(review): is_start presumably selects the begin vs. end position -- confirm in the implementation
2874  private:
2875  friend class boost::iterator_core_access;  // lets iterator_facade call the private hooks declared below
2876  void increment();
2877  void advance(size_t n);
2878  inline bool equal(const iterator& other) const {
2879  return m_counter == other.m_counter;  // equality is by position only; m_owner is not compared
2880  }
2881  const type& dereference() const;
2882  size_t m_counter = 0;
2883  gl_sframe_range* m_owner = NULL;  // raw pointer, presumably to the range passed to the constructor (TODO confirm); null when default-constructed
2884  };
2885 
2886  /// const_iterator type
2888 
2889  /**
2890  * Returns an iterator to the start of the range.
2891  * Once the iterator is advanced, later calls to begin() have undefined
2892  * behavior.
2893  *
2894  * The returned iterator is invalidated once the parent range_iterator is
2895  * destroyed.
2896  */
2897  iterator begin();
2898 
2899  /**
2900  * Returns an iterator to the end of the range.
2901  *
2902  * The returned iterator is invalidated once the parent range_iterator is
2903  * destroyed.
2904  */
2905  iterator end();
2906  private:
2907  std::shared_ptr<sframe_reader_buffer> m_sframe_reader_buffer;
2908 };
2909 
2910 
2911 /**
2912  * \ingroup group_glsdk
2913  * A mutable reference to a column in a gl_sframe.
2914  * Used to enable assignment to a column, e.g.
2915  * \code
2916  * sf["a"] = gl_sarray...
2917  * \endcode
2918  */
2920  public:
2921  gl_sarray_reference() = delete;
2924  gl_sarray_reference& operator=(const gl_sarray_reference&);  // assignment is declared from another reference, a gl_sarray, or a single flexible_type value
2925  gl_sarray_reference& operator=(const gl_sarray&);
2926  gl_sarray_reference& operator=(const flexible_type& value);
2927  virtual std::shared_ptr<unity_sarray> get_proxy() const;
2928  private:
2929  gl_sarray_reference(const gl_sarray_reference&) = default;  // copy construction is private
2930 
2931  gl_sarray_reference(gl_sframe& sf, std::string column_name);
2932 
2933  gl_sframe& m_sf;  // held by reference, so the gl_sframe must outlive this object
2934  std::string m_column_name;
2935  friend class gl_sframe;  // grants gl_sframe access to the private constructors above
2936 };
2937 
2938 
2939 /**
2940  * \ingroup group_glsdk
2941  * A read-only reference to a column in a const gl_sframe.
2942  * Declares no assignment operators; presumably used to enable reads such as
2943  * \code
2944  * gl_sarray col = sf["a"];  // where sf is a const gl_sframe
2945  * \endcode
2946  */
2948  public:
2949  const_gl_sarray_reference() = delete;
2951  virtual std::shared_ptr<unity_sarray> get_proxy() const;
2952  private:
2954 
2955  const_gl_sarray_reference(const gl_sframe& sf, std::string column_name);
2956 
2957  const gl_sframe& m_sf;  // held by const reference, so the gl_sframe must outlive this object
2958  std::string m_column_name;
2959  friend class gl_sframe;  // grants gl_sframe access to the private constructor above
2960 };
2961 
2962 
2963 class gl_sframe_writer_impl;
2964 
2965 /**
2966  * \ingroup group_glsdk
2967  * Provides the ability to write \ref gl_sframe.
2968  * The gl_sframe is internally cut into a collection of segments. Each segment
2969  * can be written to independently, and the resultant SFrame is the effective
2970  * concatenation of all the segments.
2971  *
2972  * \code
2973  * // Writes an SFrame of 4 segments, and 2 columns "a" and "b", both of which
2974  * // are integers.
2975  * gl_sframe_writer writer({"a","b"},
2976  * {flex_type_enum::INTEGER, flex_type_enum::INTEGER},
2977  * 4);
2978  *
2979  * // for each segment, write a bunch of (i, i) pair values.
2980  * // segment 0 receives (0,0), (1,1), ..., (9,9),
2981  * // segment 1 receives the same ten rows,
2982  * // etc
2983  * for (size_t seg = 0;seg < 4; ++seg) {
2984  * for (size_t i = 0;i < 10; ++i) {
2985  * writer.write({i, i}, seg);
2986  * }
2987  * }
2988  *
2989  * gl_sframe sa = writer.close();
2990  * // sa is now an SFrame of 40 rows comprising
2991  * // four consecutive sequences of (0,0) to (9,9)
2992  * \endcode
2993  *
2994  * Different segments can be written safely in parallel. It is not safe to
2995  * write to the same segment simultaneously.
2996  */
2998  public:
2999  /**
3000  * Constructs a writer to write a gl_sframe with the given columns.
3001  *
3002  * \param column_names The column names of the SFrame.
3003  *
3004  * \param column_types The type of each column of the SFrame. Everything
3005  * written to the writer (via \ref write) must be of that type, be implicitly
3006  * castable to that type, or be a missing value denoted with FLEX_UNDEFINED.
3007  *
3008  * \param num_segments Optional. The number of segments of the SFrame.
3009  * Adjusting this parameter has little performance impact on the resultant
3010  * gl_sframe. Modifying this value is only helpful for providing writer
3011  * parallelism. Defaults to the number of cores on the machine.
3012  */
3013  gl_sframe_writer(const std::vector<std::string>& column_names,
3014  const std::vector<flex_type_enum>& column_types,
3015  size_t num_segments = (size_t)(-1));
3016 
3017  /**
3018  * Writes a single row to a given segment.
3019  *
3020  * For instance,
3021  * \code
3022  * gl_sframe_writer writer({"a","b"}, {flex_type_enum::FLOAT, flex_type_enum::STRING}, 1);
3023  * writer.write({1.5, "hello"}, 0);
3024  * \endcode
3025  *
3026  * Different segments can be written safely in parallel. It is not safe to
3027  * write to the same segment simultaneously.
3028  *
3029  * \param f The row to write. This should be an array of values, each of the
3030  * requested type (as set in the constructor), castable to the
3031  * requested type, or FLEX_UNDEFINED.
3032  *
3033  * \param segmentid The segment to write to.
3034  */
3035  void write(const std::vector<flexible_type>& f, size_t segmentid);
3036 
3037  /**
3038  * Writes a range of rows to a given segment.
3039  *
3040  * Essentially equivalent to:
3041  * \code
3042  * while(begin != end) write(*begin++, segmentid);
3043  * \endcode
3044  *
3045  * Different segments can be written safely in parallel. It is not safe to
3046  * write to the same segment simultaneously.
3047  *
3048  * \param begin The start iterator of the range to write.
3049  *
3050  * \param end The end iterator of the range to write.
3051  *
3052  * \param segmentid The segment to write to.
3053  */
3054  template <typename T>
3055  void write(T begin, T end, size_t segmentid) {
3056  while (begin != end) {
3057  write((*begin), segmentid);  // two-argument call: resolves to the single-row overload, not this template
3058  ++begin;
3059  }
3060  }
3061 
3062  /**
3063  * Stops all writes and returns the resultant SFrame.
3064  */
3065  gl_sframe close();
3066 
3067  /**
3068  * Returns the number of segments of the SFrame; this is the same value
3069  * provided on construction of the writer.
3070  */
3071  size_t num_segments() const;
3072 
3073  ~gl_sframe_writer();
3074 
3075  private:
3076  std::unique_ptr<gl_sframe_writer_impl> m_writer_impl;
3077 };
3078 
3079 
3080 } // namespace turi
3081 #endif // TURI_UNITY_GL_SFRAME_HPP
groupby_descriptor_type MAX(const std::string &col)
groupby_descriptor_type QUANTILE(const std::string &col, double quantile)
groupby_descriptor_type VARIANCE(const std::string &col)
groupby_descriptor_type MIN(const std::string &col)
iterator const_iterator
const_iterator type
Definition: gl_sframe.hpp:2887
std::shared_ptr< sframe > sort(std::shared_ptr< planner_node > sframe_planner_node, const std::vector< std::string > column_names, const std::vector< size_t > &sort_column_indices, const std::vector< bool > &sort_orders)
groupby_descriptor_type SUM(const std::string &col)
static std::ostream & operator<<(std::ostream &out, const uint128_t &x)
Enables printing of uint128_t values.
groupby_descriptor_type ARGMAX(const std::string &agg, const std::string &out)
groupby_descriptor_type CONCAT(const std::string &col)
groupby_descriptor_type STDV(const std::string &col)
void write(T begin, T end, size_t segmentid)
Definition: gl_sframe.hpp:3055
groupby_descriptor_type COUNT_DISTINCT(const std::string &col)
groupby_descriptor_type AVG(const std::string &col)
std::shared_ptr< group_aggregate_value > m_aggregator
aggregator
Definition: gl_sframe.hpp:67
groupby_descriptor_type MEAN(const std::string &col)
std::set< T > values(const std::map< Key, T > &map)
Definition: stl_util.hpp:386
groupby_descriptor_type COUNT()
groupby_descriptor_type SELECT_ONE(const std::string &col)
groupby_descriptor_type VAR(const std::string &col)
static flexible_type FLEX_UNDEFINED
std::vector< std::string > m_group_columns
columns as input into the aggregator
Definition: gl_sframe.hpp:64
groupby_descriptor_type STD(const std::string &col)
groupby_descriptor_type ARGMIN(const std::string &agg, const std::string &out)
groupby_descriptor_type make_aggregator(const std::vector< std::string > &group_columns, const Args &... args)
Definition: gl_sframe.hpp:99