Turi Create  4.0
gl_sarray.hpp
1 /* Copyright © 2017 Apple Inc. All rights reserved.
2  *
3  * Use of this source code is governed by a BSD-3-clause license that can
4  * be found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause
5  */
6 #ifndef TURI_UNITY_GL_SARRAY_HPP
7 #define TURI_UNITY_GL_SARRAY_HPP
8 #include <cmath>
9 #include <memory>
10 #include <cstddef>
11 #include <string>
12 #include <iostream>
13 #include <core/storage/sframe_data/sframe_rows.hpp>
14 #include <core/storage/sframe_data/group_aggregate_value.hpp>
15 #include <core/data/flexible_type/flexible_type.hpp>
16 #include <visualization/server/plot.hpp>
17 
18 namespace turi {
19 /**************************************************************************/
20 /* */
21 /* Forward Declarations */
22 /* */
23 /**************************************************************************/
24 class unity_sarray;
25 class unity_sarray_base;
26 class gl_sframe;
27 class gl_sarray_range;
28 
29 template <typename T>
30 class sarray;
31 
32 template <typename T>
34 
35 template <typename T>
37 
38 /**
39  * \ingroup group_glsdk
40  * An immutable, homogeneously typed array object backed by persistent storage.
41  *
42  * The gl_sarray is a contiguous column of a single type with missing value
43  * support, and works with disk to support the holding of data that is much
44  * larger than the machine's main memory. Runtime typing of the gl_sarray is
45  * managed through the \ref flexible_type, which is an efficient runtime typed
46  * value. The types supported by the flexible_type are listed in \ref
47  * flex_type_enum.
48  *
49  *
50  * ### Construction
51  *
52  * Abstractly the gl_sarray provides an interface to read and write \ref
53  * flexible_type values where all values have the same type at runtime (for
54  * instance flex_type_enum::INTEGER). The special type \ref
55  * flex_type_enum::UNDEFINED (or the value \ref FLEX_UNDEFINED )
56  * is used to denote a missing value and can be used in combination with any
57  * types.
58  *
59  * For instance:
60  *
61  * \code
62  * // creates an array of 5 integers
63  * gl_sarray g({1,2,3,4,5});
64  *
65  * // creates an array of 5 doubles
66  * gl_sarray g({1.0,2.0,3.0,4.0,5.0});
67  *
68  * // creates an array of 4 doubles with one missing value
69  * gl_sarray g({1.0,2.0,3.0,FLEX_UNDEFINED,5.0});
70  * \endcode
71  *
72  * While the gl_sarray is conceptually immutable, all that really means is that
73  * element-wise modifications are not permitted. However, full SArray assignments
74  * are permitted.
75  *
76  * \code
77  * gl_sarray g({1,2,3,4,5});
78  * gl_sarray s = g + 1;
79  * // s is {2,3,4,5,6}
80  * \endcode
81  *
82  * ### Usage
83  *
84  * The gl_sarray API is designed to very closely mimic the Python SArray API
85  * and supports much of the Python-like capabilities, but in C++.
86  *
87  * For instance, vector operations:
88  * \code
89  * gl_sarray s{1,2,3,4,5};
90  * gl_sarray y{2.0,3.0,2.5,1.5,2.5};
91  * auto t = (s + 10) / y;
92  * \endcode
93  *
94  * Logical filters:
95  * \code
96  * gl_sarray s{1,2,3,4,5};
97  * gl_sarray selector{0,0,1,1,1};
98  * auto t = s[selector];
99  * // t is [3,4,5]
100  *
101  *
102  * gl_sarray s{1,2,3,4,5};
103  * auto t = s[s < 3];
104  * // t is [1,2]
105  * \endcode
106  *
107  * Python Range slicing:
108  * \code
109  * gl_sarray s{1,2,3,4,5};
110  * auto t = s[{0,3}];
111  * auto u = s[{-3,-1}];
112  *
113  * // t is [1,2,3]
114  * // u is [3,4]
115  * \endcode
116  *
117  * And many others.
118  *
119  * The gl_sarray can be read \b inefficiently using operator[]
120  * \code
121  * gl_sarray s{1,2,3,4,5};
122  * int val = s[2];
123  * // val == 3
124  * \endcode
125  *
126  * Or iterated efficiently using the \ref range_iterator
127  * \code
128  * for (const auto& i: sa.range_iterator()) {
129  * ...
130  * }
131  * \endcode
132  *
133  *
134  * The range_iterator materializes the SArray if not already materialized, but
135  * \ref materialize_to_callback can be used to read the SArray without
136  * materialization.
137  *
138  * The gl_sarray can be constructed in a variety of ways:
139  * - If the data to be written is already in memory, it can be created
140  * using the
141  * \ref gl_sarray::gl_sarray(const std::vector<flexible_type>& values, flex_type_enum dtype) "gl_sarray constructor"
142  * - Otherwise, the \ref gl_sarray_writer can be used which provides a simple
143  * write interface.
144  *
145  * ### Python Binding
146  *
147  * When used as an input argument in an SDK function, it permits a Python SArray
148  * to be passed as an argument. When used in an output argument, it will return
149  * a Python SArray.
150  *
151  * For instance:
152  * \code
153  * //
154  * // Compiled as example.so
155  * //
156  * gl_sarray add_one_to_array(gl_sarray data) {
157  * return data + 1;
158  * }
159  * BEGIN_FUNCTION_REGISTRATION
160  * REGISTER_FUNCTION(add_one_to_array, "data");
161  * END_FUNCTION_REGISTRATION
162  * \endcode
163  *
164  * Will allow this to be done in Python:
165  * \code{.py}
166  * import turicreate as gl
167  * import example
168  * sa = gl.SArray([1,2,3,4,5])
169  * ret = example.add_one_to_array(sa)
170  * # ret is now [2,3,4,5,6]
171  * \endcode
172  *
173  * ### Details
174  *
175  * The gl_sarray is internally a reference object. i.e. in the code below,
176  * both a and b will point to the same underlying sarray. However since
177  * gl_sarray's are immutable, this does not introduce any interface quirks.
178  * \code
179  * gl_sarray a{1,2,3};
180  * gl_sarray b = a;
181  * \endcode
182  *
183  * The gl_sarray is also lazy evaluated behind the scenes to minimize disk
184  * access. Thus regardless of the size of the SArray or the complexity of the
185  * lambda operation, this operation will run quickly.
186  * \code
187  * b = (sa.apply(some_complicated_function) + 5) / 2;
188  * \endcode
189  *
190  * This may have the unfortunate effect of hiding errors until materialization
191  * is forced to occur. i.e. it might be some time much later in your code
192  * that errors in some_complicated_function will trigger.
193  *
194  * However, not all operations are lazy and certain operations will force
195  * materialization, and that is a constant target for optimization.
196  *
197  * If you want to force materialization yourself, use \ref materialize()
198  */
199 class gl_sarray {
200  public:
201  /// Constructs an empty SArray
202  gl_sarray();
203 
204  /// Copy Constructor
205  gl_sarray(const gl_sarray&);
206 
207  /// Move Constructor
208  gl_sarray(gl_sarray&&);
209 
210  /// Copy Assignment
211  gl_sarray& operator=(const gl_sarray&);
212 
213  /// Move Assignment
215 
216  /**
217  * Constructs a gl_sarray from a binary SArray saved previously with
218  * \ref save().
219  *
220  * \see save()
221  */
222  explicit gl_sarray(const std::string& directory);
223 
224  /**
225  * Constructs a gl_sarray from an in-memory vector of values.
226  * \code
227  * std::vector<flexible_type> values{1,2,3,4,5};
228  *
229  * // auto infers data type
230  * gl_sarray sa(values);
231  *
232  * // resultant array is of floating point type.
233  * // Automatic type casting is performed internally.
234  * gl_sarray sa(values, flex_type_enum::FLOAT);
235  * \endcode
236  *
237  */
238  gl_sarray(const std::vector<flexible_type>& values,
240 
241  virtual ~gl_sarray();
242 
243  void construct_from_vector(const std::vector<flexible_type>& values,
245 
246  /**
247  * Constructs a gl_sarray from an initializer list of values.
248  *
249  * Type is automatically determined.
250  * \code
251  * // creates an array of 5 integers
252  * gl_sarray g({1,2,3,4,5});
253  *
254  * // creates an array of 5 doubles
255  * gl_sarray g({1.0,2.0,3.0,4.0,5.0});
256  *
257  * // non-contiguous type. Most general type is selected.
258  * // This will result in an array of strings.
259  * gl_sarray g({1,2.0,"3"});
260  * \endcode
261  */
262  gl_sarray(const std::initializer_list<flexible_type>& values);
263 
264 /**************************************************************************/
265 /* */
266 /* Static Constructors */
267 /* */
268 /**************************************************************************/
269 
270  /**
271  * Returns a gl_sarray of the given size filled with a constant value.
272  *
273  * \param value The value to fill the array
274  * \param size The size of the array
275  *
276  * \code
277  * // Construct an SArray consisting of 10 zeroes:
278  * gl_sarray zeros = gl_sarray::from_const(0, 10);
279  * \endcode
280  */
281  static gl_sarray from_const(const flexible_type& value, size_t size);
282 
283  /**
284  * Returns a gl_sarray of a sequence of integer values.
285  * \param start The starting value
286  * \param end One past the last value
287  * \param reverse If the values are in reverse
288  *
289  * \code
290  * // returns a sequence of values from 0 to 99
291  * gl_sarray::from_sequence(0, 100);
292  * // returns a sequence of values from 99 to 0
293  * gl_sarray::from_sequence(0, 100, true);
294  * \endcode
295  */
296  static gl_sarray from_sequence(size_t start, size_t end, bool reverse=false);
297 
298  /**
299  * Constructs an SArray from a JSON record file.
300  *
301  * A json record file contains an array of dictionaries.
302  * Resultant SArray is of dictionary type.
303  */
304  static gl_sarray read_json(const std::string& url);
305 
306  /**************************************************************************/
307  /* */
308  /* Implicit Type Converters */
309  /* */
310  /**************************************************************************/
311  /// \cond TURI_INTERNAL
312  /**
313  * \internal
314  * Implicit conversion from backend unity_sarray objects.
315  */
316  gl_sarray(std::shared_ptr<unity_sarray> sarray);
317  /**
318  * \internal
319  * Implicit conversion from backend unity_sarray_base objects.
320  */
321  gl_sarray(std::shared_ptr<unity_sarray_base> sarray);
322  /**
323  * \internal
324  * Implicit conversion from backend sarray objects.
325  */
326  gl_sarray(std::shared_ptr<sarray<flexible_type> > sarray);
327  /**
328  * \internal
329  * Implicit conversion to backend sarray objects.
330  */
331  operator std::shared_ptr<unity_sarray>() const;
332  /**
333  * \internal
334  * Implicit conversion to backend sarray objects.
335  */
336  operator std::shared_ptr<unity_sarray_base>() const;
337 
338  /**
339  * \internal
340  * Conversion to materialized backend sarray object.
341  */
342  std::shared_ptr<sarray<flexible_type> > materialize_to_sarray() const;
343 
344  /// \endcond
345 
346  /**************************************************************************/
347  /* */
348  /* Operator Overloads */
349  /* */
350  /**************************************************************************/
351 
352  /**
353  * \name Numeric operator overloads.
354  *
355  * Most operators are overloaded and will perform element-wise operations
356  * on the entire array.
357  *
358  * For instance:
359  * \code
360  * gl_sarray a{1,2,3,4,5};
361  * // an array of 5 exclamation marks
362  * gl_sarray b = gl_sarray::from_const("!", 5);
363  *
364  * auto ret = (2 * a - 1).astype(flex_type_enum::STRING) + b;
365  * // results in ret being the array ["1!", "3!", "5!", "7!", "9!"];
366  * \endcode
367  *
368  * Comparison operators will return a gl_sarray of binary integers.
369  * \code
370  * gl_sarray a{1,2,3,4,5};
371  * auto ret = a > 3;
372  * // ret will be an integer array containing [0,0,0,1,1]
373  * \endcode
374  *
375  * Logical and bitwise operators are equivalent: & and && mean the same thing,
376  * as do | and ||. They provide logical element-wise "and" and "or" respectively.
377  *
378  * \code
379  * gl_sarray a{1,2,3,4,5};
380  * auto ret = a > 1 && a <= 4;
381  * // ret will be an integer array containing [0,1,1,1,0]
382  * \endcode
383  *
384  * These are useful for the logical filter operation:
385  * \code
386  * gl_sarray a{1,2,3,4,5};
387  * gl_sarray b = a.astype(flex_type_enum::STRING);
388  * auto ret = b[a > 1 && a <= 4];
389  * // ret will be a string array containing ["2","3","4"]
390  * \endcode
391  *
392  * The logical and bitwise operators can be used with non-integral arrays
393  * in which case all empty values evaluate to False. i.e. for string,
394  * list, and dictionary SArrays, empty values are interpreted as false.
395  *
396  * For instance:
397  * \code
398  * gl_sarray a{"1","","2"}; // sarray of strings
399  * gl_sarray b{1,1,0}; // sarray of integers
400  * auto ret = a && b; // ret is now {1, 0, 0}
401  * \endcode
402  */
403  ///@{
404  gl_sarray operator+(const gl_sarray& other) const;
405  gl_sarray operator-(const gl_sarray& other) const;
406  gl_sarray operator*(const gl_sarray& other) const;
407  gl_sarray operator/(const gl_sarray& other) const;
408  gl_sarray operator<(const gl_sarray& other) const;
409  gl_sarray operator>(const gl_sarray& other) const;
410  gl_sarray operator<=(const gl_sarray& other) const;
411  gl_sarray operator>=(const gl_sarray& other) const;
412  gl_sarray operator==(const gl_sarray& other) const;
413 
414  gl_sarray operator+(const flexible_type& other) const;
415  gl_sarray operator-(const flexible_type& other) const;
416  gl_sarray operator*(const flexible_type& other) const;
417  gl_sarray operator/(const flexible_type& other) const;
418  gl_sarray operator<(const flexible_type& other) const;
419  gl_sarray operator>(const flexible_type& other) const;
420  gl_sarray operator<=(const flexible_type& other) const;
421  gl_sarray operator>=(const flexible_type& other) const;
422  gl_sarray operator==(const flexible_type& other) const;
423 
424  gl_sarray operator+=(const gl_sarray& other);
425  gl_sarray operator-=(const gl_sarray& other);
426  gl_sarray operator*=(const gl_sarray& other);
427  gl_sarray operator/=(const gl_sarray& other);
428 
429  gl_sarray operator+=(const flexible_type& other);
430  gl_sarray operator-=(const flexible_type& other);
431  gl_sarray operator*=(const flexible_type& other);
432  gl_sarray operator/=(const flexible_type& other);
433  gl_sarray operator&&(const gl_sarray& other) const;
434  gl_sarray operator||(const gl_sarray& other) const;
435  gl_sarray operator&(const gl_sarray& other) const;
436  gl_sarray operator|(const gl_sarray& other) const;
437  ///@}
438 
439  /**
440  * Performs an element-wise substring search for "other". The current array
441  * must contain strings and "other" must be a string. Produces a 1 for each
442  * row if "other" is a substring of the row and 0 otherwise.
443  */
444  gl_sarray contains(const flexible_type& other) const;
445 
446  /**
447  * Returns the value at a particular array index; generally inefficient.
448  *
449  * This returns the value of the array at a particular index. Will raise
450  * an exception if the index is out of bounds. This operation is generally
451  * inefficient: the range_iterator() is preferred.
452  */
453  flexible_type operator[](int64_t i) const;
454 
455  /**
456  * Performs a logical filter.
457  *
458  * This function performs a logical filter: i.e. it subselects all the
459  * elements in this array where the corresponding value in the other array
460  * evaluates to true.
461  * \code
462  * gl_sarray a{1,2,3,4,5};
463  * auto ret = a[a > 1 && a <= 4];
464  * // ret is now the array [2,3,4]
465  * \endcode
466  */
467  gl_sarray operator[](const gl_sarray& slice) const;
468 
469  /**
470  * Performs a slice Python style.
471  *
472  * \param slice A list of 2 or 3 values. If 2 values, this is interpreted as
473  * {start, end} indices, with an implicit value of step = 1.
474  * If 3 values, this is interpreted as {start, step, end}.
475  * Values at the positions [start, start+step, start+2*step, ...] are returned
476  * until end (exclusive) is reached. Negative start and end values are
477  * interpreted as offsets from the end of the array.
478  *
479  * Given a gl_sarray
480  * \code
481  * gl_sarray a{1,2,3,4,5,6,7,8,9,10};
482  * \endcode
483  *
484  * Slicing a consecutive range:
485  * \code
486  * auto ret = a[{1,4}]; // start at index 1, end at index 4
487  * // ret is the array [2,3,4]
488  * \endcode
489  *
490  * Slicing a range with a step:
491  * \code
492  * auto ret = a[{1,2,8}]; // start at index 1, end at index 8 with step size 2
493  * // ret is the array [2,4,6,8]
494  * \endcode
495  *
496  * Using negative indexing:
497  * \code
498  * auto ret = a[{-3,-1}]; // start at end - 3, end at index end - 1
499  * // ret is the array [8,9]
500  * \endcode
501  */
502  gl_sarray operator[](const std::initializer_list<int64_t>& slice) const;
503 
504  /**************************************************************************/
505  /* */
506  /* Make Friends */
507  /* */
508  /**************************************************************************/
509  friend gl_sarray operator+(const flexible_type& opnd, const gl_sarray& opnd2);
510  friend gl_sarray operator-(const flexible_type& opnd, const gl_sarray& opnd2);
511  friend gl_sarray operator*(const flexible_type& opnd, const gl_sarray& opnd2);
512  friend gl_sarray operator/(const flexible_type& opnd, const gl_sarray& opnd2);
513  friend gl_sarray operator<(const flexible_type& opnd, const gl_sarray& opnd2);
514  friend gl_sarray operator>(const flexible_type& opnd, const gl_sarray& opnd2);
515  friend gl_sarray operator<=(const flexible_type& opnd, const gl_sarray& opnd2);
516  friend gl_sarray operator>=(const flexible_type& opnd, const gl_sarray& opnd2);
517  friend gl_sarray operator==(const flexible_type& opnd, const gl_sarray& opnd2);
518 
519 
520  /**************************************************************************/
521  /* */
522  /* Iterators */
523  /* */
524  /**************************************************************************/
525  friend class gl_sarray_range;
526 
527 
528  /**
529  * Calls a callback function passing each row of the SArray.
530  *
531  * This does not materialize the array if not necessary.
532  * The callback may be called in parallel in which case the argument provides
533  * a thread number. The function should return false, but may return
534  * true at anytime to quit the iteration process. It may also throw exceptions
535  * which will be forwarded to the caller of this function.
536  *
537  * Each call to the callback passes:
538  * - a thread id,
539  * - a shared_ptr to an sframe_rows object
540  *
541  * The sframe_rows object looks like a vector<vector<flexible_type>>.
542  * i.e. to look at all the rows, you need to write:
543  *
544  * \code
545  * sa.materialize_to_callback([&](size_t, const std::shared_ptr<sframe_rows>& rows) {
546  * for(const auto& row: *rows) {
547  * // each row looks like an std::vector<flexible_type>
548  * // and can be cast to a vector<flexible_type> if necessary
549  *
550  * // But since this is an sarray, the element you want is always in
551  * // row[0]
552  * }
553  * });
554  * \endcode
555  *
556  * \param callback The callback to call
557  * \param nthreads Number of threads. If not specified, #cpus is used
558  */
560  std::function<bool(size_t, const std::shared_ptr<sframe_rows>&)> callback,
561  size_t nthreads = (size_t)(-1));
562 
563 
564  /**
565  * Returns a one pass range object with begin() and end() iterators.
566  *
567  * This will materialize the array.
568  *
569  * See \ref materialize_to_callback for a lazy version.
570  *
571  * \param start The starting index of the range
572  * \param end The ending index of the range
573  *
574  * \code
575  * // create a sequence of 1,000 integer values
576  * gl_sarray sa = gl_sarray::from_sequence(0,1000);
577  *
578  * // get a range over the entire array
579  * auto ra = sa.range_iterator();
580  * auto iter = ra.begin();
581  * while (iter != ra.end()) {
582  * std::cout << *iter;
583  * ++iter;
584  * }
585  * \endcode
586  *
587  * Or more compactly with C++11 syntax:
588  * \code
589  * for(const auto& val: sa.range_iterator()) {
590  * std::cout << val << "\n";
591  * }
592  * \endcode
593  *
594  * The range returned only supports one pass. The outcome of a second call to
595  * begin() is undefined after any iterator is advanced.
596  *
597  * \see gl_sarray_range
598  */
599  gl_sarray_range range_iterator(size_t start=0, size_t end=(size_t)(-1)) const;
600 
601  /**************************************************************************/
602  /* */
603  /* All other functions */
604  /* */
605  /**************************************************************************/
606 
607  /**
608  *
609  * Saves the gl_sarray to file.
610  *
611  * When format is "binary" (default), the saved SArray will be in a directory
612  * named with the `directory` parameter. When format is "text" or "csv",
613  * it is saved as a single human readable text file.
614  *
615  * \param directory A local path or a remote URL. If format is 'text', it
616  * will be saved as a text file. If format is 'binary', a directory will be
617  * created at the location which will contain the SArray.
618  *
619  * \param format Either "binary", "text", "csv". Defaults to "binary". optional.
620  * Format in which to save the SArray. Binary saved SArrays can be
621  * loaded much faster and without any format conversion losses.
622  * 'text' and 'csv' are synonymous: Each SArray row will be written
623  * as a single line in an output text file. If not
624  * given, will try to infer the format from filename given. If file
625  * name ends with 'csv', 'txt' or '.csv.gz', then save as 'csv' format,
626  * otherwise save as 'binary' format.
627  */
628  void save(const std::string& directory, const std::string& format="binary") const;
629 
630  /**
631  * The size of the SArray.
632  */
633  size_t size() const;
634 
635  /**
636  * True if size() == 0.
637  */
638  bool empty() const;
639 
640  /**
641  * Returns data type of the gl_sarray.
642  *
643  * \code
644  * gl_sarray sa{1,2,3,4,5};
645  * flex_type_enum dtype = sa.dtype(); // dtype is flex_type_enum::INTEGER
646  *
647  * gl_sarray sa{"1","2","3","4","5"};
648  * flex_type_enum dtype = sa.dtype(); // dtype is flex_type_enum::STRING
649  * \endcode
650  */
651  flex_type_enum dtype() const;
652 
653  /**
654  * For a gl_sarray that is lazily evaluated, force persist this sarray to disk,
655  * committing all lazy evaluated operations.
656  *
657  * \see is_materialized
658  */
659  void materialize() const;
660 
661  /**
662  * Returns whether or not the sarray has been materialized.
663  *
664  * \see materialize
665  */
666  bool is_materialized() const;
667 
668  /**
669  * Returns an gl_sarray which contains the first n rows of this gl_sarray.
670  *
671  * \param n The number of rows to fetch.
672  * \code
673  * gl_sarray sa({0,1,2,3,4,5,6,7,8,9});
674  * auto ret = sa.head(5); // an array of values [0,1,2,3,4]
675  * \endcode
676  */
677  gl_sarray head(size_t n) const;
678 
679  /**
680  * Returns an gl_sarray which contains the last n rows of this gl_sarray.
681  *
682  * \param n The number of rows to fetch.
683  * \code
684  * gl_sarray sa({0,1,2,3,4,5,6,7,8,9});
685  * auto ret = sa.tail(5); // an array of values [5,6,7,8,9]
686  * \endcode
687  */
688  gl_sarray tail(size_t n) const;
689  /**
690  *
691  * Count words in the gl_sarray.
692  *
693  * \param to_lower Optional. If True, all words are converted to lower case
694  * before counting.
695  *
696  * Return an gl_sarray of dictionary type where each
697  * element contains the word count for each word that appeared in the
698  * corresponding input element. The words are split on all whitespace and
699  * punctuation characters. Only works if this SArray is of string type.
700  * Parameters:
701  *
702  * \code
703  * gl_sarray sa({"The quick brown fox jumps.",
704  * "Word word WORD, word!!!word"});
705  * auto ret = sa.count_words();
706  * // output array is of type flex_type_enum::DICT and contains
707  * // [{'quick': 1, 'brown': 1, 'jumps': 1, 'fox': 1, 'the': 1}, {'word': 5}]
708  * \endcode
709  *
710  * \see count_ngrams
711  */
712  gl_sarray count_words(bool to_lower=true, turi::flex_list delimiters={"\r", "\v", "\n", "\f", "\t", " "}) const;
713 
714 
715  /**
716  * Return an SArray of dict type where each element contains the count for
717  * each of the n-grams that appear in the corresponding input element. The
718  * n-grams can be specified to be either character n-grams or word n-grams. The
719  * input SArray must contain strings. Parameters:
720  *
721  * \param n Optional. The number of words in each n-gram. An n value of 1
722  * returns word counts. Defaults to 2.
723  *
724  * \param method Optional. Either "word" or "character". If “word”, the
725  * function performs a count of word n-grams. If “character”, does a character
726  * n-gram count. Defaults to "word".
727  *
728  * \param to_lower Optional. If true, all words are converted to lower case
729  * before counting. Defaults to true.
730  *
731  * \param ignore_space Optional. If method is “character”, indicates if
732  * spaces between words are counted as part of the n-gram. For instance, with
733  * the input SArray element of “fun games”, if this parameter is set to False
734  * one tri-gram would be ‘n g’. If ignore_space is set to True, there would be
735  * no such tri-gram (there would still be ‘nga’). This parameter has no effect
736  * if the method is set to “word”. Defaults to true.
737  *
738  * \code
739  * gl_sarray sa({"I like big dogs. I LIKE BIG DOGS."});
740  * gl_sarray ret = sa.count_ngrams(3);
741  * // returns gl_sarray of dictionary type containing
742  * // [{'big dogs i': 1, 'like big dogs': 2, 'dogs i like': 1, 'i like big': 2}]
743  * \endcode
744  * \code
745  * gl_sarray sa({"Fun. Is. Fun"});
746  * gl_sarray ret = sa.count_ngrams(3, "character");
747  * // returns gl_sarray of dictionary type containing
748  * // [{'fun': 2, 'nis': 1, 'sfu': 1, 'isf': 1, 'uni': 1}]
749  * \endcode
750  *
751  * \see count_words
752  */
753  gl_sarray count_ngrams(size_t n=2, std::string method="word",
754  bool to_lower=true, bool ignore_space=true) const;
755 
756  /**
757  * Filter an SArray of dictionary type by the given keys. By default, all
758  * keys that are in the provided list in "keys" are \b excluded from the
759  * returned SArray.
760  *
761  * \param keys A collection of keys to trim down the elements in the
762  * SArray.
763  *
764  * \param exclude Optional If True, all keys that are in the input key list
765  * are removed. If False, only keys that are in the input key list are
766  * retained. Defaults to true.
767  *
768  * \code
769  * gl_sarray sa({flex_dict{{"this",1}, {"is",1}, {"dog",2}},
770  * flex_dict{{"this", 2}, {"are",2}, {"cat", 1}} });
771  * gl_sarray ret = sa.dict_trim_by_keys({"this", "is", "and", "are"});
772  * // returns an SArray of dictionary type containing
773  * // [{'dog': 2}, {'cat': 1}]
774  * \endcode
775  *
776  * \see dict_trim_by_values
777  */
778  gl_sarray dict_trim_by_keys(const std::vector<flexible_type>& keys,
779  bool exclude=true) const;
780 
781  /**
782  * Filter dictionary values to a given range (inclusive). Trimming is only
783  * performed on values which can be compared to the bound values. Fails on
784  * SArrays whose data type is not ``dict``.
785  *
786  * \param lower Optional. The lowest dictionary value that would be retained
787  * in the result. If FLEX_UNDEFINED , lower bound is not applied. Defaults to
788  * FLEX_UNDEFINED.
789  *
790  * \param upper Optional. The highest dictionary value that would be retained
791  * in the result. If FLEX_UNDEFINED, upper bound is not applied. Defaults to
792  * FLEX_UNDEFINED.
793  *
794  * Example:
795  * \code
796  * auto sa = gl_sarray({flex_dict{{"this",1},{"is",5},{"dog",7}},
797  * flex_dict{{"this", 2},{"are",1},{"cat", 5}} });
798  * std::cout << sa.dict_trim_by_values(2,5);
799  * std::cout << sa.dict_trim_by_values(FLEX_UNDEFINED, 5);
800  * \endcode
801  *
802  * Produces output:
803  * \code{.txt}
804  * dtype: dict
805  * Rows: 2
806  * [{'is': 5}, {'this': 2, 'cat': 5}]
807  *
808  * dtype: dict
809  * Rows: 2
810  * [{'this': 1, 'is': 5}, {'this': 2, 'are': 1, 'cat': 5}]
811  * \endcode
812  *
813  * \see dict_trim_by_keys
814  */
816  const flexible_type& upper = FLEX_UNDEFINED) const;
817  /**
818  * Create an gl_sarray that contains all the keys from each dictionary
819  * element as a list. Fails on gl_sarray objects whose data type is not "dict".
820  *
821  *
822  * Example:
823  * \code
824  * auto sa = gl_sarray({flex_dict{{"this",1},{ "is",5},{ "dog",7}},
825  * flex_dict{{"this", 2},{ "are", 1},{ "cat", 5}}});
826  * std::cout << sa.dict_keys();
827  * \endcode
828  *
829  * Produces output:
830  * \code{.txt}
831  * dtype: list
832  * Rows: 2
833  * [['this', 'is', 'dog'], ['this', 'are', 'cat']]
834  * \endcode
835  *
836  * \see dict_values
837  */
838  gl_sarray dict_keys() const;
839 
840  /**
841  * Create an \ref gl_sarray that contains all the values from each dictionary
842  * element as a list. Fails on \ref gl_sarray objects whose data type is not
843  * "dict".
844  *
845  *
846  * Example:
847  * \code
848  * auto sa = gl_sarray({flex_dict{{"this",1},{"is",5},{"dog",7}},
849  * flex_dict{{"this", 2},{"are", 1},{"cat", 5}}});
850  * std::cout << sa.dict_values();
851  * \endcode
852  *
853  * Produces output:
854  * \code{.txt}
855  * dtype: list
856  * Rows: 2
857  * [[1, 5, 7], [2, 1, 5]]
858  * \endcode
859  *
860  * \see dict_keys
861  */
862  gl_sarray dict_values() const;
863 
864  /**
865  * Create a boolean \ref gl_sarray by checking the keys of an \ref gl_sarray
866  * of dictionaries. An element of the output \ref gl_sarray is True if the
867  * corresponding input element's dictionary has any of the given keys. Fails
868  * on \ref gl_sarray objects whose data type is not "dict".
869  *
870  * \param keys A list of key values to check each dictionary against.
871  *
872  * Example:
873  * \code
874  * auto sa = gl_sarray({flex_dict{{"this",1},{ "is",5},{ "dog",7}},
875  * flex_dict{{"animal",1}},
876  * flex_dict{{"this", 2},{ "are", 1},{ "cat", 5}}});
877  * std::cout << sa.dict_has_any_keys({"is", "this", "are"});
878  * \endcode
879  *
880  * Produces output:
881  * \code{.txt}
882  * dtype: int
883  * Rows: 3
884  * [1, 0, 1]
885  * \endcode
886  *
887  * \see dict_has_all_keys
888  */
889  gl_sarray dict_has_any_keys(const std::vector<flexible_type>& keys) const;
890 
891  /**
892  * Create a boolean \ref gl_sarray by checking the keys of an \ref gl_sarray
893  * of dictionaries. An element of the output \ref gl_sarray is True if the
894  * corresponding input element's dictionary has all of the given keys. Fails
895  * on \ref gl_sarray objects whose data type is not "dict".
896  *
897  * \param keys A list of key values to check each dictionary against.
898  *
899  * Example:
900  * \code
901  * auto sa = gl_sarray({flex_dict{{"this",1},{"is",5},{"dog",7}},
902  * flex_dict{{"this", 2},{"are", 1},{"cat", 5}}});
903  * std::cout << sa.dict_has_all_keys({"is", "this"});
904  * \endcode
905  *
906  * Produces output:
907  * \code{.txt}
908  * dtype: int
909  * Rows: 2
910  * [1, 0]
911  * \endcode
912  *
913  * \see dict_has_any_keys
914  */
915  gl_sarray dict_has_all_keys(const std::vector<flexible_type>& keys) const;
916 
917 
918  /**
919  * Transform each element of the \ref gl_sarray by a given function. The
920  * result \ref gl_sarray is of type "dtype". "fn" should be a function that
921  * returns exactly one value which can be cast into the type specified by
922  * "dtype".
923  *
924  * \param fn The function to transform each element. Must return exactly one
925  * value which can be cast into the type specified by "dtype".
926  *
927  * \param dtype The data type of the new \ref gl_sarray.
928  *
929  * \param skip_undefined Optional. If true, will not apply "fn" to
930  * any undefined values. Defaults to true.
931  *
932  * Example:
933  * \code
934  * auto sa = gl_sarray({1,2,3});
935  * std::cout << sa.apply([](const flexible_type& x) { return x*2; },
936  * flex_type_enum::INTEGER);
937  * \endcode
938  *
939  * Produces output:
940  * \code{.txt}
941  * dtype: int
942  * Rows: 3
943  * [2, 4, 6]
944  * \endcode
945  *
946  * \see gl_sframe::apply
947  */
948  gl_sarray apply(std::function<flexible_type(const flexible_type&)> fn,
949  flex_type_enum dtype,
950  bool skip_undefined=true) const;
951 
952  /**
953  * Filter this \ref gl_sarray by a function. Returns a new \ref gl_sarray
954  * filtered by this \ref gl_sarray. If "fn" evaluates an element to true,
955  * this element is copied to the new \ref gl_sarray. If not, it isn't. Throws
956  * an exception if the return type of "fn" is not castable to a boolean
957  * value.
958  *
959  * \param fn Function that filters the \ref gl_sarray. Must evaluate to bool
960  * or int.
961  *
962  * \param skip_undefined Optional. If true, will not apply fn to any
963  * undefined values.
964  *
965  * Example:
966  * \code
967  * auto sa = gl_sarray({1,2,3});
968  * std::cout << sa.filter([](flexible_type x){ return x < 3; });
969  * \endcode
970  *
971  * Produces output:
972  * \code{.txt}
973  * dtype: int
974  * Rows: 2
975  * [1, 2]
976  * \endcode
977  *
978  * This function is equivalent to the combination of a logical_filter and
979  * an apply.
980  * \code
981  * res = sa[sa.apply(fn)];
982  * \endcode
983  */
984  gl_sarray filter(std::function<bool(const flexible_type&)> fn,
985  bool skip_undefined=true) const;
986 
987  /**
988  * Create an \ref gl_sarray which contains a subsample of the current
989  * \ref gl_sarray.
990  *
991  * \param fraction The fraction of the rows to fetch. Must be between 0 and 1.
992  *
993  * Example:
994  * \code
995  * auto sa = gl_sarray::from_sequence(0, 10);
996  * std::cout << sa.sample(.3);
997  * \endcode
998  *
999  * Produces output:
1000  * \code{.txt}
1001  * dtype: int
1002  * Rows: 3
1003  * [2, 6, 9]
1004  * \endcode
1005  */
1006  gl_sarray sample(double fraction) const;
1007 
1008  /**
1009  * Create an \ref gl_sarray which contains a subsample of the current
1010  * \ref gl_sarray.
1011  *
1012  * \param fraction The fraction of the rows to fetch. Must be between 0 and 1.
1013  *
1014  * \param seed The random seed for the random number generator.
1015  * Deterministic output is obtained if this is set to a constant.
1016  *
1017  * Example:
1018  * \code
1019  * auto sa = gl_sarray::from_sequence(0, 10);
1020  * std::cout << sa.sample(.3, 12345);
1021  * \endcode
1022  *
1023  * Produces output:
1024  * \code{.txt}
1025  * dtype: int
1026  * Rows: 4
1027  * [1, 3, 6, 9]
1028  * \endcode
1029  */
1030  gl_sarray sample(double fraction, size_t seed, bool exact=false) const;
1031 
1032  /**
1033  * Returns an SArray with a hash of each element. seed can be used to change
1034  * the hash function to allow this method to be used for random number generation.
1035  *
1036  * \param seed Defaults to 0. Can be changed to different values to get different hash results.
1037  *
1038  * Example:
1039  * \code
1040  * auto sa = gl_sarray::from_sequence(0,10);
1041  * std::cout << sa.hash(123);
1042  * \endcode
1043  *
1044  * Produces output:
1045  * \code{.txt}
1046  * dtype: int
1047  * Rows: 10
1048  * [-2176393851141330893, 7600995152976636137, -5571280844667425574, -4385410391720336496, -4446257658862464208, -7571182417602171808, 3644372782970789199, 3084542717492096231, 4758268028978242780, -6520852338875851008]
1049  * \endcode
1050  */
1051  gl_sarray hash(size_t seed = 0) const;
1052 
1053  /**
1054  * Return true if every element of the \ref gl_sarray evaluates to true. For
1055  * numeric \ref gl_sarray objects zeros and missing values ("None") evaluate
1056  * to false, while all non-zero, non-missing values evaluate to true. For
1057  * string, list, and dictionary \ref gl_sarray objects, empty values (zero
1058  * length strings, lists or dictionaries) or missing values ("None") evaluate
1059  * to false. All other values evaluate to true. Returns true on an empty
1060  * \ref gl_sarray.
1061  *
1062  * Example:
1063  * \code
1064  * std::cout << gl_sarray({1, FLEX_UNDEFINED}).all();
1065  * std::cout << gl_sarray({1, 0}).all();
1066  * std::cout << gl_sarray({1, 2}).all();
1067  * std::cout << gl_sarray({"hello", "world"}).all();
1068  * std::cout << gl_sarray({"hello", ""}).all();
1069  * std::cout << gl_sarray({}).all();
1070  * \endcode
1071  *
1072  * Produces output:
1073  * \code{.txt}
1074  * 0
1075  * 0
1076  * 1
1077  * 1
1078  * 0
1079  * 1
1080  * \endcode
1081  *
1082  * \see any
1083  */
1084 
1085 
1086 
1087  bool all() const;
1088 
1089  /**
1090  * Return true if any element of the \ref gl_sarray evaluates to true. For
1091  * numeric \ref gl_sarray objects any non-zero value evaluates to true. For
1092  * string, list, and dictionary \ref gl_sarray objects, any element of
1093  * non-zero length evaluates to true. Returns false on an empty \ref
1094  * gl_sarray.
1095  *
1096  * Example:
1097  * \code
1098  * std::cout << gl_sarray({1, FLEX_UNDEFINED}).any();
1099  * std::cout << gl_sarray({1, 0}).any();
1100  * std::cout << gl_sarray({0, 0}).any();
1101  * std::cout << gl_sarray({"hello", "world"}).any();
1102  * std::cout << gl_sarray({"hello", ""}).any();
1103  * std::cout << gl_sarray({"", ""}).any();
1104  * std::cout << gl_sarray({}).any();
1105  * \endcode
1106  *
1107  * Produces output:
1108  * \code{.txt}
1109  * 1
1110  * 1
1111  * 0
1112  * 1
1113  * 1
1114  * 0
1115  * 0
1116  * \endcode
1117  *
1118  * \see all
1119  */
1120  bool any() const;
1121 
1122  /**
1123  * Get maximum numeric value in \ref gl_sarray. Returns FLEX_UNDEFINED on an
1124  * empty \ref gl_sarray. Raises an exception if called on an \ref gl_sarray
1125  * with non-numeric type.
1126  *
1127  * Example:
1128  * \code
1129  * std::cout << gl_sarray({14, 62, 83, 72, 77, 96, 5, 25, 69, 66}).max();
1130  * \endcode
1131  *
1132  * Produces output:
1133  * \code{.txt}
1134  * 96
1135  * \endcode
1136  *
1137  * \see min
1138  */
1139  flexible_type max() const;
1140 
1141  /**
1142  * Get minimum numeric value in \ref gl_sarray. Returns FLEX_UNDEFINED on an
1143  * empty \ref gl_sarray. Raises an exception if called on an \ref gl_sarray
1144  * with non-numeric type.
1145  *
1146  * Example:
1147  * \code
1148  * std::cout << gl_sarray({14, 62, 83, 72, 77, 96, 5, 25, 69, 66}).min();
1149  * \endcode
1150  *
1151  *
1152  * \see max
1153  */
1154  flexible_type min() const;
1155 
1156  /**
1157  * Sum of all values in this \ref gl_sarray.
1158  *
1159  * Raises an exception if called on an \ref gl_sarray of strings, lists, or
1160  * dictionaries. If the \ref gl_sarray contains numeric arrays (flex_vec)
1161  * and all the arrays are the same length, the sum over all the arrays will
1162  * be returned. Returns FLEX_UNDEFINED on an empty \ref gl_sarray. For large
1163  * values, this may overflow without warning.
1164  */
1165  flexible_type sum() const;
1166 
1167  /**
1168  * Mean of all the values in the \ref gl_sarray, or mean image. Returns
1169  * FLEX_UNDEFINED on an empty \ref gl_sarray. Raises an exception if called
1170  * on an \ref gl_sarray with non-numeric type or non-Image type.
1171  */
1172  flexible_type mean() const;
1173 
1174  /**
1175  * Standard deviation of all the values in the \ref gl_sarray.
1176  * Returns FLEX_UNDEFINED on an empty \ref gl_sarray. Raises an exception if
1177  * called on an \ref gl_sarray with non-numeric type.
1178  */
1179  flexible_type std() const;
1180 
1181  /**
1182  * Number of non-zero elements in the \ref gl_sarray.
1183  */
1184  size_t nnz() const;
1185 
1186  /**
1187  * Number of missing elements in the \ref gl_sarray.
1188  */
1189  size_t num_missing() const;
1190 
1191 
1192  /**
1193  * Create a new \ref gl_sarray with all the values cast to str. The string
1194  * format is specified by the 'str_format' parameter.
1195  *
1196  * \param str_format The format to output the string. Default format is
1197  * "%Y-%m-%dT%H:%M:%S%ZP". See the strftime specification for details on
1198  * the format string.
1199  *
1200  * Example:
1201  * \code
1202  *
1203  * boost::posix_time::ptime t(boost::gregorian::date(2011, 1, 1));
1204  * boost::posix_time::ptime epoch(boost::gregorian::date(1970,1,1));
1205  * auto x = (t - epoch).total_seconds();
1206  *
1207  * auto sa = gl_sarray({flex_date_time(x)});
1208  * std::cout << sa.datetime_to_str("%e %b %Y");
1209  * \endcode
1210  *
1211  * Produces output:
1212  * \code{.txt}
1213  * dtype: str
1214  * Rows: 1
1215  * [" 1 Jan 2011"]
1216  * \endcode
1217  *
1218  * \see str_to_datetime
1219  */
1220  gl_sarray datetime_to_str(const std::string& str_format="%Y-%m-%dT%H:%M:%S%ZP") const;
1221 
1222  /**
1223  * Create a new \ref gl_sarray with all the values cast to datetime. The
1224  * string format is specified by the 'str_format' parameter.
1225  *
1226  * \param str_format The format to parse the string. Default format is
1227  * "%Y-%m-%dT%H:%M:%S%ZP". See the strptime specification for details on
1228  * the format string.
1229  *
1230  * Example:
1231  *
1232  * \code
1233  * auto sa = gl_sarray({"20-Oct-2011 09:30:10 GMT-05:30"});
1234  * std::cout << sa.str_to_datetime("%d-%b-%Y %H:%M:%S %ZP");
1235  * \endcode
1236  *
1237  * Produces output:
1238  * \code{.txt}
1239  * dtype: datetime
1240  * Rows: 1
1241  * [20111020T093010]
1242  * \endcode
1243  *
1244  * \see datetime_to_str
1245  */
1246  gl_sarray str_to_datetime(const std::string& str_format="%Y-%m-%dT%H:%M:%S%ZP") const;
1247 
1248 
1249  /**
1250  * Create a new \ref gl_sarray with all the values cast to
1251  * \ref turi::image_type of uniform size.
1252  *
1253  * \param width int The width of the new images.
1254  *
1255  * \param height int The height of the new images.
1256  *
1257  * \param channels int. Number of channels of the new images.
1258  *
1259  * \param undefined_on_failure optional. defaults to true. If true,
1260  * return FLEX_UNDEFINED type instead of Image type on failure. If false,
1261  * raises error upon failure.
1262  *
1263  * \note The declaration below takes no "allow_rounding" argument. The option
1264  * was described as: default false; if true, rounds non-integer values when
1265  * converting to Image type; if false, raises error upon rounding.
1266  */
1267  gl_sarray pixel_array_to_image(size_t width, size_t height, size_t channels=3,
1268  bool undefined_on_failure=true) const;
1269 
1270  /**
1271  * Create a new \ref gl_sarray with all values cast to the given type. Throws
1272  * an exception if the types are not castable to the given type.
1273  *
1274  * \param dtype The type to cast the elements to in \ref gl_sarray
1275  *
1276  * \param undefined_on_failure Optional. Defaults to true. If set to true,
1277  * runtime cast failures will be emitted as missing values rather than
1278  * failing.
1279  *
1280  * Example:
1281  * \code
1282  * auto sa = gl_sarray({"1","2","3","4"});
1283  * std::cout << sa.astype(flex_type_enum::INTEGER);
1284  * \endcode
1285  *
1286  * Produces output:
1287  * \code{.txt}
1288  * dtype: int
1289  * Rows: 4
1290  * [1, 2, 3, 4]
1291  * \endcode
1292  *
1293  * Given an SArray of strings that look like dicts, convert to a dictionary
1294  * type:
1295  * \code
1296  * auto sa = gl_sarray({"{1:2 3:4}", "{a:b c:d}"});
1297  * std::cout << sa.astype(flex_type_enum::DICT);
1298  * \endcode
1299  *
1300  * Produces output:
1301  * \code{.txt}
1302  * dtype: dict
1303  * Rows: 2
1304  * [{1: 2, 3: 4}, {'a': 'b', 'c': 'd'}]
1305  * \endcode
1306  *
1307  */
1308  gl_sarray astype(flex_type_enum dtype, bool undefined_on_failure=true) const;
1309 
1310  /**
1311  * Create a new \ref gl_sarray with each value clipped to be within the given
1312  * bounds. In this case, "clipped" means that values below the lower bound
1313  * will be set to the lower bound value. Values above the upper bound will be
1314  * set to the upper bound value. This function can operate on \ref gl_sarray
1315  * objects of numeric type as well as array type, in which case each
1316  * individual element in each array is clipped. By default "lower" and
1317  * "upper" are set to FLEX_UNDEFINED, which indicates the respective bound
1318  * should be ignored. The method fails if invoked on an \ref gl_sarray of
1319  * non-numeric type.
1320  * \param lower Optional. The lower bound used to clip.
1321  * Ignored if equal to FLEX_UNDEFINED (the default).
1322  *
1323  * \param upper Optional. The upper bound used to clip.
1324  * Ignored if equal to FLEX_UNDEFINED (the default).
1325  *
1326  * Example:
1327  * \code
1328  * auto sa = gl_sarray({1,2,3});
1329  * std::cout << sa.clip(2,2);
1330  * \endcode
1331  *
1332  * Produces output:
1333  * \code{.txt}
1334  * dtype: int
1335  * Rows: 3
1336  * [2, 2, 2]
1337  * \endcode
1338  *
1339  * \see clip_lower
1340  * \see clip_upper
1341  */
1343  flexible_type upper=FLEX_UNDEFINED) const;
1344 
1345  /**
1346  * Create new \ref gl_sarray with all values clipped to the given lower
1347  * bound. This function can operate on numeric arrays, as well as vector
1348  * arrays, in which case each individual element in each vector is clipped.
1349  * Throws an exception if the \ref gl_sarray is empty or the types are
1350  * non-numeric.
1351  *
1352  * \param threshold The lower bound used to clip values.
1353  *
1354  * Example:
1355  * \code
1356  * auto sa = gl_sarray({1,2,3});
1357  * std::cout << sa.clip_lower(2);
1358  * \endcode
1359  *
1360  * Produces output:
1361  * \code{.txt}
1362  * dtype: int
1363  * Rows: 3
1364  * [2, 2, 3]
1365  * \endcode
1366  *
1367  * \see clip
1368  * \see clip_upper
1369  */
1370  gl_sarray clip_lower(flexible_type threshold) const;
1371 
1372  /**
1373  * Create new \ref gl_sarray with all values clipped to the given upper
1374  * bound. This function can operate on numeric arrays, as well as vector
1375  * arrays, in which case each individual element in each vector is clipped.
1376  *
1377  * \param threshold The upper bound used to clip values.
1378  *
1379  * Example:
1380  * \code
1381  * auto sa = gl_sarray({1,2,3});
1382  * std::cout << sa.clip_upper(2);
1383  * \endcode
1384  *
1385  * Produces output:
1386  * \code{.txt}
1387  * dtype: int
1388  * Rows: 3
1389  * [1, 2, 2]
1390  * \endcode
1391  *
1392  * \see clip
1393  * \see clip_lower
1394  */
1395  gl_sarray clip_upper(flexible_type threshold) const;
1396 
1397  /**
1398  * Create new \ref gl_sarray containing only the non-missing values of the
1399  * \ref gl_sarray. A missing value shows up in an \ref gl_sarray as
1400  * 'FLEX_UNDEFINED'. This will also drop NAN values.
1401  */
1402  gl_sarray dropna() const;
1403 
1404  /**
1405  * Create new \ref gl_sarray with all missing values (FLEX_UNDEFINED or NaN)
1406  * filled in with the given value. The size of the new \ref gl_sarray will
1407  * be the same as the original \ref gl_sarray. If the given value is not the
1408  * same type as the values in the \ref gl_sarray, "fillna" will attempt to
1409  * convert the value to the original \ref gl_sarray's type. If this fails, an
1410  * error will be raised.
1411  *
1412  * \param value The value used to replace all missing values
1413  */
1414  gl_sarray fillna(flexible_type value) const;
1415 
1416  /**
1417  * Create an \ref gl_sarray indicating which elements are in the top k.
1418  * Entries are '1' if the corresponding element in the current \ref gl_sarray is a
1419  * part of the top k elements, and '0' if that corresponding element is
1420  * not. Order is descending by default.
1421  *
1422  * \param topk Optional. Defaults to 10. How many elements count as being
1423  * in the top k.
1424  *
1425  * \param reverse Optional. Defaults to false. If true, return the topk
1426  * elements in ascending order
1427  */
1428  gl_sarray topk_index(size_t topk=10, bool reverse=false) const;
1429 
1430  /**
1431  * Append an \ref gl_sarray to the current \ref gl_sarray. Returns a new
1432  * \ref gl_sarray with the rows from both \ref gl_sarray objects. Both
1433  * \ref gl_sarray objects must be of the same type.
1434  *
1435  * \param other Another \ref gl_sarray whose rows are appended to current \ref gl_sarray.
1436  *
1437  * Example:
1438  * \code
1439  * auto sa = gl_sarray({1, 2, 3});
1440  * auto sa2 = gl_sarray({4, 5, 6});
1441  * std::cout << sa.append(sa2);
1442  * \endcode
1443  *
1444  * Produces output:
1445  * \code{.txt}
1446  * dtype: int
1447  * Rows: 6
1448  * [1, 2, 3, 4, 5, 6]
1449  * \endcode
1450  *
1451  * \see gl_sframe::append
1452  */
1453  gl_sarray append(const gl_sarray& other) const;
1454 
1455  /**
1456  * Get all unique values in the current \ref gl_sarray. Raises an error
1457  * if the \ref gl_sarray is of dictionary type. Will not necessarily preserve
1458  * the order of the given \ref gl_sarray in the new \ref gl_sarray.
1459  *
1460  * \see gl_sframe::unique
1461  */
1462  gl_sarray unique() const;
1463 
1464  /**
1465  * Length of each element in the current \ref gl_sarray. Only works on \ref
1466  * gl_sarray objects of dict, array, or list type. If a given element is a
1467  * missing value, then the output element is also a missing value. This
1468  * function is equivalent to the following:
1469  *
1470  * sa_item_len = sa.apply([](const flexible_type& x) {
1471  * return x.get_type() == flex_type_enum::UNDEFINED ? flexible_type(FLEX_UNDEFINED) : flexible_type(x.size());
1472  * }, flex_type_enum::INTEGER);
1473  *
1474  * Example:
1475  * \code
1476  * auto sa = gl_sarray({flex_dict{{"is_restaurant", 1}, {"is_electronics", 0}},
1477  * flex_dict{{"is_restaurant", 1}, {"is_retail", 1}, {"is_electronics", 0}},
1478  * flex_dict{{"is_restaurant", 0}, {"is_retail", 1}, {"is_electronics", 0}},
1479  * flex_dict{{"is_restaurant", 0}},
1480  * flex_dict{{"is_restaurant", 1}, {"is_electronics", 1}},
1481  * FLEX_UNDEFINED});
1482  * std::cout << sa.item_length();
1483  * \endcode
1484  *
1485  * Produces output:
1486  * \code{.txt}
1487  * dtype: int
1488  * Rows: 6
1489  * [2, 3, 3, 1, 2, None]
1490  * \endcode
1491  *
1492  */
1493  gl_sarray item_length() const;
1494 
1495  /**
1496  * Splits an \ref gl_sarray of datetime type to multiple columns, return a
1497  * new \ref gl_sframe that contains expanded columns. A \ref gl_sarray of datetime will be
1498  * split by default into an \ref gl_sframe of 6 columns, one for each
1499  * year/month/day/hour/minute/second element.
1500  *
1501  * When splitting a \ref gl_sarray of datetime type, new columns are named:
1502  * prefix.year, prefix.month, etc. The prefix is set by the parameter
1503  * "column_name_prefix" and defaults to 'X'. If column_name_prefix is
1504  * FLEX_UNDEFINED or empty, then no prefix is used.
1505  *
1506  * If tzone parameter is true, then timezone information is represented
1507  * as one additional float column that shows the offset from
1508  * GMT(0.0) or from UTC.
1509  *
1510  * \param column_name_prefix Optional. If provided, expanded column names
1511  * would start with the given prefix. Defaults to "X".
1512  *
1513  * \param limit Optional. Limits the set of datetime elements to expand.
1514  * Elements are 'year','month','day','hour','minute', and 'second'.
1515  *
1516  * \param tzone Optional. A boolean parameter that determines whether to
1517  * show timezone column or not. Defaults to false.
1518  *
1519  * Example:
1520  * \code
1521  * auto sa = gl_sarray({"20-Oct-2011", "10-Jan-2012"});
1522  * auto date_sarray = sa.str_to_datetime("%d-%b-%Y");
1523  * auto split_sf = date_sarray.split_datetime("", {"day","year"});
1524  * std::cout << split_sf;
1525  * \endcode
1526  *
1527  * Produces output:
1528  * \code{.txt}
1529  * Columns:
1530  * day integer
1531  * year integer
1532  * +----------------+----------------+
1533  * | day | year |
1534  * +----------------+----------------+
1535  * | 20 | 2011 |
1536  * | 10 | 2012 |
1537  * +----------------+----------------+
1538  * [2 rows x 2 columns]
1539  * \endcode
1540  *
1541  */
1542  gl_sframe split_datetime(const std::string& column_name_prefix = "X",
1543  const std::vector<std::string>& limit = {"year","month","day","hour","minute","second"},
1544  bool tzone=false) const;
1545 
1546  /**
1547  * Convert an \ref gl_sarray of list, array, or dict type to an \ref
1548  * gl_sframe with multiple columns.
1549  *
1550  * "unpack" expands an \ref gl_sarray using the values of each
1551  * vector/list/dict as elements in a new \ref gl_sframe of multiple columns.
1552  * For example, an \ref gl_sarray of lists each of length 4 will be expanded
1553  * into an \ref gl_sframe of 4 columns, one for each list element. An \ref
1554  * gl_sarray of lists/arrays of varying size will be expanded to a number of
1555  * columns equal to the longest list/array. An \ref gl_sarray of
1556  * dictionaries will be expanded into as many columns as there are keys.
1557  *
1558  * When unpacking an \ref gl_sarray of list or vector type, new columns are
1559  * named: "column_name_prefix".0, "column_name_prefix".1, etc. If unpacking a
1560  * column of dict type, unpacked columns are named "column_name_prefix".key1,
1561  * "column_name_prefix".key2, etc.
1562  *
1563  * When unpacking an \ref gl_sarray of list or dictionary types, missing
1564  * values in the original element remain as missing values in the resultant
1565  * columns. If the "na_value" parameter is specified, all values equal to
1566  * this given value are also replaced with missing values. In an \ref
1567  * gl_sarray of vector type, NaN is interpreted as a missing value.
1568  *
1569  * \ref gl_sframe::pack_columns() is the reverse effect of unpack.
1570  *
1571  * \param column_name_prefix Optional. If provided, unpacked column
1572  * names would start with the given prefix. Defaults to "X". If the empty
1573  * string is used, no prefix is used.
1574  *
1575  * \param column_types Optional. Column types for the unpacked columns. If
1576  * not provided, column types are automatically inferred from first 100 rows.
1577  * Defaults to an empty vector.
1578  *
1579  * \param na_value Optional. Convert all values that are equal to "na_value"
1580  * to missing value if specified.
1581  *
1582  * \param limit Optional. Limits the set of list/vector/dict keys to unpack.
1583  * For list/vector gl_sarrays, "limit" must contain integer indices.
1584  * For dict gl_sarrays, "limit" must contain dictionary keys.
1585  *
1586  * Example:
1587  * \code
1588  * auto sa = gl_sarray({flex_dict{{"word", "a"},{"count", 1}},
1589  * flex_dict{{"word", "cat"},{"count", 2}},
1590  * flex_dict{{"word", "is"},{"count", 3}},
1591  * flex_dict{{"word", "coming"},{"count", 4}}});
1592  * std::cout << sa.unpack("");
1593  * \endcode
1594  * Produces output:
1595  * \code{.txt}
1596  * Columns:
1597  * count int
1598  * word str
1599  * Rows: 4
1600  * Data:
1601  * +-------+--------+
1602  * | count | word |
1603  * +-------+--------+
1604  * | 1 | a |
1605  * | 2 | cat |
1606  * | 3 | is |
1607  * | 4 | coming |
1608  * +-------+--------+
1609  * [4 rows x 2 columns]
1610  * \endcode
1611  *
1612  * Unpack only the key "word":
1613  * \code
1614  * std::cout << sa.unpack("X", {}, FLEX_UNDEFINED, {"word"});
1615  * \endcode
1616  * Produces output:
1617  * \code{.txt}
1618  * Columns:
1619  * X.word str
1620  * Rows: 4
1621  * Data:
1622  * +--------+
1623  * | X.word |
1624  * +--------+
1625  * | a |
1626  * | cat |
1627  * | is |
1628  * | coming |
1629  * +--------+
1630  * [4 rows x 1 columns]
1631  * \endcode
1632  *
1633  * Convert all zeros to missing values:
1634  * \code
1635  * auto sa2 = gl_sarray({flex_vec{1, 0, 1},
1636  * flex_vec{1, 1, 1},
1637  * flex_vec{0, 1}});
1638  * std::cout << sa2.unpack("X", {flex_type_enum::INTEGER,
1639  * flex_type_enum::INTEGER,
1640  * flex_type_enum::INTEGER}, 0);
1641  * \endcode
1642  * Produces output:
1643  * \code{.txt}
1644  * Columns:
1645  * X.0 int
1646  * X.1 int
1647  * X.2 int
1648  * Rows: 3
1649  * Data:
1650  * +------+------+------+
1651  * | X.0 | X.1 | X.2 |
1652  * +------+------+------+
1653  * | 1 | None | 1 |
1654  * | 1 | 1 | 1 |
1655  * | None | 1 | None |
1656  * +------+------+------+
1657  * [3 rows x 3 columns]
1658  * \endcode
1659  */
1660  gl_sframe unpack(const std::string& column_name_prefix = "X",
1661  const std::vector<flex_type_enum>& column_types = std::vector<flex_type_enum>(),
1662  const flexible_type& na_value = FLEX_UNDEFINED,
1663  const std::vector<flexible_type>& limit = std::vector<flexible_type>()) const;
1664 
1665  /**
1666  * Sort all values in this \ref gl_sarray. Sort only works for a gl_sarray of
1667  * type str, int and float; otherwise an exception is raised. Creates a
1668  * new, sorted \ref gl_sarray.
1669  *
1670  * \param ascending Optional. Defaults to True. If true, the sarray values
1671  * are sorted in ascending order, otherwise, descending order.
1672  *
1673  * Example:
1674  * \code
1675  * auto sa = gl_sarray({3,2,1});
1676  * std::cout << sa.sort();
1677  * \endcode
1678  *
1679  * Produces output:
1680  * \code{.txt}
1681  * dtype: int
1682  * Rows: 3
1683  * [1, 2, 3]
1684  * \endcode
1685  *
1686  */
1687  gl_sarray sort(bool ascending=true) const;
1688 
1689 
1690  /**
1691  *
1692  * This returns an SArray with each element sliced accordingly to the
1693  * slice specified.
1694  *
1695  * \param start The start position of the slice
1696  * \param stop The stop position of the slice
1697  * \param step The step size of the slice (default = 1)
1698  *
1699  * \return an SArray with each individual vector/string/list sliced
1700  * according to the arguments.
1701  *
1702  * This is conceptually equivalent to the python equivalent of:
1703  * \code
1704  * g.apply(lambda x: x[start:stop:step])
1705  * \endcode
1706  *
1707  * The SArray must be of type list, vector, or string.
1708  *
1709  * For instance:
1710  * \code
1711  * auto g = gl_sarray({"abcdef","qwerty"});
1712  * std::cout << g.subslice(0, 2);
1713  * \endcode
1714  *
1715  * Produces output:
1716  * \code{.txt}
1717  * dtype: str
1718  * Rows: 2
1719  * ["ab", "qw"]
1720  * \endcode
1721  *
1722  * Negative indices:
1723  * \code
1724  * std::cout << g.subslice(3,-1);
1725  * \endcode
1726  * Produces output:
1727  * \code{.txt}
1728  * dtype: str
1729  * Rows: 2
1730  * ["de", "rt"]
1731  * \endcode
1732  *
1733  * Arrays:
1734  * \code
1735  * g = gl_sarray({{1,2,3}, {4,5,6}});
1736  * std::cout << g.subslice(0, 1);
1737  * \endcode
1738  *
1739  * Produces output:
1740  * \code{.txt}
1741  * dtype: array
1742  * Rows: 2
1743  * [[1], [4]]
1744  * \endcode
1745  */
1748  flexible_type step = FLEX_UNDEFINED) const;
1749 
1750 /**
1751  *
1752  * An abstraction to perform cumulative aggregates.
1753  * y <- x.cumulative_aggregate(f, w_0)
1754  *
1755  * The abstraction is as follows:
1756  * y[i+1], w[i+1] = func(x[i], w[i])
1757  * where w[i] is some arbitrary state.
1758  *
1759  * \param[in] aggregator Built in aggregate to use (e.g., sum, min, max, etc.)
1760  * \return SArray
1761  *
1762  * \code
1763  * auto sa = gl_sarray({1, 2, 3, 4, 5});
1764  * sa.cumulative_aggregate(std::make_shared<groupby_operators::sum>());
1765  * \endcode
1766  *
1767  * produces an SArray that looks like the following:
1768  * \code{.txt}
1769  * dtype: int
1770  * [1, 3, 6, 10, 15]
1771  * \endcode
1772  *
1773  */
1775  std::shared_ptr<group_aggregate_value> aggregator) const;
1776  gl_sarray builtin_cumulative_aggregate(const std::string& name) const;
1777 
1778  /**
1779  *
1780  * This returns an SArray where each element is a cumulative aggregate of
1781  * all its previous elements. Only works in an SArray of numeric type or
1782  * numeric-array types.
1783  *
1784  * \return an SArray
1785  *
1786  * \code
1787  * auto sa = gl_sarray({1, 2, 3, 4, 5});
1788  * std::cout << sa.cumulative_sum();
1789  * \endcode
1790  *
1791  * produces an SArray that looks like the following:
1792  * \code{.txt}
1793  * dtype: int
1794  * [1, 3, 6, 10, 15]
1795  * \endcode
1796  */
1797  gl_sarray cumulative_sum() const;
1798  gl_sarray cumulative_min() const;
1799  gl_sarray cumulative_max() const;
1800  gl_sarray cumulative_var() const;
1801  gl_sarray cumulative_std() const;
1802  gl_sarray cumulative_avg() const;
1803 
1804  /**
1805  * Apply an aggregate function over a moving window.
1806  *
1807  * \note The SArray this is called on is the input (expects to be materialized).
1808  * \param fn_name string representation of the aggregation function to use.
1809  * The mapping is the same string mapping used by the groupby aggregate
1810  * function.
1811  * \param start The start of the moving window relative to the current
1812  * value being calculated, inclusive. For example, 2 values behind the current
1813  * would be -2, and 0 indicates that the start of the window is the current
1814  * value.
1815  * \param end The end of the moving window relative to the current value
1816  * being calculated, inclusive. Must be greater than or equal to `start`. For
1817  * example, 0 would indicate that the current value is the end of the window,
1818  * and 2 would indicate that the window ends at 2 data values after the
1819  * current.
1820  * \param min_observations The minimum allowed number of non-NULL values in the
1821  * moving window for the emitted value to be non-NULL. size_t(-1) indicates
1822  * that all values must be non-NULL.
1823  *
1824  * Returns an SArray of the same length as the input, with a type that matches
1825  * the type output by the aggregation function.
1826  *
1827  * Throws an exception if:
1828  * - end < start
1829  * - The window size is excessively large (currently hardcoded to UINT_MAX).
1830  * - The given function name corresponds to a function that will not operate
1831  * on the data type of the input SArray.
1832  * - The aggregation function returns more than one non-NULL type.
1833  *
1834  * Example:
1835  * \code
1836  * gl_sarray a{0,1,2,3,4,5,6,7,8,9};
1837  * // Moving window encompasses 3 values behind current and current value.
1838  * auto result = a.builtin_rolling_apply(std::string("__builtin__avg__"), -3, 0);
1839  * \endcode
1840  *
1841  * Produces an SArray with these values:
1842  * \code
1843  * {NULL,NULL,NULL,1.5,2.5,3.5,4.5,5.5,6.5,7.5}
1844  * \endcode
1845  */
1846  gl_sarray builtin_rolling_apply(const std::string &fn_name,
1847  ssize_t start,
1848  ssize_t end,
1849  size_t min_observations=size_t(-1)) const;
1850 
1851  /**
1852  * Show a visualization of the SArray.
1853  */
1854  void show(const std::string& path_to_client, const flexible_type& title, const flexible_type& xlabel, const flexible_type& ylabel) const;
1855 
1856  /**
1857  * Return a visualization of the SArray.
1858  */
1859  std::shared_ptr<visualization::Plot> plot(const flexible_type& title, const flexible_type& xlabel, const flexible_type& ylabel) const;
1860 
1861  /**
1862  * \internal
1863  * Gets the internal implementation object.
1864  */
1865  virtual std::shared_ptr<unity_sarray> get_proxy() const;
1866 
1867  private:
1868  void instantiate_new();
1869 
1870  void ensure_has_sarray_reader() const;
1871 
1872  std::shared_ptr<unity_sarray> m_sarray;
1873 
1874  mutable std::shared_ptr<sarray_reader<flexible_type> > m_sarray_reader;
1875 
1876 }; // gl_sarray
1877 
1878 
1879 
/**
 * Arithmetic and comparison operators taking a \ref flexible_type as the left
 * operand and a gl_sarray as the right operand. Each one returns a new
 * gl_sarray.
 *
 * NOTE(review): presumably the scalar is combined with every element of the
 * array (the mirror image of the gl_sarray member operators) -- confirm
 * against the implementation.
 */
1880 gl_sarray operator+(const flexible_type& opnd, const gl_sarray& opnd2);
1881 gl_sarray operator-(const flexible_type& opnd, const gl_sarray& opnd2);
1882 gl_sarray operator*(const flexible_type& opnd, const gl_sarray& opnd2);
1883 gl_sarray operator/(const flexible_type& opnd, const gl_sarray& opnd2);
1884 gl_sarray operator<(const flexible_type& opnd, const gl_sarray& opnd2);
1885 gl_sarray operator>(const flexible_type& opnd, const gl_sarray& opnd2);
1886 gl_sarray operator<=(const flexible_type& opnd, const gl_sarray& opnd2);
1887 gl_sarray operator>=(const flexible_type& opnd, const gl_sarray& opnd2);
1888 gl_sarray operator==(const flexible_type& opnd, const gl_sarray& opnd2);
1889 
1890 
1891 /**
1892  * Provides printing of the gl_sarray.
1893  */
1894 std::ostream& operator<<(std::ostream& out, const gl_sarray& other);
1895 
1896 
1897 
1898 
1899 
1900 
1901 
1902 /**
1903  * \ingroup group_glsdk
1904  * A range object providing one pass iterators over part or all of a gl_sarray.
1905  * See \ref gl_sarray::range_iterator for usage examples.
1906  *
1907  * \see gl_sarray::range_iterator
1908  */
1910  public:
1911  /// content type
1913 
1914  gl_sarray_range(std::shared_ptr<sarray_reader<flexible_type> > m_sarray_reader,
1915  size_t start, size_t end);
1916  gl_sarray_range(const gl_sarray_range&) = default;
1917  gl_sarray_range(gl_sarray_range&&) = default;
1918  gl_sarray_range& operator=(const gl_sarray_range&) = default;
1920 
1921  /// Iterator type
 ///
 /// A read-only iterator over the flexible_type values of a gl_sarray_range.
 /// It is built on boost::iterator_facade with
 /// boost::single_pass_traversal_tag, so it only supports one forward pass.
1922  struct iterator:
1923  public boost::iterator_facade<iterator,
1924  const flexible_type, boost::single_pass_traversal_tag> {
1925  public:
 // Default construction, copy and move are all compiler-generated.
1926  iterator() = default;
1927  iterator(const iterator&) = default;
1928  iterator(iterator&&) = default;
1929  iterator& operator=(const iterator&) = default;
1930  iterator& operator=(iterator&&) = default;
1931 
 /// Creates an iterator attached to \p range.
 /// NOTE(review): is_start presumably selects between the begin position
 /// (true) and the end position (false) -- confirm in the implementation.
1932  iterator(gl_sarray_range& range, bool is_start);
1933  private:
 // The hooks below are private; iterator_core_access is befriended so that
 // boost::iterator_facade can call them.
1934  friend class boost::iterator_core_access;
1935  void increment();
1936  void advance(size_t n);
 // Equality is decided by the position counter alone; m_owner is not
 // compared.
1937  inline bool equal(const iterator& other) const {
1938  return m_counter == other.m_counter;
1939  }
 // `type` is the enclosing range's content typedef (flexible_type).
1940  const type& dereference() const;
 // Position counter used by equal().
1941  size_t m_counter = 0;
 // Raw pointer to a gl_sarray_range, null by default.
 // NOTE(review): presumably the range this iterator was created from; the
 // begin()/end() docs say iterators are invalidated when the parent range
 // is destroyed -- confirm.
1942  gl_sarray_range* m_owner = NULL;
1943  };
1944 
1945  /// const_iterator type
1947 
1948  /**
1949  * Returns an iterator to the start of the range.
1950  * Once the iterator is advanced, later calls to begin() have undefined
1951  * behavior.
1952  *
1953  * The returned iterator is invalidated once the parent range_iterator is
1954  * destroyed.
1955  */
1956  iterator begin();
1957 
1958  /**
1959  * Returns an iterator to the end of the range.
1960  *
1961  * The returned iterator is invalidated once the parent range_iterator is
1962  * destroyed.
1963  */
1964  iterator end();
1965  private:
1966  flexible_type m_current_value;
1967  std::shared_ptr<sarray_reader_buffer<flexible_type> > m_sarray_reader_buffer;
1968 };
1969 
1970 
1971 /**
1972  * Utility function to infer the most general type of an in memory vector of
1973  * flexible_types.
1974  */
1975 flex_type_enum infer_type_of_list(const std::vector<flexible_type>& vec);
1976 
1977 
1978 class gl_sarray_writer_impl;
1979 
1980 /**
1981  * \ingroup group_glsdk
1982  * Provides the ability to write gl_sarrays.
1983  * The gl_sarray is internally cut into a collection of segments. Each segment
1984  * can be written to independently, and the resultant SArray is the effective
1985  * concatenation of all the segments.
1986  *
1987  * \code
1988  * // Writes an integer SArray of 4 segments.
1989  * gl_sarray_writer writer(flex_type_enum::INTEGER, 4);
1990  *
1991  * // for each segment, write a bunch of 10 values.
1992  * // segment 0 has 10 0's,
1993  * // segment 1 has 10 1's,
1994  * // etc
1995  * for (size_t seg = 0;seg < 4; ++seg) {
1996  * for (size_t i = 0;i < 10; ++i) {
1997  * writer.write(seg, seg);
1998  * }
1999  * }
2000  *
2001  * gl_sarray sa = writer.close();
2002  * // sa is now an SArray of 40 elements comprising
2003  * // the sequence 10 0's, 10 1's, 10 2's, 10 3's
2004  * \endcode
2005  *
2006  * Different segments can be written safely in parallel. It is not safe to
2007  * write to the same segment simultaneously.
2008  */
2010  public:
2011  /**
2012  * Constructs a writer to write a gl_sarray of a particular type.
2013  *
2014  * \param type The content type of the SArray. Everything written to the
2015  * writer (via \ref write) must be of that type, is implicitly castable to
2016  * that type, or is a missing value denoted with a FLEX_UNDEFINED value.
2017  *
2018  * \param num_segments Optional. The number of segments of the SArray.
2019  * Adjusting this parameter has little performance impact on the resultant
2020  * gl_sarray. Modifying this value is only helpful for providing writer
2021  * parallelism. Defaults to the number of cores on the machine.
2022  */
2023  gl_sarray_writer(flex_type_enum type, size_t num_segments = (size_t)(-1));
2024 
2025  /**
2026  * Writes a single value to a given segment.
2027  *
2028  * For instance,
2029  * \code
2030  * gl_sarray_writer writer(flex_type_enum::FLOAT, 1);
2031  * writer.write(1.5, 0); // writes the value 1.5 to segment 0
2032  * writer.write(1, 0); // writes the value 1.0 to segment 0 (integer can be cast to float)
2033  * \endcode
2034  *
2035  * Strings are the most general type and everything can be cast to them. Hence:
2036  * \code
2037  * gl_sarray_writer writer(flex_type_enum::STRING, 1);
2038  * writer.write("hello", 0); // writes the value "hello" to segment 0
2039  * writer.write(1.5, 0); // writes the value "1.5" to segment 0
2040  * writer.write(1, 0); // writes the value "1" to segment 0
2041  * \endcode
2042  *
2043  * Different segments can be written safely in parallel. It is not safe to
2044  * write to the same segment simultaneously.
2045  *
2046  * \param f The value to write. This value should be of the requested type
2047  * (as set in the constructor), or is castable to the requested type, or is
2048  * FLEX_UNDEFINED.
2049  *
2050  * \param segmentid The segment to write to.
2051  */
2052  void write(const flexible_type& f, size_t segmentid);
2053 
2054  /**
2055  * Writes a range of values to a given segment.
2056  *
2057  * Essentially equivalent to:
2058  * \code
2059  * while(begin != end) write(*begin++, segmentid);
2060  * \endcode
2061  *
2062  * Different segments can be written safely in parallel. It is not safe to
2063  * write to the same segment simultaneously.
2064  *
 * \tparam T Iterator type. It must support comparison with !=, dereference
 * with unary *, and pre-increment; the dereferenced value must be accepted
 * by the single-value write overload.
 *
2065  * \param begin The start iterator of the range to write.
2066  *
2067  * \param end The end iterator of the range to write.
2068  *
2069  * \param segmentid The segment to write to.
2070  */
2071  template <typename T>
2072  void write(T begin, T end, size_t segmentid) {
 // Forward the elements one at a time, in order, to the single-value
 // write overload for the same segment.
2073  while (begin != end) {
2074  write((*begin), segmentid);
2075  ++begin;
2076  }
2077  }
2078 
2079  /**
2080  * Stops all writes and returns the resultant SArray.
2081  */
2082  gl_sarray close();
2083 
2084  /**
2085  * Returns the number of segments of the SArray; this is the same value
2086  * provided on construction of the writer.
2087  */
2088  size_t num_segments() const;
2089 
2090  ~gl_sarray_writer();
2091 
2092  private:
2093  std::unique_ptr<gl_sarray_writer_impl> m_writer_impl;
2094 
2095 };
2096 
2097 } // namespace turi
2098 #endif
gl_sarray astype(flex_type_enum dtype, bool undefined_on_failure=true) const
size_t num_missing() const
gl_sarray append(const gl_sarray &other) const
gl_sarray dropna() const
gl_sarray()
Constructs an empty SArray.
void save(const std::string &directory, const std::string &format="binary") const
gl_sarray dict_trim_by_values(const flexible_type &lower=FLEX_UNDEFINED, const flexible_type &upper=FLEX_UNDEFINED) const
gl_sarray clip_upper(flexible_type threshold) const
void write(T begin, T end, size_t segmentid)
Definition: gl_sarray.hpp:2072
static gl_sarray from_sequence(size_t start, size_t end, bool reverse=false)
gl_sarray apply(std::function< flexible_type(const flexible_type &)> fn, flex_type_enum dtype, bool skip_undefined=true) const
gl_sarray dict_values() const
gl_sarray fillna(flexible_type value) const
flexible_type std() const
gl_sarray hash(size_t seed=0) const
gl_sarray sort(bool ascending=true) const
gl_sarray filter(std::function< bool(const flexible_type &)> fn, bool skip_undefined=true) const
size_t nnz() const
bool any() const
std::set< Key > keys(const std::map< Key, T > &map)
Definition: stl_util.hpp:358
gl_sarray clip_lower(flexible_type threshold) const
flexible_type operator[](int64_t i) const
static std::ostream & operator<<(std::ostream &out, const uint128_t &x)
Enables printing of uint128_t values.
iterator const_iterator
const_iterator type
Definition: gl_sarray.hpp:1946
flexible_type min() const
flex_type_enum infer_type_of_list(const std::vector< flexible_type > &vec)
flexible_type max() const
gl_sarray datetime_to_str(const std::string &str_format="%Y-%m-%dT%H:%M:%S%ZP") const
gl_sarray cumulative_sum() const
gl_sarray builtin_rolling_apply(const std::string &fn_name, ssize_t start, ssize_t end, size_t min_observations=size_t(-1)) const
gl_sarray dict_trim_by_keys(const std::vector< flexible_type > &keys, bool exclude=true) const
std::shared_ptr< visualization::Plot > plot(const flexible_type &title, const flexible_type &xlabel, const flexible_type &ylabel) const
gl_sarray clip(flexible_type lower=FLEX_UNDEFINED, flexible_type upper=FLEX_UNDEFINED) const
gl_sarray item_length() const
virtual std::shared_ptr< unity_sarray > get_proxy() const
gl_sarray dict_keys() const
gl_sarray pixel_array_to_image(size_t width, size_t height, size_t channels=3, bool undefined_on_failure=true) const
gl_sarray & operator=(const gl_sarray &)
Copy Assignment.
gl_sarray contains(const flexible_type &other) const
gl_sframe split_datetime(const std::string &column_name_prefix="X", const std::vector< std::string > &limit={"year","month","day","hour","minute","second"}, bool tzone=false) const
void show(const std::string &path_to_client, const flexible_type &title, const flexible_type &xlabel, const flexible_type &ylabel) const
gl_sarray tail(size_t n) const
static gl_sarray from_const(const flexible_type &value, size_t size)
gl_sarray count_words(bool to_lower=true, turi::flex_list delimiters={"\r", "\v", "\n", "\f", "\t", " "}) const
gl_sarray sample(double fraction) const
bool all() const
gl_sarray topk_index(size_t topk=10, bool reverse=false) const
flexible_type type
content type
Definition: gl_sarray.hpp:1912
gl_sarray dict_has_any_keys(const std::vector< flexible_type > &keys) const
gl_sarray str_to_datetime(const std::string &str_format="%Y-%m-%dT%H:%M:%S%ZP") const
gl_sarray_range range_iterator(size_t start=0, size_t end=(size_t)(-1)) const
std::set< T > values(const std::map< Key, T > &map)
Definition: stl_util.hpp:386
flexible_type sum() const
static gl_sarray read_json(const std::string &url)
gl_sarray head(size_t n) const
gl_sarray dict_has_all_keys(const std::vector< flexible_type > &keys) const
gl_sframe unpack(const std::string &column_name_prefix="X", const std::vector< flex_type_enum > &column_types=std::vector< flex_type_enum >(), const flexible_type &na_value=FLEX_UNDEFINED, const std::vector< flexible_type > &limit=std::vector< flexible_type >()) const
void materialize_to_callback(std::function< bool(size_t, const std::shared_ptr< sframe_rows > &)> callback, size_t nthreads=(size_t)(-1))
gl_sarray subslice(flexible_type start=FLEX_UNDEFINED, flexible_type stop=FLEX_UNDEFINED, flexible_type step=FLEX_UNDEFINED) const
gl_sarray count_ngrams(size_t n=2, std::string method="word", bool to_lower=true, bool ignore_space=true) const
static flexible_type FLEX_UNDEFINED
flexible_type mean() const
flex_type_enum dtype() const
std::vector< flexible_type > flex_list
void materialize() const
bool empty() const
gl_sarray unique() const
bool is_materialized() const
gl_sarray cumulative_aggregate(std::shared_ptr< group_aggregate_value > aggregator) const
size_t size() const