Turi Create  4.0
unity_sarray.hpp
1 /* Copyright © 2017 Apple Inc. All rights reserved.
2  *
3  * Use of this source code is governed by a BSD-3-clause license that can
4  * be found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause
5  */
6 #ifndef TURI_UNITY_SARRAY_HPP
7 #define TURI_UNITY_SARRAY_HPP
8 
9 #include <vector>
10 #include <memory>
11 #include <core/data/flexible_type/flexible_type.hpp>
12 #include <model_server/lib/api/unity_sarray_interface.hpp>
13 #include <visualization/server/plot.hpp>
14 
15 namespace turi {
16 
17 // forward declarations
18 template <typename T>
19 class sarray;
20 template <typename T>
21 class sarray_reader;
22 template <typename T>
23 class sarray_iterator;
24 
25 namespace query_eval {
26 struct planner_node;
27 } // query_eval
28 
29 /**
30  * This is the SArray object exposed to Python. Abstractly, it stores a
31  * single column of a flexible_type. An Sarray represents a single immutable
32  * column: i.e. once created, it cannot be modified.
33  *
34  * Internally, it is represented as a single shared_ptr to an
35  * \ref "sarray<flexible_type>" sarray object. We delay construction of the
36  * internal sarray object until a "construct" call is made. This allows the
37  * class to be used in the following way:
38  *
39  * \code
40  * unity_sarray array;
41  * // creates the array
42  * array.construct(...);
43  * // now the array is immutable.
44  * \endcode
45  *
46  * Multiple different construct functions can then be used to create sarrays
47  * from different sources: some sources may require the sarray to download
48  * files, etc.
49  *
50  * The SArray may require temporary on disk storage which will be deleted when
51  * the SArray is deleted. The temporary file names are obtained from
52  * \ref turi::get_temp_name
53  */
54 class unity_sarray: public unity_sarray_base {
55 
56  public:
57  /** Default Constructor.
58  * Does nothing basically. Use one of the construct_from_* functions to
59  * construct the contents of the SArray.
60  */
61  unity_sarray();
62 
63  /// Destructor. Deletes all temporary sarray files created.
64  ~unity_sarray();
65 
66  unity_sarray(const unity_sarray& other);
67 
68  unity_sarray& operator=(const unity_sarray& other);
69 
70  /**
71  * Constructs an Sarray from an in memory vector.
72  * If the current object is already storing an array, it is cleared
73  * (\ref clear()). May throw an exception on failure. If an exception occurs,
74  * the contents of SArray is empty.
75  */
76  void construct_from_vector(const std::vector<flexible_type>& vec, flex_type_enum type);
77 
78  /**
79  * Constructs a unity_sarray from an existing sarray.
80  * This simply sets this class's shared_ptr to the one given by the parameter.
81  */
82  void construct_from_sarray(std::shared_ptr<sarray<flexible_type>> s_ptr);
83 
84  /**
85  * Constructs a unity_sarray from a const value.
86  */
87  void construct_from_const(const flexible_type& value, size_t size,
89 
90  /**
91  * Constructs a unity_sarray from a parallel iterator generator.
92  */
93  void construct_from_planner_node(std::shared_ptr<query_eval::planner_node> node);
94 
95 
96  /**
97  * Constructs an Sarray from an existing directory on disk saved with
97  * save_array() or an on-disk sarray prefix (saved with
99  * save_array_by_index_file()). This function will automatically detect if
100  * the location is a directory, or a file. The files will not be deleted on
101  * destruction. If the current object is already storing an array, it is
102  * cleared (\ref clear()). May throw an exception on failure. If an exception
103  * occurs, the contents of SArray is empty.
104  */
105  void construct_from_sarray_index(std::string location);
106 
107  /**
108  * Constructs an Sarray from a url. Each line of the file will be a row in the
109  * resultant SArray, and each row will be of string type.
110  * If the current object is already storing an array, it is cleared
110  * (\ref clear()). May throw an exception on failure. If an exception occurs,
112  * the contents of SArray is empty.
113  */
114  void construct_from_files(std::string url, flex_type_enum type);
115 
116  /**
117  * Constructs an SArray from one or more json record files.
118  *
119  * Each json record file contains an array of dictionaries.
120  * Resultant SArray is of dictionary type.
121  */
122  void construct_from_json_record_files(std::string url);
123 
124  /**
125  * Given a URL, this function attempts to autodetect if it should
126  * - treat it as a .sidx file and load an SArray from it (construct_from_sarray_index)
127  * - treat it as a file to read line by line (construct_from_files)
128  * - treat it as a directory and load an SArray from it (construct_from_sarray_index)
129  */
130  void construct_from_autodetect(std::string url, flex_type_enum type);
131 
132  /**
133  * Saves a copy of the current sarray into a directory.
134  * Does not modify the current sarray
135  */
136  void save_array(std::string target_directory);
137 
138 
139  /**
140  * Saves a copy of the current sarray into a target location defined by
140  * an index file. Does not modify the current sarray.
142  */
143  void save_array_by_index_file(std::string index_file);
144 
145  /**
146  * Clears the contents of the SArray, deleting all temporary files if any.
147  */
148  void clear();
149 
150  /**
151  * Returns the number of rows in the SArray. Or 0 if the SArray is empty.
152  */
153  size_t size();
154 
155  /**
156  * Returns true if size is obtainable efficiently.
157  */
158  bool has_size();
159 
160  /**
161  * Obtains the underlying sarray pointer.
162  * TODO: will slowly move away all users of this function to get_lazy_sarray
163  */
164  std::shared_ptr<sarray<flexible_type> > get_underlying_sarray();
165 
166  /**
167  * Returns the underlying planner pointer
168  */
169  std::shared_ptr<query_eval::planner_node> get_planner_node();
170 
171  /**
172  * Returns some number of rows of the SArray
173  *
174  * NOTE: If there are more elements asked for than can fit into
175  * memory, this makes no attempt to stop crashing your computer.
176  */
177  std::shared_ptr<unity_sarray_base> head(size_t nrows);
178 
179  /**
180  * Test helper: same as head(), but returns the rows as a
     * std::vector<flexible_type> rather than as an SArray handle.
     *
     * \param nrows Number of rows requested; forwarded unchanged to head().
     * \returns The result of calling to_vector() on the array returned by head().
181  */
182  std::vector<flexible_type> _head(size_t nrows) {
183  auto result = head(nrows);  // shared_ptr<unity_sarray_base> produced by head()
184  auto ret = result->to_vector();  // materialize those rows as a vector
185  return ret;
186  }
187 
188  /**
189  * Returns the type name of the SArray
190  */
191  flex_type_enum dtype();
192 
193  /**
194  * Returns a new sarray which is a transform of this using a Python lambda
195  * function pickled into a string.
196  */
197  std::shared_ptr<unity_sarray_base> transform(const std::string& lambda,
198  flex_type_enum type,
199  bool skip_undefined,
200  uint64_t seed);
201 
202  /**
203  * Returns a new sarray which is a transform of this using a registered
204  * toolkit function.
205  */
206  std::shared_ptr<unity_sarray_base> transform_native(
207  const function_closure_info& closure,
208  flex_type_enum type,
209  bool skip_undefined,
210  uint64_t seed);
211 
212  std::shared_ptr<unity_sarray_base> transform_lambda(std::function<flexible_type(const flexible_type&)> lambda,
213  flex_type_enum type,
214  bool skip_undefined,
215  uint64_t seed);
216 
217  /**
218  * Append all rows from "other" sarray to "this" sarray and returns a new sarray
219  * that contains all rows from both sarrays
220  */
221  std::shared_ptr<unity_sarray_base> append(std::shared_ptr<unity_sarray_base> other);
222 
223  /**
224  * If this sarray contains vectors, this returns a new sarray comprising of a
225  * vertical slice of the vector from position start (inclusive) to position
225  * end (exclusive). Throws an exception if the sarray is not a vector.
227  *
227  * If end==(start+1), the output is an SArray of doubles. If end > start,
229  * the output is an SArray of vectors, each of length (end - start).
230  * If a vector cannot be sliced (for instance the length of the vector is less
231  * than end), the resultant value will be UNDEFINED.
232  *
232  * End must be greater than start. Throws an exception otherwise.
234  */
235  std::shared_ptr<unity_sarray_base> vector_slice(size_t start, size_t end);
236 
237 
238  /**
238  * Returns a new SArray which is filtered by the given lambda function.
240  * If the lambda evaluates an element to true, this element is copied to the
241  * new SArray. If not, it isn't. Throws an exception if the return type
242  * of the lambda is not castable to a boolean value.
243  */
244  std::shared_ptr<unity_sarray_base> filter(const std::string& lambda, bool skip_undefined, uint64_t seed);
245 
246 
247  /**
248  * Returns a new SArray which is filtered by a given logical column.
249  * The index array must be the same length as the current array. An output
249  * array is returned containing only the elements of the current array whose
250  * corresponding element in the index array evaluates to true.
252  */
253  std::shared_ptr<unity_sarray_base> logical_filter(std::shared_ptr<unity_sarray_base> index);
254 
255  /**
256  * Returns a new SArray which has the top k elements selected.
257  * k should be reasonably small. O(k) memory required.
258  *
258  * If reverse is true, the bottom k is returned instead.
260  */
261  std::shared_ptr<unity_sarray_base> topk_index(size_t k, bool reverse);
262 
263 
264  /**
265  * Returns true if all the values in the sarray are non-zero / non-empty. An
266  * empty array returns true.
267  */
268  bool all();
269 
270  /**
271  * Returns true if any value in the sarray is non-zero / non-empty. An
272  * empty array returns false.
273  */
274  bool any();
275 
276  /**
277  * Creates a new SArray with the datetime values casted to string.
278  *
279  * "format" determines the string format for the output SArray.
280  */
281  std::shared_ptr<unity_sarray_base> datetime_to_str(std::string format);
282 
283  /**
284  * Creates a new SArray with the string values casted to datetime.
285  *
286  * "format" determines the string format for the input SArray.
287  */
288  std::shared_ptr<unity_sarray_base> str_to_datetime(std::string format);
289 
290  /**
291  * Creates a new SArray with the same values as current one, but casted to
292  * the given type.
293  *
294  * If undefined_on_failure is set, cast failures do not cause errors, but
295  * become undefined values.
296  *
297  * \see lazy_astype
298  */
299  std::shared_ptr<unity_sarray_base> astype(flex_type_enum dtype, bool undefined_on_failure = false);
300 
301  /**
302  * Creates a new SArray with the same values as current one, but casted to
303  * the given type. Performed lazily.
304  *
305  * If undefined_on_failure is set, cast failures do not cause errors, but
306  * become undefined values.
307  * \see astype
308  */
309  std::shared_ptr<unity_sarray_base> lazy_astype(flex_type_enum dtype,
310  bool undefined_on_failure = false);
311 
312  /**
313  * Creates a new SArray with the same values as the current one, except
314  * any values above or below the given bounds are changed to be equal
315  * to the bound.
316  *
317  * If lower or upper are given a flex_undefined(), this is interpreted
318  * to mean that there is no bound there. For example,
319  * clip(flex_undefined(), 25) clips with no lower bound and an upper bound
320  * of 25.
321  */
322  std::shared_ptr<unity_sarray_base> clip(flexible_type lower = flex_undefined(),
323  flexible_type upper = flex_undefined());
324 
325  /**
326  * Returns the largest element in the sarray. An empty array returns
327  * flex_undefined, which in python is numpy.nan. Only works for INTEGER
328  * and FLOAT. Throws an exception if invoked on an sarray of any other type.
329  * Undefined values in the array are skipped.
330  */
331  flexible_type max();
332 
333  /**
334  * Returns the smallest element in the sarray. An empty array returns
335  * flex_undefined, which in python is numpy.nan. Only works for INTEGER
336  * and FLOAT. Throws an exception if invoked on an sarray of any other type.
337  * Undefined values in the array are skipped.
338  */
339  flexible_type min();
340 
341  /**
341  * Returns the sum of all elements in the sarray. An empty array returns
343  * flex_undefined, which in python is numpy.nan. Only works for INTEGER
344  * and FLOAT. Throws an exception if invoked on an sarray of any other type.
345  * Overflows without shame.
346  * Undefined values in the array are skipped.
347  */
348  flexible_type sum();
349 
350  /**
351  * Returns the mean of the elements in sarray as a flex_float.
352  *
353  * Invoking on an empty sarray returns flex_undefined.
354  * Invoking on a non-numeric type throws an exception.
355  * Undefined values in the array are skipped.
356  */
357  flexible_type mean();
358 
359  /**
359  * Returns the median of the elements in the sarray.
361  *
362  * Invoking on an empty sarray returns flex_undefined.
363  * Invoking on a non-numeric type throws an exception.
364  * Undefined values in the array are skipped.
365  */
366  flexible_type median(bool approx);
367 
368  /**
369  * Returns the standard deviation of the elements in sarray as a flex_float.
370  *
371  * \param ddof ...stands for "delta degrees of freedom". Adjusts the degrees
372  * of freedom in the variance calculation. If ddof=0, there are N degrees
373  * of freedom, with N being the number of elements in the sarray.
374  *
375  * Throws an exception if:
376  * ddof >= sarray size
377  * sarray is of a non-numeric type
378  *
379  * Returns flex_undefined if executed on empty or non-existent sarray.
380  * Undefined values in the array are skipped.
381  */
382  flexible_type std(size_t ddof=0);
383 
384  /**
385  * Returns the variance of the elements in sarray as a flex_float.
386  *
387  * \param ddof ...stands for "delta degrees of freedom". Adjusts the degrees
388  * of freedom in the variance calculation. If ddof=0, there are N degrees
389  * of freedom, with N being the number of elements in the sarray.
390  *
391  * Throws an exception if:
392  * ddof >= sarray size
393  * sarray is of a non-numeric type
394  *
395  * Returns flex_undefined if executed on empty or non-existent SArray.
396  * Undefined values in the array are skipped.
397  */
398  flexible_type var(size_t ddof=0);
399 
400 
401  /**
402  * Returns the number of missing values in the SArray.
403  */
404  size_t num_missing();
405 
406  /**
407  * Returns the number of non-zero elements in the array.
408  * Functionally equivalent to
409  * \code
410  * nonzero().length()
411  * \endcode
412  * But takes much less memory.
413  */
414  size_t nnz();
415 
416 
417  /**
418  * Performs the equivalent of array [op] other , where other is a scalar value.
419  * The operation must be one of the following: "+", "-", "*", "/", "<", ">",
420  * "<=", ">=", "==", "!=",'%','in'. The type of the new array is dependent on the
421  * semantics of the operation.
422  * - comparison operators always return integers
423  * - +,-,* of integer against integers always return integers
424  * - / of integer against integer always returns floats
425  * - +,-,*,/ of floats against floats always return floats
426  * - +,-,*,/ of integer against floats or floats against integers
427  * always return floats.
428  * - %. integer against integers -> integers
429  * - "in". must be string against string, and returns a boolean.
430  *
431  * This function throws a string exception if there is a type mismatch (
432  * for instance you cannot add a string value to an integer array), or if
433  * the operation is invalid.
434  *
435  * UNDEFINED values in the array are ignored.
436  *
437  * On success, a new array is returned. The new array is the same length and
438  * has the same segment structure.
439  */
440  std::shared_ptr<unity_sarray_base> left_scalar_operator(flexible_type other,
441  std::string op);
442 
443  /**
444  * Performs the equivalent of other [op] array, where other is a scalar value.
445  * The operation must be one of the following: "+", "-", "*", "/", "<", ">",
446  * "<=", ">=", "==", "!=". The type of the new array is dependent on the
447  * semantics of the operation.
448  * - comparison operators always return integers
449  * - +,-,* of integer against integers always return integers
450  * - / of integer against integer always returns floats
451  * - +,-,*,/ of floats against floats always return floats
452  * - +,-,*,/ of integer against floats or floats against integers
453  * always return floats.
454  *
455  * This function throws a string exception if there is a type mismatch (
456  * for instance you cannot add a string value to an integer array), or if
457  * the operation is invalid.
458  *
459  * UNDEFINED values in the array are ignored.
460  *
461  * On success, a new array is returned. The new array is the same length and
462  * has the same segment structure.
463  */
464  std::shared_ptr<unity_sarray_base> right_scalar_operator(flexible_type other, std::string op);
465 
466 
467 
468  /**
469  * Performs the equivalent of array [op] other, where other is an SArray.
470  * The operation must be one of the following: "+", "-", "*", "/", "<", ">",
471  * "<=", ">=", "==", "!=". The type of the new array is dependent on the
472  * semantics of the operation.
473  * - comparison operators always return integers
474  * - +,-,* of integer against integers always return integers
475  * - / of integer against integer always returns floats
476  * - +,-,*,/ of floats against floats always return floats
477  * - +,-,*,/ of integer against floats or floats against integers
478  * always return floats.
479  *
480  * This function throws a string exception if there is a type mismatch (
481  * for instance you cannot add a string value to an integer array), or if
482  * the operation is invalid.
483  *
484  * UNDEFINED values in the array are ignored.
485  *
486  * On success, a new array is returned. The new array is the same length and
487  * has the same segment structure.
488  */
489  std::shared_ptr<unity_sarray_base> vector_operator(
490  std::shared_ptr<unity_sarray_base> other, std::string op);
491 
492  /**
493  * Returns a new array with all UNDEFINED values removed.
494  * A new array is returned with the same type as the current array, but
495  * potentially shorter. If the array has no missing values, the output array
496  * has the same length and the same segment structure as this array.
497  */
498  std::shared_ptr<unity_sarray_base> drop_missing_values();
499 
500  /**
501  * Returns a new integer typed array indicating the presence of a missing value or float NA
502  * in the corresponding element.
503  *
504  * If recursive is true, then it also checks if a NA is present in any element of a recursive type.
505  *
506  * If missing_is_true is true, then the array contains a 1 if the element is a
507  * missing value and a 0 if it is not; otherwise, it returns 1 on the presence of a na.
508  */
509  std::shared_ptr<unity_sarray_base> missing_mask(bool recursive = false, bool missing_is_true = true);
510 
511  /**
512  * Returns a new array with all UNDEFINED values replaced with the given value.
513  *
514  * Throws if the given value is not convertible to the SArray's type.
515  */
516  std::shared_ptr<unity_sarray_base> fill_missing_values(flexible_type default_value);
517 
518  /**
519  * Returns some number of rows on the end of the SArray. The values are
520  * returned in the order they were found in the SArray.
521  *
522  * NOTE: If there are more elements asked for than can fit into
523  * memory, this makes no attempt to stop crashing your computer.
524  */
525  std::shared_ptr<unity_sarray_base> tail(size_t nrows=10);
526 
527  std::vector<flexible_type> _tail(size_t nrows=10) {
     // Helper mirroring _head(): same as tail(), but returns the rows as a
     // std::vector<flexible_type>. nrows (default 10) is forwarded unchanged
     // to tail().
528  auto result = tail(nrows);  // shared_ptr<unity_sarray_base> produced by tail()
529  auto ret = result->to_vector();  // materialize those rows as a vector
530  return ret;
531  }
532 
533  /**
534  * Returns a uniform random sample of the sarray, that contains percent of
535  * the total elements, without replacement, using the random_seed.
536  */
537  std::shared_ptr<unity_sarray_base> sample(float percent, uint64_t random_seed, bool exact=false);
538 
539  /**
540  * Returns an SArray of type flex_int that contains the hash of each element.
541  * The hash function takes a seed value so this can be used for
542  * random generation as well.
543  */
544  std::shared_ptr<unity_sarray_base> hash(uint64_t seed);
545 
546  /**
547  * Do a word-count for each element in the SArray and return a SArray of dictionary
548  **/
549  std::shared_ptr<unity_sarray_base> count_bag_of_words(std::map<std::string, flexible_type> options);
550 
551  /**
552  * Do a character n-gram count for each element in the SArray and return a SArray of dictionary type.
552  * Parameter n is the number of characters in each n-gram
553  * options takes: to_lower, which makes words lower case
554  * ignore_space, which ignores spaces in calculating character n-grams
556  **/
557  std::shared_ptr<unity_sarray_base> count_character_ngrams(size_t n, std::map<std::string, flexible_type> options);
558 
559 
560  /**
560  * Do a word n-gram count for each element in the SArray and return a SArray of dictionary type.
562  * Parameter n is the number of words in each n-gram
563  * options takes: to_lower, which makes words lower case
564  **/
565  std::shared_ptr<unity_sarray_base> count_ngrams(size_t n, std::map<std::string, flexible_type> options);
566 
567  /**
568  * If SArray dtype is dict, filter out each dict by the given keys.
569  * If exclude is True, then all keys that are in the input key list are removed
570  * If exclude is False, then only keys that are in the input key list are retained
571  **/
572  std::shared_ptr<unity_sarray_base> dict_trim_by_keys(const std::vector<flexible_type>& keys, bool exclude);
573 
574  /**
575  * If SArray dtype is dict, filter out each dict by the given value boundary.
576  * all items whose value is not in the low/up bound are removed from the dictionary
577  * The boundary are included. I.e, if a value is either lower or upper bound, then
578  * the key/value pair is included in the result
579  * This function will fail if the value is not comparable
580  **/
581  std::shared_ptr<unity_sarray_base> dict_trim_by_values(const flexible_type& lower, const flexible_type& upper);
582 
583  /**
584  * If SArray dtype is dict, returns a new SArray which contains keys for input dictionary
585  * otherwise throws exception
586  **/
587  std::shared_ptr<unity_sarray_base> dict_keys();
588 
589  /**
590  * If SArray dtype is dict, returns a new SArray which contains values for input dictionary
591  * otherwise throws exception
592  **/
593  std::shared_ptr<unity_sarray_base> dict_values();
594 
595  /**
596  * If SArray dtype is dict, returns a new SArray which contains integer of 1s or 0s with 1
597  * means the original array element has at least one key in the param
598  * otherwise throws exception
599  **/
600  std::shared_ptr<unity_sarray_base> dict_has_any_keys(const std::vector<flexible_type>& keys);
601 
602  /**
603  * If SArray dtype is dict, returns a new SArray which contains integer of 1s or 0s with 1
604  * means the original array element has all keys in the param
605  * otherwise throws exception
606  **/
607  std::shared_ptr<unity_sarray_base> dict_has_all_keys(const std::vector<flexible_type>& keys);
608 
609  /**
610  ** Returns a new SArray that contains elements that are the length of each item
611  ** in input SArray. This function only works on SArray of type vector, list and
612  ** dict. It is equivalent to the following python work
613  ** sa_ret = sa.apply(lambda x: len(x))
614  **/
615  std::shared_ptr<unity_sarray_base> item_length();
616 
617  /**
618  * Expand an SArray of datetime type to a set of new columns.
619  *
620  * \param column_name_prefix: prefix for the expanded column name
621  * \param expanded_column_elements: a list including the elements to expand
622  * from the datetime column. Elements could be 'year','month','day'
623  * 'hour','minute','second', and 'timezone'.
624  * \param expanded_columns_types: list of types for the expanded columns
625 
626  * Returns a new SFrame that contains the expanded columns
627  **/
628  std::shared_ptr<unity_sframe_base> expand(
629  const std::string& column_name_prefix,
630  const std::vector<flexible_type>& expanded_column_elements,
631  const std::vector<flex_type_enum>& expanded_columns_types);
632  /**
633  * Unpack an SArray of dict/list/vector type to a set of new columns.
634  * For dictionary type, each unique key is a new column
635  * For vector/list type, each sub column in the vector is a new column
636  *
637  * \param column_name_prefix: prefix for the unpacked column name
638  * \param unpacked_keys: list of keys to unpack, this is list of string for
639  * dictionary type, and list of integers for list/array type. This list is
640  * used to limit the subset of values to unpack
641  * \param unpacked_column_types: list of types for the unpacked columns
642  * \param na_value: if not undefined, replace all na_value with missing values
643 
644  * Returns a new SFrame that contains the unpacked columns
645  **/
646  std::shared_ptr<unity_sframe_base> unpack(
647  const std::string& column_name_prefix,
648  const std::vector<flexible_type>& unpacked_keys,
649  const std::vector<flex_type_enum>& unpacked_columns_types,
650  const flexible_type& na_value);
651 
652  /**
653  * Unpack a dict SArray to a set of new columns by extracting each key from dict
654  * and creating new column for each unique key. The key name becomes column name
655 
656  * \param column_name_prefix: prefix for the unpacked column name
657  * \param limit: limited keys for the unpack
658  * \param na_value: if not undefined, replace all na_value with missing values
659 
660  * Returns a new SFrame that contains the unpacked columns
661  **/
662  std::shared_ptr<unity_sframe_base> unpack_dict(
663  const std::string& column_name_prefix,
664  const std::vector<flexible_type>& limit,
665  const flexible_type& na_value);
666 
667  /**
668  * Return the subslice of the sarray
669  *
670  * \param start The start row of the slice, cycle around if negative
671  * \param step Take an element every step, can be negative
672  * \param stop The end row of the slice, cycle around if negative
673  */
674  std::shared_ptr<unity_sarray_base> subslice(flexible_type start, flexible_type step, flexible_type stop);
675 
676 
677  /**
678  * is_true and is_false and this SArray must be the same size.
679  * Returns an SArray of the same size.
680  *
681  * For each non-zero value in this SArray, it picks up the corresponding value from is_true.
682  * For each zero value in this SArray, it picks up the corresponding value from is_false.
683  */
684  std::shared_ptr<unity_sarray_base> ternary_operator(std::shared_ptr<unity_sarray_base> is_true,
685  std::shared_ptr<unity_sarray_base> is_false);
686 
687 
688  /**
689  * Returns an SArray of the same length but with all constant values.
690  * Does so without materializing the SArray.
691  *
692  * \note This is really only useful for internal use
693  */
694  std::shared_ptr<unity_sarray_base> to_const(const flexible_type& value, flex_type_enum dtype);
695 
696  /**
697  * Begin iteration through the SArray.
698  *
699  * Works together with \ref iterator_get_next(). The usage pattern
700  * is as follows:
701  * \code
702  * array.begin_iterator();
703  * while(1) {
704  * auto ret = array.iterator_get_next(64);
705  * // do stuff
706  * if (ret.size() < 64) {
707  * // we are done
708  * break;
709  * }
710  * }
711  * \endcode
712  *
713  * Note that use of pretty much any of the other data-dependent SArray
714  * functions will invalidate the iterator.
715  */
716  void begin_iterator();
717 
718  /**
718  * Obtains the next block of elements of size len from the SArray.
720  * Works together with \ref begin_iterator(). See the code example
721  * in \ref begin_iterator() for details.
722  *
723  * This function will always return a vector of length 'len' unless
723  * at the end of the array, or if an error has occurred.
725  *
726  * \param len The number of elements to return
726  * \returns The next collection of elements in the array. Returns fewer than
728  * len elements on end of file or failure.
729  */
730  std::vector<flexible_type> iterator_get_next(size_t len);
731 
732  /**
733  * Return the content as a vector. Convenience function.
     *
     * Implemented on top of the iteration interface: it calls begin_iterator()
     * and then requests size() elements in a single iterator_get_next() call.
     * NOTE(review): because it calls begin_iterator(), this presumably restarts
     * any iteration already in progress on this object -- confirm against the
     * implementation.
     *
     * \returns The vector produced by iterator_get_next(size()).
734  */
735  std::vector<flexible_type> to_vector() {
736  begin_iterator();  // (re)start iteration before reading
737  return iterator_get_next(size());  // request the whole array as one block
738  }
739 
740  /**
740  * Materializes the sarray. This is different from save(): it temporarily persists
741  * this sarray to disk to speed up some computations (for example, lambda).
742  * This will NOT create a new unity_sarray.
744  **/
745  void materialize();
746 
747  /**
748  * test hook to check if the array is materialized
749  **/
750  bool is_materialized();
751  /**
751  * Returns an integer which attempts to uniquely identify the contents of
753  * the SArray.
754  *
755  * This is not generally guaranteed to be actually a unique identifier for
756  * the data contents. It certainly tries to be, but both false positives
757  * and false negatives can be possible. It tries *really* hard to avoid
758  * false positives though.
759  *
760  * If the array is lazy, it returns a random number.
761  * If the array is materialized, it returns a hash of the file names and
762  * row sizes that make up the array.
763  */
764  size_t get_content_identifier();
765 
766  /**
767  * Extracts a range of rows from an SArray as a new SArray.
768  * This will extract rows beginning at start (inclusive) and ending at
769  * end(exclusive) in steps of "step".
770  * step must be at least 1.
771  */
772  std::shared_ptr<unity_sarray_base> copy_range(size_t start, size_t step, size_t end);
773 
774  static std::shared_ptr<unity_sarray_base>
775  create_sequential_sarray(ssize_t size, ssize_t start, bool reverse);
776 
777  /**
778  * Construct a boolean array with approximately a percent of the array
779  * randomly true.
780  *
781  * if exact is false,
782  * each row is a sample from Bernoulli(percent). On average, 'percent'
783  * fraction of the array will be true, but this will not be exact.
784  *
785  * If exact is true, \ref make_exact_uniform_boolean_array is used.
786  */
787  static std::shared_ptr<unity_sarray_base> make_uniform_boolean_array(size_t size,
788  float percent,
789  uint64_t random_seed,
790  bool exact=false);
791 
792  /**
793  * Construct a boolean array with exactly a certain number of true elements.
794  *
795  * if num_trues is > size, an array of all trues of length size is returned.
796  */
797  static std::shared_ptr<unity_sarray_base> make_exact_uniform_boolean_array(size_t size,
798  size_t num_trues,
799  uint64_t random_seed);
800 
801  /**
801  * Construct an int array with uniform distribution between 0 and \ref max_int.
803  *
804  */
805  static std::shared_ptr<unity_sarray_base> make_uniform_int_array(size_t size, size_t max_int);
806 
807  std::shared_ptr<unity_sarray_base> builtin_rolling_apply(
808  const std::string &fn_name,
809  ssize_t before,
810  ssize_t after,
811  size_t min_observations);
812 
813  std::shared_ptr<unity_sarray_base> builtin_cumulative_aggregate(const std::string& name);
814 
815  void save(oarchive& oarc) const;
816 
817  void load(iarchive& iarc);
818 
819  void show(const std::string& path_to_client,
820  const flexible_type& title,
821  const flexible_type& xlabel,
822  const flexible_type& ylabel);
823 
824  std::shared_ptr<model_base> plot(const flexible_type& _title,
825  const flexible_type& _xlabel,
826  const flexible_type& _ylabel);
827 
828  private:
829  /**
830  * Pointer to the lazy evaluator logical operator node.
831  * This can never be NULL.
832  */
833  std::shared_ptr<query_eval::planner_node> m_planner_node;
834 
835  /**
836  * Supports \ref begin_iterator() and \ref iterator_get_next().
837  * The next segment I will read. (i.e. the current segment I am reading
838  * is iterator_next_segment_id - 1)
839  */
840  size_t iterator_next_segment_id = 0;
841 
842 
843  /**
844  * A copy of the current SArray. This allows iteration, and other
844  * SArray operations to operate together safely in harmony without collisions.
846  */
847  std::unique_ptr<sarray_reader<flexible_type> > iterator_sarray_ptr;
848 
849  /**
850  * Supports \ref begin_iterator() and \ref iterator_get_next().
851  * The begin iterator of the current segment I am reading.
852  */
853  std::unique_ptr<sarray_iterator<flexible_type>> iterator_current_segment_iter;
854 
855  /**
856  * Supports \ref begin_iterator() and \ref iterator_get_next().
857  * The end iterator of the current segment I am reading.
858  */
859  std::unique_ptr<sarray_iterator<flexible_type>> iterator_current_segment_enditer;
860 
861 
862  /**
863  * Performs either of "array [op] other" or "other [op] array",
864  * where other is a scalar value.
865  * The operation must be one of the following: "+", "-", "*", "/", "<", ">",
866  * "<=", ">=", "==", "!=". The type of the new array is dependent on the
867  * semantics of the operation.
868  * - comparison operators always return integers
869  * - +,-,* of integer against integers always return integers
870  * - / of integer against integer always returns floats
871  * - +,-,*,/ of floats against floats always return floats
872  * - +,-,*,/ of integer against floats or floats against integers
873  * always return floats.
874  *
875  * This function throws a string exception if there is a type mismatch (
876  * for instance you cannot add a string value to an integer array), or if
877  * the operation is invalid.
878  *
879  * If right_operator is false, array [op] other is performed.
879  * Otherwise, if right_operator is true, other [op] array is performed.
881  *
882  * UNDEFINED values in the array are ignored.
883  *
884  * On success, a new array is returned. The new array
885  * is the same length and has the same segment structure.
886  */
887  std::shared_ptr<unity_sarray_base> scalar_operator(flexible_type other,
888  std::string op,
889  bool right_operator);
890 
891 
892  void construct_from_unity_sarray(const unity_sarray& other);
893 
894 };
895 
896 } // namespace turi
897 
898 #endif
The serialization input archive object which, provided with a reference to an istream, will read from the istream, providing deserialization capabilities.
Definition: iarchive.hpp:60
std::set< Key > keys(const std::map< Key, T > &map)
Definition: stl_util.hpp:358
STL namespace.
void copy_range(S &&input, T &&output, size_t start, size_t step, size_t end)
Definition: algorithm.hpp:599
std::vector< flexible_type > _head(size_t nrows)
std::vector< flexible_type > to_vector()
The serialization output archive object which, provided with a reference to an ostream, will write to the ostream, providing serialization capabilities.
Definition: oarchive.hpp:80
void transform(S &&input, T &&output, TransformFn transformfn, std::set< size_t > constraint_segments=std::set< size_t >())
Definition: algorithm.hpp:64