Turi Create  4.0
turi::unity_sarray Class Reference

#include <core/storage/sframe_interface/unity_sarray.hpp>

Public Member Functions

 unity_sarray ()
 
 ~unity_sarray ()
 Destructor. Deletes all temporary sarray files created.
 
void construct_from_vector (const std::vector< flexible_type > &vec, flex_type_enum type)
 
void construct_from_sarray (std::shared_ptr< sarray< flexible_type >> s_ptr)
 
void construct_from_const (const flexible_type &value, size_t size, flex_type_enum type=flex_type_enum::UNDEFINED)
 
void construct_from_planner_node (std::shared_ptr< query_eval::planner_node > node)
 
void construct_from_sarray_index (std::string location)
 
void construct_from_files (std::string url, flex_type_enum type)
 
void construct_from_json_record_files (std::string url)
 
void construct_from_autodetect (std::string url, flex_type_enum type)
 
void save_array (std::string target_directory)
 
void save_array_by_index_file (std::string index_file)
 
void clear ()
 
size_t size ()
 
bool has_size ()
 
std::shared_ptr< sarray< flexible_type > > get_underlying_sarray ()
 
std::shared_ptr< query_eval::planner_nodeget_planner_node ()
 
std::shared_ptr< unity_sarray_base > head (size_t nrows)
 
std::vector< flexible_type_head (size_t nrows)
 
flex_type_enum dtype ()
 
std::shared_ptr< unity_sarray_base > transform (const std::string &lambda, flex_type_enum type, bool skip_undefined, uint64_t seed)
 
std::shared_ptr< unity_sarray_base > transform_native (const function_closure_info &closure, flex_type_enum type, bool skip_undefined, uint64_t seed)
 
std::shared_ptr< unity_sarray_base > append (std::shared_ptr< unity_sarray_base > other)
 
std::shared_ptr< unity_sarray_base > vector_slice (size_t start, size_t end)
 
std::shared_ptr< unity_sarray_base > filter (const std::string &lambda, bool skip_undefined, uint64_t seed)
 
std::shared_ptr< unity_sarray_base > logical_filter (std::shared_ptr< unity_sarray_base > index)
 
std::shared_ptr< unity_sarray_base > topk_index (size_t k, bool reverse)
 
bool all ()
 
bool any ()
 
std::shared_ptr< unity_sarray_base > datetime_to_str (std::string format)
 
std::shared_ptr< unity_sarray_base > str_to_datetime (std::string format)
 
std::shared_ptr< unity_sarray_base > astype (flex_type_enum dtype, bool undefined_on_failure=false)
 
std::shared_ptr< unity_sarray_base > lazy_astype (flex_type_enum dtype, bool undefined_on_failure=false)
 
std::shared_ptr< unity_sarray_base > clip (flexible_type lower=flex_undefined(), flexible_type upper=flex_undefined())
 
flexible_type max ()
 
flexible_type min ()
 
flexible_type sum ()
 
flexible_type mean ()
 
flexible_type median (bool approx)
 
flexible_type std (size_t ddof=0)
 
flexible_type var (size_t ddof=0)
 
size_t num_missing ()
 
size_t nnz ()
 
std::shared_ptr< unity_sarray_base > left_scalar_operator (flexible_type other, std::string op)
 
std::shared_ptr< unity_sarray_base > right_scalar_operator (flexible_type other, std::string op)
 
std::shared_ptr< unity_sarray_base > vector_operator (std::shared_ptr< unity_sarray_base > other, std::string op)
 
std::shared_ptr< unity_sarray_base > drop_missing_values ()
 
std::shared_ptr< unity_sarray_base > missing_mask (bool recursive=false, bool missing_is_true=true)
 
std::shared_ptr< unity_sarray_base > fill_missing_values (flexible_type default_value)
 
std::shared_ptr< unity_sarray_base > tail (size_t nrows=10)
 
std::shared_ptr< unity_sarray_base > sample (float percent, uint64_t random_seed, bool exact=false)
 
std::shared_ptr< unity_sarray_base > hash (uint64_t seed)
 
std::shared_ptr< unity_sarray_base > count_bag_of_words (std::map< std::string, flexible_type > options)
 
std::shared_ptr< unity_sarray_base > count_character_ngrams (size_t n, std::map< std::string, flexible_type > options)
 
std::shared_ptr< unity_sarray_base > count_ngrams (size_t n, std::map< std::string, flexible_type > options)
 
std::shared_ptr< unity_sarray_base > dict_trim_by_keys (const std::vector< flexible_type > &keys, bool exclude)
 
std::shared_ptr< unity_sarray_base > dict_trim_by_values (const flexible_type &lower, const flexible_type &upper)
 
std::shared_ptr< unity_sarray_base > dict_keys ()
 
std::shared_ptr< unity_sarray_base > dict_values ()
 
std::shared_ptr< unity_sarray_base > dict_has_any_keys (const std::vector< flexible_type > &keys)
 
std::shared_ptr< unity_sarray_base > dict_has_all_keys (const std::vector< flexible_type > &keys)
 
std::shared_ptr< unity_sarray_base > item_length ()
 
std::shared_ptr< unity_sframe_base > expand (const std::string &column_name_prefix, const std::vector< flexible_type > &expanded_column_elements, const std::vector< flex_type_enum > &expanded_columns_types)
 
std::shared_ptr< unity_sframe_base > unpack (const std::string &column_name_prefix, const std::vector< flexible_type > &unpacked_keys, const std::vector< flex_type_enum > &unpacked_columns_types, const flexible_type &na_value)
 
std::shared_ptr< unity_sframe_base > unpack_dict (const std::string &column_name_prefix, const std::vector< flexible_type > &limit, const flexible_type &na_value)
 
std::shared_ptr< unity_sarray_base > subslice (flexible_type start, flexible_type step, flexible_type stop)
 
std::shared_ptr< unity_sarray_base > ternary_operator (std::shared_ptr< unity_sarray_base > is_true, std::shared_ptr< unity_sarray_base > is_false)
 
std::shared_ptr< unity_sarray_base > to_const (const flexible_type &value, flex_type_enum dtype)
 
void begin_iterator ()
 
std::vector< flexible_typeiterator_get_next (size_t len)
 
std::vector< flexible_typeto_vector ()
 
void materialize ()
 
bool is_materialized ()
 
size_t get_content_identifier ()
 
std::shared_ptr< unity_sarray_base > copy_range (size_t start, size_t step, size_t end)
 

Static Public Member Functions

static std::shared_ptr< unity_sarray_base > make_uniform_boolean_array (size_t size, float percent, uint64_t random_seed, bool exact=false)
 
static std::shared_ptr< unity_sarray_base > make_exact_uniform_boolean_array (size_t size, size_t num_trues, uint64_t random_seed)
 
static std::shared_ptr< unity_sarray_base > make_uniform_int_array (size_t size, size_t max_int)
 

Detailed Description

This is the SArray object exposed to Python. Abstractly, it stores a single column of a flexible_type. An Sarray represents a single immutable column: i.e. once created, it cannot be modified.

Internally, it is represented as a single shared_ptr to an "sarray<flexible_type>" sarray object. We delay construction of the internal sarray object until a "construct" call is made. This allows the class to be used in the following way:

// creates the array
array.construct(...);
// now the array is immutable.

Multiple different construct functions can then be used to create sarrays from different sources: some sources may require the sarray to download files, etc.

The SArray may require temporary on disk storage which will be deleted when the SArray is deleted. The temporary file names are obtained from turi::get_temp_name

Definition at line 54 of file unity_sarray.hpp.

Constructor & Destructor Documentation

◆ unity_sarray()

turi::unity_sarray::unity_sarray ( )

Default Constructor. Does nothing basically. Use one of the construct_from_* functions to construct the contents of the SArray.

Member Function Documentation

◆ _head()

std::vector<flexible_type> turi::unity_sarray::_head ( size_t  nrows)
inline

Same as head, return vector<flexible_type>, used for testing.

Definition at line 182 of file unity_sarray.hpp.

◆ all()

bool turi::unity_sarray::all ( )

Returns true if all the values in the sarray are non-zero / non-empty. An empty array returns true.

◆ any()

bool turi::unity_sarray::any ( )

Returns true if any value in the sarray is non-zero / non-empty. An empty array returns false.

◆ append()

std::shared_ptr<unity_sarray_base> turi::unity_sarray::append ( std::shared_ptr< unity_sarray_base >  other)

Append all rows from "other" sarray to "this" sarray and returns a new sarray that contains all rows from both sarrays

◆ astype()

std::shared_ptr<unity_sarray_base> turi::unity_sarray::astype ( flex_type_enum  dtype,
bool  undefined_on_failure = false 
)

Creates a new SArray with the same values as current one, but casted to the given type.

If undefined_on_failure is set, cast failures do not cause errors, but become undefined values.

See also
lazy_astype

◆ begin_iterator()

void turi::unity_sarray::begin_iterator ( )

Begin iteration through the SArray.

Works together with iterator_get_next(). The usage pattern is as follows:

array.begin_iterator();
while(1) {
auto ret = array.iterator_get_next(64);
// do stuff
if (ret.size() < 64) {
// we are done
break;
}
}

Note that use of pretty much any of the other data-dependent SArray functions will invalidate the iterator.

◆ clear()

void turi::unity_sarray::clear ( )

Clears the contents of the SArray, deleting all temporary files if any.

◆ clip()

std::shared_ptr<unity_sarray_base> turi::unity_sarray::clip ( flexible_type  lower = flex_undefined(),
flexible_type  upper = flex_undefined() 
)

Creates a new SArray with the same values as the current one, except any values above or below the given bounds are changed to be equal to the bound.

If lower or upper are given a flex_undefined(), this is interpreted to mean that there is no bound there. For example, clip(flex_undefined(), 25) clips with no lower bound and an upper bound of 25.

◆ construct_from_autodetect()

void turi::unity_sarray::construct_from_autodetect ( std::string  url,
flex_type_enum  type 
)

Given a URL, this function attempts to autodetect if it should

  • treat it as a .sidx file and load an SArray from it (construct_from_sarray_index)
  • treat it as a file to read line by line (construct_from_files)
  • treat it as a directory and load an SArray from it (construct_from_sarray_index)

◆ construct_from_const()

void turi::unity_sarray::construct_from_const ( const flexible_type value,
size_t  size,
flex_type_enum  type = flex_type_enum::UNDEFINED 
)

Constructs a unity_sarray from a const value.

◆ construct_from_files()

void turi::unity_sarray::construct_from_files ( std::string  url,
flex_type_enum  type 
)

Constructs an Sarray from a url. Each line of the file will be a row in the resultant SArray, and each row will be of string type. If the current object is already storing an array, it is cleared (clear()).May throw an exception on failure. If an exception occurs, the contents of SArray is empty.

◆ construct_from_json_record_files()

void turi::unity_sarray::construct_from_json_record_files ( std::string  url)

Constructs an SArray from one or more json record files.

Each json record file contains an array of dictionaries. Resultant SArray is of dictionary type.

◆ construct_from_planner_node()

void turi::unity_sarray::construct_from_planner_node ( std::shared_ptr< query_eval::planner_node node)

Constructs a unity_sarray from a parallel iterator generator.

◆ construct_from_sarray()

void turi::unity_sarray::construct_from_sarray ( std::shared_ptr< sarray< flexible_type >>  s_ptr)

Constructs a unity_sarray from an existing sarray. This simply sets this class's shared_ptr to the one given by the parameter.

◆ construct_from_sarray_index()

void turi::unity_sarray::construct_from_sarray_index ( std::string  location)

Constructs an Sarray from an existing directory on disk saved with save_array() or a on disk sarray prefix (saved with save_array_by_index_file()). This function will automatically detect if the location is a directory, or a file. The files will not be deleted on destruction. If the current object is already storing an array, it is cleared (clear()). May throw an exception on failure. If an exception occurs, the contents of SArray is empty.

◆ construct_from_vector()

void turi::unity_sarray::construct_from_vector ( const std::vector< flexible_type > &  vec,
flex_type_enum  type 
)

Constructs an Sarray from an in memory vector. If the current object is already storing an array, it is cleared (clear()).May throw an exception on failure. If an exception occurs, the contents of SArray is empty.

◆ copy_range()

std::shared_ptr<unity_sarray_base> turi::unity_sarray::copy_range ( size_t  start,
size_t  step,
size_t  end 
)

Extracts a range of rows from an SArray as a new SArray. This will extract rows beginning at start (inclusive) and ending at end(exclusive) in steps of "step". step must be at least 1.

◆ count_bag_of_words()

std::shared_ptr<unity_sarray_base> turi::unity_sarray::count_bag_of_words ( std::map< std::string, flexible_type options)

Do a word-count for each element in the SArray and return a SArray of dictionary

◆ count_character_ngrams()

std::shared_ptr<unity_sarray_base> turi::unity_sarray::count_character_ngrams ( size_t  n,
std::map< std::string, flexible_type options 
)

Do a character n-gram count for each element in the SArray and return a SArray of dictionary type. Parameter n is the number or charachters in each n-gram options takes: to_lower, which makes words lower case ignore_space, which ignores spaces in calculating charachter n-grams

◆ count_ngrams()

std::shared_ptr<unity_sarray_base> turi::unity_sarray::count_ngrams ( size_t  n,
std::map< std::string, flexible_type options 
)

Do a character n-gram count for each element in the SArray and return a SArray of dictionary type. Parameter n is the number of words in each n-gram options takes: to_lower, which makes words lower case

◆ datetime_to_str()

std::shared_ptr<unity_sarray_base> turi::unity_sarray::datetime_to_str ( std::string  format)

Creates a new SArray with the datetime values casted to string.

"format" determines the string format for the output SArray.

◆ dict_has_all_keys()

std::shared_ptr<unity_sarray_base> turi::unity_sarray::dict_has_all_keys ( const std::vector< flexible_type > &  keys)

If SArray dtype is dict, returns a new SArray which contains integer of 1s or 0s with 1 means the original array element has all keys in the param otherwise throws exception

◆ dict_has_any_keys()

std::shared_ptr<unity_sarray_base> turi::unity_sarray::dict_has_any_keys ( const std::vector< flexible_type > &  keys)

If SArray dtype is dict, returns a new SArray which contains integer of 1s or 0s with 1 means the original array element has at least one key in the param otherwise throws exception

◆ dict_keys()

std::shared_ptr<unity_sarray_base> turi::unity_sarray::dict_keys ( )

If SArray dtype is dict, returns a new SArray which contains keys for input dictionary otherwise throws exception

◆ dict_trim_by_keys()

std::shared_ptr<unity_sarray_base> turi::unity_sarray::dict_trim_by_keys ( const std::vector< flexible_type > &  keys,
bool  exclude 
)

If SArray dtype is dict, filter out each dict by the given keys. If exclude is True, then all keys that are in the input key list are removed If exclude is False, then only keys that are in the input key list are retained

◆ dict_trim_by_values()

std::shared_ptr<unity_sarray_base> turi::unity_sarray::dict_trim_by_values ( const flexible_type lower,
const flexible_type upper 
)

If SArray dtype is dict, filter out each dict by the given value boundary. all items whose value is not in the low/up bound are removed from the dictionary The boundary are included. I.e, if a value is either lower or upper bound, then the key/value pair is included in the result This function will fail if the value is not comparable

◆ dict_values()

std::shared_ptr<unity_sarray_base> turi::unity_sarray::dict_values ( )

If SArray dtype is dict, returns a new SArray which contains values for input dictionary otherwise throws exception

◆ drop_missing_values()

std::shared_ptr<unity_sarray_base> turi::unity_sarray::drop_missing_values ( )

Returns a new array with all UNDEFINED values removed. A new array is returned with the same type as the current array, but potentially shorter. If the array has no missing values, the output array has the same length and the same segment structure as this array.

◆ dtype()

flex_type_enum turi::unity_sarray::dtype ( )

Returns the type name of the SArray

◆ expand()

std::shared_ptr<unity_sframe_base> turi::unity_sarray::expand ( const std::string &  column_name_prefix,
const std::vector< flexible_type > &  expanded_column_elements,
const std::vector< flex_type_enum > &  expanded_columns_types 
)

Expand an SArray of datetime type to a set of new columns.

Parameters
column_name_prefixprefix for the expanded column name
expanded_column_elementsa list including the elements to expand from the datetime column. Elements could be 'year','month','day' 'hour','minute','second', and 'timezone'.
expanded_columns_typeslist of types for the expanded columns

Returns a new SFrame that contains the expanded columns

◆ fill_missing_values()

std::shared_ptr<unity_sarray_base> turi::unity_sarray::fill_missing_values ( flexible_type  default_value)

Returns a new array with all UNDEFINED values replaced with the given value.

Throws if the given value is not convertible to the SArray's type.

◆ filter()

std::shared_ptr<unity_sarray_base> turi::unity_sarray::filter ( const std::string &  lambda,
bool  skip_undefined,
uint64_t  seed 
)

Returns a new SArray which is filtered to by the given lambda function. If the lambda evaluates an element to true, this element is copied to the new SArray. If not, it isn't. Throws an exception if the return type of the lambda is not castable to a boolean value.

◆ get_content_identifier()

size_t turi::unity_sarray::get_content_identifier ( )

Returns an integer which attempts to uniquely identifies the contents of the SArray.

This is not generally guaranteed to be actually a unique identifier for the data contents. It certainly tries to be, but both false positives and false negatives can be possible. It tries really hard to avoid false positives though.

If the array is lazy, it returns a random number. If the array is materialized, it returns a hash of the file names and row sizes that make up the array.

◆ get_planner_node()

std::shared_ptr<query_eval::planner_node> turi::unity_sarray::get_planner_node ( )

Returns the underlying planner pointer

◆ get_underlying_sarray()

std::shared_ptr<sarray<flexible_type> > turi::unity_sarray::get_underlying_sarray ( )

Obtains the underlying sarray pointer. TODO: will slowly move away all users of this function to get_lazy_sarray

◆ has_size()

bool turi::unity_sarray::has_size ( )

Returns true if size is obtainable efficiently.

◆ hash()

std::shared_ptr<unity_sarray_base> turi::unity_sarray::hash ( uint64_t  seed)

Returns an SArray of type flex_int that contains the hash of each element. The hash function takes a seed value so this can be used for random generation as well.

◆ head()

std::shared_ptr<unity_sarray_base> turi::unity_sarray::head ( size_t  nrows)

Returns some number of rows of the SArray

NOTE: If there are more elements asked for than can fit into memory, this makes no attempt to stop crashing your computer.

◆ is_materialized()

bool turi::unity_sarray::is_materialized ( )

test hook to check if the array is materialized

◆ item_length()

std::shared_ptr<unity_sarray_base> turi::unity_sarray::item_length ( )

Returns a new SArray that contains elements that are the length of each item in input SArray. This function only works on SArray of type vector, list and dict. It is equivalent to the following python work sa_ret = sa.apply(lambda x: len(x))

◆ iterator_get_next()

std::vector<flexible_type> turi::unity_sarray::iterator_get_next ( size_t  len)

Obtains the next block of elements of size len from the SFrame. Works together with begin_iterator(). See the code example in begin_iterator() for details.

This function will always return a vector of length 'len' unless at the end of the array, or if an error has occured.

Parameters
lenThe number of elements to return
Returns
The next collection of elements in the array. Returns less then len elements on end of file or failure.

◆ lazy_astype()

std::shared_ptr<unity_sarray_base> turi::unity_sarray::lazy_astype ( flex_type_enum  dtype,
bool  undefined_on_failure = false 
)

Creates a new SArray with the same values as current one, but casted to the given type. Performed lazily.

If undefined_on_failure is set, cast failures do not cause errors, but become undefined values.

See also
astype

◆ left_scalar_operator()

std::shared_ptr<unity_sarray_base> turi::unity_sarray::left_scalar_operator ( flexible_type  other,
std::string  op 
)

Performs the equivalent of array [op] other , where other is a scalar value. The operation must be one of the following: "+", "-", "*", "/", "<", ">", "<=", ">=", "==", "!=",'','in'. The type of the new array is dependent on the semantics of the operation.

  • comparison operators always return integers
  • +,-,* of integer against integers always return integers
  • / of integer against integer always returns floats
  • +,-,*,/ of floats against floats always return floats
  • +,-,*,/ of integer against floats or floats against integers always return floats.
  • %. integer against integers -> integers
  • "in". must be string against string, and returns a boolean.

This function throws a string exception if there is a type mismatch ( for instance you cannot add a string value to an integer array), or if the operation is invalid.

UNDEFINED values in the array are ignored.

On success, a new array is returned. The new array is the same length and has the same segment structure.

◆ logical_filter()

std::shared_ptr<unity_sarray_base> turi::unity_sarray::logical_filter ( std::shared_ptr< unity_sarray_base >  index)

Returns a new SArray which is filtered by a given logical column. The index array must be the same length as the current array. An output array is returned containing only the elements in the current where are the corresponding element in the index array evaluates to true.

◆ make_exact_uniform_boolean_array()

static std::shared_ptr<unity_sarray_base> turi::unity_sarray::make_exact_uniform_boolean_array ( size_t  size,
size_t  num_trues,
uint64_t  random_seed 
)
static

Construct a boolean array with exactly a certain number of true elements.

if num_trues is > size, an array of all trues of length size is returned.

◆ make_uniform_boolean_array()

static std::shared_ptr<unity_sarray_base> turi::unity_sarray::make_uniform_boolean_array ( size_t  size,
float  percent,
uint64_t  random_seed,
bool  exact = false 
)
static

Construct a boolean array with approximately a percent of the array randomly true.

if exact is false, each row is a sample from Bernoulli(percent). On average, 'percent' fraction of the array will be true, but this will not be exact.

If exact is true, make_exact_uniform_boolean_array is used.

◆ make_uniform_int_array()

static std::shared_ptr<unity_sarray_base> turi::unity_sarray::make_uniform_int_array ( size_t  size,
size_t  max_int 
)
static

Construct a int array with uniform distribution between 0 and max_int.

◆ materialize()

void turi::unity_sarray::materialize ( )

materialize the sarray, this is different from save() as this is a temporary persist of this sarray to disk to speed up some computation (for example, lambda) this will NOT create a new uity_sarray.

◆ max()

flexible_type turi::unity_sarray::max ( )

Returns the largest element in the sarray. An empty array returns flex_undefined, which in python is numpy.nan. Only works for INTEGER and FLOAT. Throws an exception if invoked on an sarray of any other type. Undefined values in the array are skipped.

◆ mean()

flexible_type turi::unity_sarray::mean ( )

Returns the mean of the elements in sarray as a flex_float.

Invoking on an empty sarray returns flex_undefined. Invoking on a non-numeric type throws an exception. Undefined values in the array are skipped.

◆ median()

flexible_type turi::unity_sarray::median ( bool  approx)

Returns the medain of the elements in the sarray.

Invoking on an empty sarray returns flex_undefined. Invoking on a non-numeric type throws an exception. Undefined values in the array are skipped.

◆ min()

flexible_type turi::unity_sarray::min ( )

Returns the smallest element in the sarray. An empty array returns flex_undefined, which in python is numpy.nan. Only works for INTEGER and FLOAT. Throws an exception if invoked on an sarray of any other type. Undefined values in the array are skipped.

◆ missing_mask()

std::shared_ptr<unity_sarray_base> turi::unity_sarray::missing_mask ( bool  recursive = false,
bool  missing_is_true = true 
)

Returns a new integer typed array indicating the presence of a missing value or float NA in the corresponding element.

If recursive is true, then it also checks if a NA is present in any element of a recursive type.

If missing_is_true is true, then the array contains a 1 if the element is a missing value and a 0 if it is not; otherwise, it returns 1 on the presence of a na.

◆ nnz()

size_t turi::unity_sarray::nnz ( )

Returns the number of non-zero elements in the array. Functionally equivalent to

nonzero().length()

But takes much less memory.

◆ num_missing()

size_t turi::unity_sarray::num_missing ( )

Returns the number of missing values in the SArray.

◆ right_scalar_operator()

std::shared_ptr<unity_sarray_base> turi::unity_sarray::right_scalar_operator ( flexible_type  other,
std::string  op 
)

Performs the equivalent of other [op] array, where other is a scalar value. The operation must be one of the following: "+", "-", "*", "/", "<", ">", "<=", ">=", "==", "!=". The type of the new array is dependent on the semantics of the operation.

  • comparison operators always return integers
  • +,-,* of integer against integers always return integers
  • / of integer against integer always returns floats
  • +,-,*,/ of floats against floats always return floats
  • +,-,*,/ of integer against floats or floats against integers always return floats.

This function throws a string exception if there is a type mismatch ( for instance you cannot add a string value to an integer array), or if the operation is invalid.

UNDEFINED values in the array are ignored.

On success, a new array is returned. The new array is the same length and has the same segment structure.

◆ sample()

std::shared_ptr<unity_sarray_base> turi::unity_sarray::sample ( float  percent,
uint64_t  random_seed,
bool  exact = false 
)

Returns a uniform random sample of the sarray, that contains percent of the total elements, without replacement, using the random_seed.

◆ save_array()

void turi::unity_sarray::save_array ( std::string  target_directory)

Saves a copy of the current sarray into a directory. Does not modify the current sarray

◆ save_array_by_index_file()

void turi::unity_sarray::save_array_by_index_file ( std::string  index_file)

Saves a copy of the current sarray into a target location defined by an index file. DOes not modify the current sarray.

◆ size()

size_t turi::unity_sarray::size ( )

Returns the number of rows in the SArray. Or 0 if the SArray is empty.

◆ std()

flexible_type turi::unity_sarray::std ( size_t  ddof = 0)

Returns the standard deviation of the elements in sarray as a flex_float.

Parameters
ddof...stands for "delta degrees of freedom". Adjusts the degrees of freedom in the variance calculation. If ddof=0, there are N degrees of freedom, with N being the number of elements in the sarray.

Throws an exception if: ddof >= sarray size sarray is of a non-numeric type

Returns flex_undefined if executed on empty or non-existent sarray. Undefined values in the array are skipped.

◆ str_to_datetime()

std::shared_ptr<unity_sarray_base> turi::unity_sarray::str_to_datetime ( std::string  format)

Creates a new SArray with the string values casted to datetime.

"format" determines the string format for the input SArray.

◆ subslice()

std::shared_ptr<unity_sarray_base> turi::unity_sarray::subslice ( flexible_type  start,
flexible_type  step,
flexible_type  stop 
)

Return the subslice of the sarray

Parameters
startThe start row of the slice, cycle around if negative
stepTake an element every step, can be negative
stopThe end row of the slice, cycle around if negative

◆ sum()

flexible_type turi::unity_sarray::sum ( )

Returns the sum of all elements in the sarray. An empty returns flex_undefined, which in python is numpy.nan. Only works for INTEGER and FLOAT. Throws an exception if invoked on an sarray of any other type. Overflows without shame. Undefined values in the array are skipped.

◆ tail()

std::shared_ptr<unity_sarray_base> turi::unity_sarray::tail ( size_t  nrows = 10)

Returns some number of rows on the end of the SArray. The values are returned in the order they were found in the SArray.

NOTE: If there are more elements asked for than can fit into memory, this makes no attempt to stop crashing your computer.

◆ ternary_operator()

std::shared_ptr<unity_sarray_base> turi::unity_sarray::ternary_operator ( std::shared_ptr< unity_sarray_base >  is_true,
std::shared_ptr< unity_sarray_base >  is_false 
)

is_true and is_false and this SArray must be the same size. Returns an SArray of the same size.

For each non-zero value in this SArray, it picks up the corresponding value from is_true. For each zero value in this SArray, it picks up the corresponding value from is_false.

◆ to_const()

std::shared_ptr<unity_sarray_base> turi::unity_sarray::to_const ( const flexible_type value,
flex_type_enum  dtype 
)

Returns an SArray of the same length but with all constant values. Does so without materializing the SArray.

Note
This is really only useful for internal use

◆ to_vector()

std::vector<flexible_type> turi::unity_sarray::to_vector ( )
inline

Return the content as a vector. Convenience function.

Definition at line 735 of file unity_sarray.hpp.

◆ topk_index()

std::shared_ptr<unity_sarray_base> turi::unity_sarray::topk_index ( size_t  k,
bool  reverse 
)

Returns a new SArray which has the top k elements selected. k should be reasonably small. O(k) memory required.

If reverse if true, the bottom k is returned instead

◆ transform()

std::shared_ptr<unity_sarray_base> turi::unity_sarray::transform ( const std::string &  lambda,
flex_type_enum  type,
bool  skip_undefined,
uint64_t  seed 
)

Returns a new sarray which is a transform of this using a Python lambda function pickled into a string.

◆ transform_native()

std::shared_ptr<unity_sarray_base> turi::unity_sarray::transform_native ( const function_closure_info closure,
flex_type_enum  type,
bool  skip_undefined,
uint64_t  seed 
)

Returns a new sarray which is a transform of this using a registered toolkit function.

◆ unpack()

std::shared_ptr<unity_sframe_base> turi::unity_sarray::unpack ( const std::string &  column_name_prefix,
const std::vector< flexible_type > &  unpacked_keys,
const std::vector< flex_type_enum > &  unpacked_columns_types,
const flexible_type na_value 
)

Unpack an SArray of dict/list/vector type to a set of new columns. For dictionary type, each unique key is a new column For vector/list type, each sub column in the vector is a new column

Parameters
column_name_prefixprefix for the unpacked column name
unpacked_keyslist of keys to unpack, this is list of string for dictionary type, and list of integers for list/array type. This list is used to limit the subset of values to unpack
unpacked_column_typeslist of types for the unpacked columns
na_valueif not undefined, replace all na_value with missing values

Returns a new SFrame that contains the unpacked columns

◆ unpack_dict()

std::shared_ptr<unity_sframe_base> turi::unity_sarray::unpack_dict ( const std::string &  column_name_prefix,
const std::vector< flexible_type > &  limit,
const flexible_type na_value 
)

Unpack a dict SArray to a set of new columns by extracting each key from dict and creating new column for each unique key. The key name becomes column name

Parameters
column_name_prefixprefix for the unpacked column name
limitlimited keys for the unpack
na_valueif not undefined, replace all na_value with missing values

Returns a new SFrame that contains the unpacked columns

◆ var()

flexible_type turi::unity_sarray::var ( size_t  ddof = 0)

Returns the variance of the elements in sarray as a flex_float.

Parameters
ddof...stands for "delta degrees of freedom". Adjusts the degrees of freedom in the variance calculation. If ddof=0, there are N degrees of freedom, with N being the number of elements in the sarray.

Throws an exception if: ddof >= sarray size sarray is of a non-numeric type

Returns flex_undefined if executed on empty or non-existent SArray. Undefined values in the array are skipped.

◆ vector_operator()

std::shared_ptr<unity_sarray_base> turi::unity_sarray::vector_operator ( std::shared_ptr< unity_sarray_base >  other,
std::string  op 
)

Performs the equivalent of array [op] other, where other is an SArray. The operation must be one of the following: "+", "-", "*", "/", "<", ">", "<=", ">=", "==", "!=". The type of the new array is dependent on the semantics of the operation.

  • comparison operators always return integers
  • +,-,* of integer against integers always return integers
  • / of integer against integer always returns floats
  • +,-,*,/ of floats against floats always return floats
  • +,-,*,/ of integer against floats or floats against integers always return floats.

This function throws a string exception if there is a type mismatch ( for instance you cannot add a string value to an integer array), or if the operation is invalid.

UNDEFINED values in the array are ignored.

On success, a new array is returned. The new array is the same length and has the same segment structure.

◆ vector_slice()

std::shared_ptr<unity_sarray_base> turi::unity_sarray::vector_slice ( size_t  start,
size_t  end 
)

If this sarray contains vectors, this returns a new sarray comprising of a vertical slice of the vector from position start (inclusive) to position end (exclusive). Throws an exception if the sarray is not an vector.

If end==(start+1), the output is an SArray of doubles. if end > start, the output is an SArray of vectors, each of length (end - start). If a vector cannot be sliced (for instance the length of the vector is less than end), the resultant value will be UNDEFINED.

End must be greater than start. throws an exception otherwise.


The documentation for this class was generated from the following file: