Turi Create  4.0
turi::gl_sarray Class Reference

#include <core/data/sframe/gl_sarray.hpp>

Public Member Functions

 gl_sarray ()
 Constructs an empty SArray.
 
 gl_sarray (const gl_sarray &)
 Copy Constructor.
 
 gl_sarray (gl_sarray &&)
 Move Constructor.
 
gl_sarray & operator= (const gl_sarray &)
 Copy Assignment.
 
gl_sarray & operator= (gl_sarray &&)
 Move Assignment.
 
 gl_sarray (const std::string &directory)
 
 gl_sarray (const std::vector< flexible_type > &values, flex_type_enum dtype=flex_type_enum::UNDEFINED)
 
 gl_sarray (const std::initializer_list< flexible_type > &values)
 
gl_sarray contains (const flexible_type &other) const
 
flexible_type operator[] (int64_t i) const
 
gl_sarray operator[] (const gl_sarray &slice) const
 
gl_sarray operator[] (const std::initializer_list< int64_t > &slice) const
 
void materialize_to_callback (std::function< bool(size_t, const std::shared_ptr< sframe_rows > &)> callback, size_t nthreads=(size_t)(-1))
 
gl_sarray_range range_iterator (size_t start=0, size_t end=(size_t)(-1)) const
 
void save (const std::string &directory, const std::string &format="binary") const
 
size_t size () const
 
bool empty () const
 
flex_type_enum dtype () const
 
void materialize () const
 
bool is_materialized () const
 
gl_sarray head (size_t n) const
 
gl_sarray tail (size_t n) const
 
gl_sarray count_words (bool to_lower=true, turi::flex_list delimiters={"\r", "\v", "\n", "\f", "\t", " "}) const
 
gl_sarray count_ngrams (size_t n=2, std::string method="word", bool to_lower=true, bool ignore_space=true) const
 
gl_sarray dict_trim_by_keys (const std::vector< flexible_type > &keys, bool exclude=true) const
 
gl_sarray dict_trim_by_values (const flexible_type &lower=FLEX_UNDEFINED, const flexible_type &upper=FLEX_UNDEFINED) const
 
gl_sarray dict_keys () const
 
gl_sarray dict_values () const
 
gl_sarray dict_has_any_keys (const std::vector< flexible_type > &keys) const
 
gl_sarray dict_has_all_keys (const std::vector< flexible_type > &keys) const
 
gl_sarray apply (std::function< flexible_type(const flexible_type &)> fn, flex_type_enum dtype, bool skip_undefined=true) const
 
gl_sarray filter (std::function< bool(const flexible_type &)> fn, bool skip_undefined=true) const
 
gl_sarray sample (double fraction) const
 
gl_sarray sample (double fraction, size_t seed, bool exact=false) const
 
gl_sarray hash (size_t seed=0) const
 
bool all () const
 
bool any () const
 
flexible_type max () const
 
flexible_type min () const
 
flexible_type sum () const
 
flexible_type mean () const
 
flexible_type std () const
 
size_t nnz () const
 
size_t num_missing () const
 
gl_sarray datetime_to_str (const std::string &str_format="%Y-%m-%dT%H:%M:%S%ZP") const
 
gl_sarray str_to_datetime (const std::string &str_format="%Y-%m-%dT%H:%M:%S%ZP") const
 
gl_sarray pixel_array_to_image (size_t width, size_t height, size_t channels=3, bool undefined_on_failure=true) const
 
gl_sarray astype (flex_type_enum dtype, bool undefined_on_failure=true) const
 
gl_sarray clip (flexible_type lower=FLEX_UNDEFINED, flexible_type upper=FLEX_UNDEFINED) const
 
gl_sarray clip_lower (flexible_type threshold) const
 
gl_sarray clip_upper (flexible_type threshold) const
 
gl_sarray dropna () const
 
gl_sarray fillna (flexible_type value) const
 
gl_sarray topk_index (size_t topk=10, bool reverse=false) const
 
gl_sarray append (const gl_sarray &other) const
 
gl_sarray unique () const
 
gl_sarray item_length () const
 
gl_sframe split_datetime (const std::string &column_name_prefix="X", const std::vector< std::string > &limit={"year","month","day","hour","minute","second"}, bool tzone=false) const
 
gl_sframe unpack (const std::string &column_name_prefix="X", const std::vector< flex_type_enum > &column_types=std::vector< flex_type_enum >(), const flexible_type &na_value=FLEX_UNDEFINED, const std::vector< flexible_type > &limit=std::vector< flexible_type >()) const
 
gl_sarray sort (bool ascending=true) const
 
gl_sarray subslice (flexible_type start=FLEX_UNDEFINED, flexible_type stop=FLEX_UNDEFINED, flexible_type step=FLEX_UNDEFINED) const
 
gl_sarray cumulative_aggregate (std::shared_ptr< group_aggregate_value > aggregator) const
 
gl_sarray cumulative_sum () const
 
gl_sarray builtin_rolling_apply (const std::string &fn_name, ssize_t start, ssize_t end, size_t min_observations=size_t(-1)) const
 
void show (const std::string &path_to_client, const flexible_type &title, const flexible_type &xlabel, const flexible_type &ylabel) const
 
std::shared_ptr< visualization::Plot > plot (const flexible_type &title, const flexible_type &xlabel, const flexible_type &ylabel) const
 
virtual std::shared_ptr< unity_sarray > get_proxy () const
 
Numeric operator overloads.

Most operators are overloaded and will perform element-wise operations on the entire array.

For instance:

gl_sarray a{1,2,3,4,5};
// an array of 5 exclamation marks
gl_sarray b = gl_sarray::from_const("!", 5);
auto ret = (2 * a - 1).astype(flex_type_enum::STRING) + b;
// results in ret being the array ["1!", "3!", "5!", "7!", "9!"];

Comparison operators will return a gl_sarray of binary integers.

gl_sarray a{1,2,3,4,5};
auto ret = a > 3;
// ret will be an integer array containing [0,0,0,1,1]

Logical and bitwise operators are equivalent: & and && mean the same thing, as do | and ||; they provide logical element-wise "and" and "or" respectively.

gl_sarray a{1,2,3,4,5};
auto ret = a > 1 && a <= 4;
// ret will be an integer array containing [0,1,1,1,0]

These are useful for the logical filter operation:

gl_sarray a{1,2,3,4,5};
gl_sarray b = a.astype(flex_type_enum::STRING);
auto ret = b[a > 1 && a <= 4];
// ret will be a string array containing ["2","3","4"]

The logical and bitwise operators can be used with non-integral arrays in which case all empty values evaluate to False. i.e. for string, list, and dictionary SArrays, empty values are interpreted as false.

For instance:

gl_sarray a{"1","","2"}; // sarray of strings
gl_sarray b{1,1,0}; // sarray of integers
auto ret = a && b; // ret is now {1, 0, 0}
gl_sarray operator+ (const gl_sarray &other) const
 
gl_sarray operator- (const gl_sarray &other) const
 
gl_sarray operator* (const gl_sarray &other) const
 
gl_sarray operator/ (const gl_sarray &other) const
 
gl_sarray operator< (const gl_sarray &other) const
 
gl_sarray operator> (const gl_sarray &other) const
 
gl_sarray operator<= (const gl_sarray &other) const
 
gl_sarray operator>= (const gl_sarray &other) const
 
gl_sarray operator== (const gl_sarray &other) const
 
gl_sarray operator+ (const flexible_type &other) const
 
gl_sarray operator- (const flexible_type &other) const
 
gl_sarray operator* (const flexible_type &other) const
 
gl_sarray operator/ (const flexible_type &other) const
 
gl_sarray operator< (const flexible_type &other) const
 
gl_sarray operator> (const flexible_type &other) const
 
gl_sarray operator<= (const flexible_type &other) const
 
gl_sarray operator>= (const flexible_type &other) const
 
gl_sarray operator== (const flexible_type &other) const
 
gl_sarray operator+= (const gl_sarray &other)
 
gl_sarray operator-= (const gl_sarray &other)
 
gl_sarray operator*= (const gl_sarray &other)
 
gl_sarray operator/= (const gl_sarray &other)
 
gl_sarray operator+= (const flexible_type &other)
 
gl_sarray operator-= (const flexible_type &other)
 
gl_sarray operator*= (const flexible_type &other)
 
gl_sarray operator/= (const flexible_type &other)
 
gl_sarray operator && (const gl_sarray &other) const
 
gl_sarray operator|| (const gl_sarray &other) const
 
gl_sarray operator & (const gl_sarray &other) const
 
gl_sarray operator| (const gl_sarray &other) const
 

Static Public Member Functions

static gl_sarray from_const (const flexible_type &value, size_t size)
 
static gl_sarray from_sequence (size_t start, size_t end, bool reverse=false)
 
static gl_sarray read_json (const std::string &url)
 

Detailed Description

An immutable, homogeneously typed array object backed by persistent storage.

The gl_sarray is a contiguous column of a single type with missing value support, and works with disk to support the holding of data that is much larger than the machine's main memory. Runtime typing of the gl_sarray is managed through the flexible_type, which is an efficient runtime typed value. The types supported by the flexible_type are listed in flex_type_enum.

Construction

Abstractly the gl_sarray provides an interface to read and write flexible_type values where all values have the same type at runtime (for instance flex_type_enum::INTEGER). The special type flex_type_enum::UNDEFINED (or the value FLEX_UNDEFINED ) is used to denote a missing value and can be used in combination with any types.

For instance:

// creates an array of 5 integers
gl_sarray g({1,2,3,4,5});
// creates an array of 5 doubles
gl_sarray g({1.0,2.0,3.0,4.0,5.0});
// creates an array of 4 doubles with one missing value
gl_sarray g({1.0,2.0,3.0,FLEX_UNDEFINED,5.0});

While the gl_sarray is conceptually immutable, all that really means is that element-wise modifications are not permitted. However, full SArray assignments are permitted.

gl_sarray g({1,2,3,4,5});
gl_sarray s = g + 1;
// s is {2,3,4,5,6}

Usage

The gl_sarray API is designed to very closely mimic the Python SArray API and supports much of the Python-like capabilities, but in C++.

For instance, vector and scalar operations:

gl_sarray s{1,2,3,4,5};
gl_sarray y{2.0,3.0,2.5,1.5,2.5};
auto t = (s + 10) / y;

Logical filters:

gl_sarray s{1,2,3,4,5};
gl_sarray selector{0,0,1,1,1};
auto t = s[selector];
// t is [3,4,5]
gl_sarray s{1,2,3,4,5};
auto t = s[s < 3];
// t is [1,2]

Python Range slicing:

gl_sarray s{1,2,3,4,5};
auto t = s[{0,3}];
auto u = s[{-3,-1}];
// t is [1,2,3]
// u is [3,4]

And many others.

The gl_sarray can be read inefficiently using operator[]

gl_sarray s{1,2,3,4,5};
int val = s[2];
// val == 3

Or iterated efficiently using the range_iterator

for (const auto& i: sa.range_iterator()) {
...
}

The range_iterator materializes the SArray if not already materialized, but materialize_to_callback can be used to read the SArray without materialization.

The gl_sarray can be constructed in a variety of ways:

Python Binding

When used as an input argument in an SDK function, it permits a Python SArray to be passed as an argument. When used in an output argument, it will return a Python SArray.

For instance:

//
// Compiled as example.so
//
gl_sarray add_one_to_array(gl_sarray data) {
return data + 1;
}
REGISTER_FUNCTION(add_one_to_array, "data");

Will allow this to be done in Python:

import turicreate as gl
import example
sa = gl.SArray([1,2,3,4,5])
ret = example.add_one_to_array(sa)
# ret is now [2,3,4,5,6]

Details

The gl_sarray is internally a reference object. i.e. in the code below, both a and b will point to the same underlying sarray. However since gl_sarray's are immutable, this does not introduce any interface quirks.

gl_sarray a{1,2,3};
gl_sarray b = a;

The gl_sarray is also lazily evaluated behind the scenes to minimize disk access. Thus, regardless of the size of the SArray or the complexity of the lambda operation, the operation below will run quickly.

b = (sa.apply(some_complicated_function) + 5) / 2;

This may have the unfortunate effect of hiding errors until materialization is forced to occur. i.e. it might be some time much later in your code that errors in some_complicated_function will trigger.

However, not all operations are lazy and certain operations will force materialization, and that is a constant target for optimization.

If you want to force materialization yourself, use materialize()

Definition at line 199 of file gl_sarray.hpp.

Constructor & Destructor Documentation

◆ gl_sarray() [1/3]

turi::gl_sarray::gl_sarray ( const std::string &  directory)
explicit

Constructs a gl_sarray from a binary SArray saved previously with save().

See also
save()

◆ gl_sarray() [2/3]

turi::gl_sarray::gl_sarray ( const std::vector< flexible_type > &  values,
flex_type_enum  dtype = flex_type_enum::UNDEFINED 
)

Constructs a gl_sarray from an in-memory vector of values.

std::vector<flexible_type> values{1,2,3,4,5};
// auto infers data type
gl_sarray sa_integers(values);
// resultant array is of floating point type.
// Automatic type casting is performed internally.
gl_sarray sa_floats(values, flex_type_enum::FLOAT);

◆ gl_sarray() [3/3]

turi::gl_sarray::gl_sarray ( const std::initializer_list< flexible_type > &  values)

Constructs a gl_sarray from an initializer list of values.

Type is automatically determined.

// creates an array of 5 integers
gl_sarray g({1,2,3,4,5});
// creates an array of 5 doubles
gl_sarray g({1.0,2.0,3.0,4.0,5.0});
// mixed (non-homogeneous) types. Most general type is selected.
// This will result in an array of strings.
gl_sarray g({1,2.0,"3"});

Member Function Documentation

◆ all()

bool turi::gl_sarray::all ( ) const

Return true if every element of the gl_sarray evaluates to true. For numeric gl_sarray objects zeros and missing values ("None") evaluate to false, while all non-zero, non-missing values evaluate to true. For string, list, and dictionary gl_sarray objects, empty values (zero length strings, lists or dictionaries) or missing values ("None") evaluate to false. All other values evaluate to true. Returns true on an empty gl_sarray.

Example:

std::cout << gl_sarray({1, FLEX_UNDEFINED}).all();
std::cout << gl_sarray({1, 0}).all();
std::cout << gl_sarray({1, 2}).all();
std::cout << gl_sarray({"hello", "world"}).all();
std::cout << gl_sarray({"hello", ""}).all();
std::cout << gl_sarray({}).all();

Produces output:

0
0
1
1
0
1
See also
any

◆ any()

bool turi::gl_sarray::any ( ) const

Return true if any element of the gl_sarray evaluates to true. For numeric gl_sarray objects any non-zero value evaluates to true. For string, list, and dictionary gl_sarray objects, any element of non-zero length evaluates to true. Returns false on an empty gl_sarray.

Example:

std::cout << gl_sarray({1, FLEX_UNDEFINED}).any();
std::cout << gl_sarray({1, 0}).any();
std::cout << gl_sarray({0, 0}).any();
std::cout << gl_sarray({"hello", "world"}).any();
std::cout << gl_sarray({"hello", ""}).any();
std::cout << gl_sarray({"", ""}).any();
std::cout << gl_sarray({}).any();

Produces output:

1
1
0
1
1
0
0
See also
all

◆ append()

gl_sarray turi::gl_sarray::append ( const gl_sarray &  other) const

Append a gl_sarray to the current gl_sarray. Returns a new gl_sarray with the rows from both gl_sarray objects. Both gl_sarray objects must be of the same type.

Parameters
other: Another gl_sarray whose rows are appended to the current gl_sarray.

Example:

auto sa = gl_sarray({1, 2, 3});
auto sa2 = gl_sarray({4, 5, 6});
std::cout << sa.append(sa2);

Produces output:

dtype: int
Rows: 6
[1, 2, 3, 4, 5, 6]
See also
gl_sframe::append

◆ apply()

gl_sarray turi::gl_sarray::apply ( std::function< flexible_type(const flexible_type &)>  fn,
flex_type_enum  dtype,
bool  skip_undefined = true 
) const

Transform each element of the gl_sarray by a given function. The result gl_sarray is of type "dtype". "fn" should be a function that returns exactly one value which can be cast into the type specified by "dtype".

Parameters
fn: The function to transform each element. Must return exactly one value which can be cast into the type specified by "dtype".
dtype: The data type of the new gl_sarray.
skip_undefined: Optional. If true, will not apply "fn" to any undefined values. Defaults to true.

Example:

auto sa = gl_sarray({1,2,3});
std::cout << sa.apply([](const flexible_type& x) { return x*2; },
flex_type_enum::INTEGER);

Produces output:

dtype: int
Rows: 3
[2, 4, 6]
See also
gl_sframe::apply

◆ astype()

gl_sarray turi::gl_sarray::astype ( flex_type_enum  dtype,
bool  undefined_on_failure = true 
) const

Create a new gl_sarray with all values cast to the given type. Throws an exception if the types are not castable to the given type.

Parameters
dtype: The type to cast the elements to in gl_sarray
undefined_on_failure: Optional. Defaults to True. If set to true, runtime cast failures will be emitted as missing values rather than failing.

Example:

auto sa = gl_sarray({"1","2","3","4"});
std::cout << sa.astype(flex_type_enum::INTEGER);

Produces output:

dtype: int
Rows: 4
[1, 2, 3, 4]

Given an SArray of strings that look like dicts, convert to a dictionary type:

auto sa = gl_sarray({"{1:2, 3:4}", "{a:b, c:d}"});
std::cout << sa.astype(flex_type_enum::DICT);

Produces output:

dtype: dict
Rows: 2
[{1: 2, 3: 4}, {'a': 'b', 'c': 'd'}]

◆ builtin_rolling_apply()

gl_sarray turi::gl_sarray::builtin_rolling_apply ( const std::string &  fn_name,
ssize_t  start,
ssize_t  end,
size_t  min_observations = size_t(-1) 
) const

Apply an aggregate function over a moving window.

Parameters
(this): The input SArray, i.e. the object the method is called on (expects to be materialized)
fn_name: String representation of the aggregation function to use. The mapping is the same string mapping used by the groupby aggregate function.
start: The start of the moving window relative to the current value being calculated, inclusive. For example, 2 values behind the current would be -2, and 0 indicates that the start of the window is the current value.
end: The end of the moving window relative to the current value being calculated, inclusive. Must be greater than or equal to start. For example, 0 would indicate that the current value is the end of the window, and 2 would indicate that the window ends at 2 data values after the current.
min_observations: The minimum allowed number of non-NULL values in the moving window for the emitted value to be non-NULL. size_t(-1) indicates that all values must be non-NULL.

Returns an SArray of the same length as the input, with a type that matches the type output by the aggregation function.

Throws an exception if:

  • end < start
  • The window size is excessively large (currently hardcoded to UINT_MAX).
  • The given function name corresponds to a function that will not operate on the data type of the input SArray.
  • The aggregation function returns more than one non-NULL type.

Example:

gl_sarray a{0,1,2,3,4,5,6,7,8,9};
// Moving window encompasses 3 values behind current and current value.
auto result = a.builtin_rolling_apply(std::string("__builtin__avg__"), -3, 0);

Produces an SArray with these values:

{NULL,NULL,NULL,1.5,2.5,3.5,4.5,5.5,6.5,7.5}

◆ clip()

gl_sarray turi::gl_sarray::clip ( flexible_type  lower = FLEX_UNDEFINED,
flexible_type  upper = FLEX_UNDEFINED 
) const

Create a new gl_sarray with each value clipped to be within the given bounds. In this case, "clipped" means that values below the lower bound will be set to the lower bound value. Values above the upper bound will be set to the upper bound value. This function can operate on gl_sarray objects of numeric type as well as array type, in which case each individual element in each array is clipped. By default "lower" and "upper" are set to FLEX_UNDEFINED, which indicates the respective bound should be ignored. The method fails if invoked on a gl_sarray of non-numeric type.

Parameters
lower: Optional. The lower bound used to clip. Ignored if equal to FLEX_UNDEFINED (the default).
upper: Optional. The upper bound used to clip. Ignored if equal to FLEX_UNDEFINED (the default).

Example:

auto sa = gl_sarray({1,2,3});
std::cout << sa.clip(2,2);

Produces output:

dtype: int
Rows: 3
[2, 2, 2]
See also
clip_lower
clip_upper

◆ clip_lower()

gl_sarray turi::gl_sarray::clip_lower ( flexible_type  threshold) const

Create new gl_sarray with all values clipped to the given lower bound. This function can operate on numeric arrays, as well as vector arrays, in which case each individual element in each vector is clipped. Throws an exception if the gl_sarray is empty or the types are non-numeric.

Parameters
threshold: The lower bound used to clip values.

Example:

auto sa = gl_sarray({1,2,3});
std::cout << sa.clip_lower(2);

Produces output:

dtype: int
Rows: 3
[2, 2, 3]
See also
clip
clip_upper

◆ clip_upper()

gl_sarray turi::gl_sarray::clip_upper ( flexible_type  threshold) const

Create new gl_sarray with all values clipped to the given upper bound. This function can operate on numeric arrays, as well as vector arrays, in which case each individual element in each vector is clipped.

Parameters
threshold: The upper bound used to clip values.

Example:

auto sa = gl_sarray({1,2,3});
std::cout << sa.clip_upper(2);

Produces output:

dtype: int
Rows: 3
[1, 2, 2]
See also
clip
clip_lower

◆ contains()

gl_sarray turi::gl_sarray::contains ( const flexible_type &  other) const

Performs an element-wise substring search of "other". The current array must contain strings and "other" must be a string. Produces a 1 for each row if "other" is a substring of the row and 0 otherwise.

◆ count_ngrams()

gl_sarray turi::gl_sarray::count_ngrams ( size_t  n = 2,
std::string  method = "word",
bool  to_lower = true,
bool  ignore_space = true 
) const

Return an SArray of dict type where each element contains the count for each of the n-grams that appear in the corresponding input element. The n-grams can be specified to be either character n-grams or word n-grams. The input SArray must contain strings.

Parameters
n: Optional. The number of words in each n-gram. An n value of 1 returns word counts. Defaults to 2.
method: Optional. Either "word" or "character". If “word”, the function performs a count of word n-grams. If “character”, does a character n-gram count. Defaults to "word".
to_lower: Optional. If true, all words are converted to lower case before counting. Defaults to true.
ignore_space: Optional. If method is “character”, indicates if spaces between words are counted as part of the n-gram. For instance, with the input SArray element of “fun games”, if this parameter is set to False one tri-gram would be ‘n g’. If ignore_space is set to True, there would be no such tri-gram (there would still be ‘nga’). This parameter has no effect if the method is set to “word”. Defaults to true.
gl_sarray sa({"I like big dogs. I LIKE BIG DOGS."});
gl_sarray ret = sa.count_ngrams(3);
// returns gl_sarray of dictionary type containing
// [{'big dogs i': 1, 'like big dogs': 2, 'dogs i like': 1, 'i like big': 2}]
gl_sarray sa({"Fun. Is. Fun"});
gl_sarray ret = sa.count_ngrams(3, "character");
// returns gl_sarray of dictionary type containing
// [{'fun': 2, 'nis': 1, 'sfu': 1, 'isf': 1, 'uni': 1}]

See also
count_words

◆ count_words()

gl_sarray turi::gl_sarray::count_words ( bool  to_lower = true,
turi::flex_list  delimiters = {"\r", "\v", "\n", "\f", "\t", " "} 
) const

Count words in the gl_sarray.

Parameters
to_lower: Optional. If True, all words are converted to lower case before counting.

Returns a gl_sarray of dictionary type where each element contains the word count for each word that appeared in the corresponding input element. The words are split on all whitespace and punctuation characters. Only works if this SArray is of string type.

gl_sarray sa({"The quick brown fox jumps.",
"Word word WORD, word!!!word"});
auto ret = sa.count_words();
// output array is of type flex_type_enum::DICT and contains
// [{'quick': 1, 'brown': 1, 'jumps': 1, 'fox': 1, 'the': 1}, {'word': 5}]
See also
count_ngrams

◆ cumulative_aggregate()

gl_sarray turi::gl_sarray::cumulative_aggregate ( std::shared_ptr< group_aggregate_value >  aggregator) const

An abstraction to perform cumulative aggregates. y <- x.cumulative_aggregate(f, w_0)

The abstraction is as follows: y[i+1], w[i+1] = func(x[i], w[i]) where w[i] is some arbitrary state.

Parameters
[in] aggregator: Builtin aggregate to use (e.g., sum, min, max, etc.)
Returns
SArray
gl_sarray sa{1, 2, 3, 4, 5};
sa.cumulative_aggregate(std::make_shared<groupby_operators::sum>());

produces an SArray that looks like the following: dtype: int [1, 3, 6, 10, 15]

◆ cumulative_sum()

gl_sarray turi::gl_sarray::cumulative_sum ( ) const

This returns an SArray where each element is a cumulative aggregate of all its previous elements. Only works in an SArray of numeric type or numeric-array types.

Returns
an SArray
gl_sarray sa{1, 2, 3, 4, 5};
auto ret = sa.cumulative_sum();

produces an SArray that looks like the following: dtype: int [1, 3, 6, 10, 15]

◆ datetime_to_str()

gl_sarray turi::gl_sarray::datetime_to_str ( const std::string &  str_format = "%Y-%m-%dT%H:%M:%S%ZP") const

Create a new gl_sarray with all the values cast to str. The string format is specified by the 'str_format' parameter.

Parameters
str_format: The format to output the string. Default format is "%Y-%m-%dT%H:%M:%S%ZP". See the strftime specification for details on the format string.

Example:

boost::posix_time::ptime t(boost::gregorian::date(2011, 1, 1));
boost::posix_time::ptime epoch(boost::gregorian::date(1970,1,1));
auto x = (t - epoch).total_seconds();
auto sa = gl_sarray({flex_date_time(x)});
std::cout << sa.datetime_to_str("%e %b %Y");

Produces output:

dtype: str
Rows: 1
[" 1 Jan 2011"]
See also
str_to_datetime

◆ dict_has_all_keys()

gl_sarray turi::gl_sarray::dict_has_all_keys ( const std::vector< flexible_type > &  keys) const

Create a boolean gl_sarray by checking the keys of a gl_sarray of dictionaries. An element of the output gl_sarray is True if the corresponding input element's dictionary has all of the given keys. Fails on gl_sarray objects whose data type is not "dict".

Parameters
keys: A list of key values to check each dictionary against.

Example:

auto sa = gl_sarray({flex_dict{{"this",1},{"is",5},{"dog",7}},
flex_dict{{"this", 2},{"are", 1},{"cat", 5}}});
std::cout << sa.dict_has_all_keys({"is", "this"});

Produces output:

dtype: int
Rows: 2
[1, 0]
See also
dict_has_any_keys

◆ dict_has_any_keys()

gl_sarray turi::gl_sarray::dict_has_any_keys ( const std::vector< flexible_type > &  keys) const

Create a boolean gl_sarray by checking the keys of a gl_sarray of dictionaries. An element of the output gl_sarray is True if the corresponding input element's dictionary has any of the given keys. Fails on gl_sarray objects whose data type is not "dict".

Parameters
keys: A list of key values to check each dictionary against.

Example:

auto sa = gl_sarray({flex_dict{{"this",1},{ "is",5},{ "dog",7}},
flex_dict{{"animal",1}},
flex_dict{{"this", 2},{ "are", 1},{ "cat", 5}}});
std::cout << sa.dict_has_any_keys({"is", "this", "are"});

Produces output:

dtype: int
Rows: 3
[1, 0, 1]
See also
dict_has_all_keys

◆ dict_keys()

gl_sarray turi::gl_sarray::dict_keys ( ) const

Create a gl_sarray that contains all the keys from each dictionary element as a list. Fails on gl_sarray objects whose data type is not "dict".

Example:

auto sa = gl_sarray({flex_dict{{"this",1},{ "is",5},{ "dog",7}},
flex_dict{{"this", 2},{ "are", 1},{ "cat", 5}}});
std::cout << sa.dict_keys();

Produces output:

dtype: list
Rows: 2
[['this', 'is', 'dog'], ['this', 'are', 'cat']]
See also
dict_values

◆ dict_trim_by_keys()

gl_sarray turi::gl_sarray::dict_trim_by_keys ( const std::vector< flexible_type > &  keys,
bool  exclude = true 
) const

Filter an SArray of dictionary type by the given keys. By default, all keys that are in the provided list in "keys" are excluded from the returned SArray.

Parameters
keys: A collection of keys to trim down the elements in the SArray.
exclude: Optional. If True, all keys that are in the input key list are removed. If False, only keys that are in the input key list are retained. Defaults to true.
gl_sarray sa({flex_dict{{"this",1}, {"is",1}, {"dog",2}},
flex_dict{{"this", 2}, {"are",2}, {"cat", 1}} });
gl_sarray ret = sa.dict_trim_by_keys({"this", "is", "and", "are"});
// returns an SArray of dictionary type containing
// [{'dog': 2}, {'cat': 1}]
See also
dict_trim_by_values

◆ dict_trim_by_values()

gl_sarray turi::gl_sarray::dict_trim_by_values ( const flexible_type &  lower = FLEX_UNDEFINED,
const flexible_type &  upper = FLEX_UNDEFINED 
) const

Filter dictionary values to a given range (inclusive). Trimming is only performed on values which can be compared to the bound values. Fails on SArrays whose data type is not dict.

Parameters
lower: Optional. The lowest dictionary value that would be retained in the result. If FLEX_UNDEFINED, lower bound is not applied. Defaults to FLEX_UNDEFINED.
upper: Optional. The highest dictionary value that would be retained in the result. If FLEX_UNDEFINED, upper bound is not applied. Defaults to FLEX_UNDEFINED.

Example:

auto sa = gl_sarray({flex_dict{{"this",1},{"is",5},{"dog",7}},
flex_dict{{"this", 2},{"are",1},{"cat", 5}} });
std::cout << sa.dict_trim_by_values(2,5);
std::cout << sa.dict_trim_by_values(FLEX_UNDEFINED, 5);

Produces output:

dtype: dict
Rows: 2
[{'is': 5}, {'this': 2, 'cat': 5}]
dtype: dict
Rows: 2
[{'this': 1, 'is': 5}, {'this': 2, 'are': 1, 'cat': 5}]
See also
dict_trim_by_keys

◆ dict_values()

gl_sarray turi::gl_sarray::dict_values ( ) const

Create a gl_sarray that contains all the values from each dictionary element as a list. Fails on gl_sarray objects whose data type is not "dict".

Example:

auto sa = gl_sarray({flex_dict{{"this",1},{"is",5},{"dog",7}},
flex_dict{{"this", 2},{"are", 1},{"cat", 5}}});
std::cout << sa.dict_values();

Produces output:

dtype: list
Rows: 2
[[1, 5, 7], [2, 1, 5]]
See also
dict_keys

◆ dropna()

gl_sarray turi::gl_sarray::dropna ( ) const

Create a new gl_sarray containing only the non-missing values of the gl_sarray. A missing value shows up in a gl_sarray as 'FLEX_UNDEFINED'. This will also drop NaN values.

◆ dtype()

flex_type_enum turi::gl_sarray::dtype ( ) const

Returns data type of the gl_sarray.

gl_sarray sa{1,2,3,4,5};
flex_type_enum dtype = sa.dtype(); // dtype is flex_type_enum::INTEGER
gl_sarray sa{"1","2","3","4","5"};
flex_type_enum dtype = sa.dtype(); // dtype is flex_type_enum::STRING

◆ empty()

bool turi::gl_sarray::empty ( ) const

True if size() == 0.

◆ fillna()

gl_sarray turi::gl_sarray::fillna ( flexible_type  value) const

Create new gl_sarray with all missing values (FLEX_UNDEFINED or NaN) filled in with the given value. The size of the new gl_sarray will be the same as the original gl_sarray. If the given value is not the same type as the values in the gl_sarray, "fillna" will attempt to convert the value to the original gl_sarray's type. If this fails, an error will be raised.

Parameters
value: The value used to replace all missing values

◆ filter()

gl_sarray turi::gl_sarray::filter ( std::function< bool(const flexible_type &)>  fn,
bool  skip_undefined = true 
) const

Filter this gl_sarray by a function. Returns a new gl_sarray filtered by this function. If "fn" evaluates an element to true, this element is copied to the new gl_sarray. If not, it isn't. Throws an exception if the return type of "fn" is not castable to a boolean value.

Parameters
fn: Function that filters the gl_sarray. Must evaluate to bool or int.
skip_undefined: Optional. If true, will not apply fn to any undefined values.

Example:

auto sa = gl_sarray({1,2,3});
std::cout << sa.filter([](flexible_type x){ return x < 3; });

Produces output:

dtype: int
Rows: 2
[1, 2]

This function is equivalent to the combination of a logical_filter and an apply.

res = sa[sa.apply(fn)];

◆ from_const()

static gl_sarray turi::gl_sarray::from_const ( const flexible_type &  value,
size_t  size 
)
static

Returns a gl_sarray of the given size in which every element is a constant value.

Parameters
value: The value to fill the array
size: The size of the array
// Construct an SArray consisting of 10 zeroes:
gl_sarray zeros = gl_sarray::from_const(0, 10);

◆ from_sequence()

static gl_sarray turi::gl_sarray::from_sequence ( size_t  start,
size_t  end,
bool  reverse = false 
)
static

Returns a gl_sarray of a sequence of integer values.

Parameters
start: The starting value
end: One past the last value
reverse: If the values are in reverse
// returns a sequence of values from 0 to 99
gl_sarray::from_sequence(0, 100);
// returns a sequence of values from 99 to 0
gl_sarray::from_sequence(0, 100, true);

◆ get_proxy()

virtual std::shared_ptr<unity_sarray> turi::gl_sarray::get_proxy ( ) const
virtual

Gets the internal implementation object.

Reimplemented in turi::const_gl_sarray_reference, and turi::gl_sarray_reference.

◆ hash()

gl_sarray turi::gl_sarray::hash ( size_t  seed = 0) const

Returns an SArray with a hash of each element. seed can be used to change the hash function to allow this method to be used for random number generation.

Parameters
seedDefaults to 0. Can be changed to different values to get different hash results.

Example:

std::cout << sa.hash(123);

Produces output:

dtype: int
Rows: 10
[-2176393851141330893, 7600995152976636137, -5571280844667425574, -4385410391720336496, -4446257658862464208, -7571182417602171808, 3644372782970789199, 3084542717492096231, 4758268028978242780, -6520852338875851008]

◆ head()

gl_sarray turi::gl_sarray::head ( size_t  n) const

Returns an gl_sarray which contains the first n rows of this gl_sarray.

Parameters
nThe number of rows to fetch.
gl_sarray sa({0,1,2,3,4,5,6,7,8,9});
auto ret = sa.head(5); // an array of values [0,1,2,3,4]

◆ is_materialized()

bool turi::gl_sarray::is_materialized ( ) const

Returns whether or not the sarray has been materialized.

See also
materialize

◆ item_length()

gl_sarray turi::gl_sarray::item_length ( ) const

Length of each element in the current gl_sarray. Only works on gl_sarray objects of dict, array, or list type. If a given element is a missing value, then the corresponding output element is also a missing value. This function is equivalent to the following:

sa_item_len = sa.apply([](const flexible_type& x) { return flexible_type(x.get_type() == flex_type_enum::UNDEFINED ? 0 : x.size()); });

Example:

auto sa = gl_sarray({flex_dict{{"is_restaurant", 1}, {"is_electronics", 0}},
flex_dict{{"is_restaurant", 1}, {"is_retail", 1}, {"is_electronics", 0}},
flex_dict{{"is_restaurant", 0}, {"is_retail", 1}, {"is_electronics", 0}},
flex_dict{{"is_restaurant", 0}},
flex_dict{{"is_restaurant", 1}, {"is_electronics", 1}},
FLEX_UNDEFINED});
std::cout << sa.item_length();

Produces output:

dtype: int
Rows: 6
[2, 3, 3, 1, 2, None]

◆ materialize()

void turi::gl_sarray::materialize ( ) const

For a gl_sarray that is lazily evaluated, force persist this sarray to disk, committing all lazily evaluated operations.

See also
is_materialized

◆ materialize_to_callback()

void turi::gl_sarray::materialize_to_callback ( std::function< bool(size_t, const std::shared_ptr< sframe_rows > &)>  callback,
size_t  nthreads = (size_t)(-1) 
)

Calls a callback function passing each row of the SArray.

This does not materialize the array if not necessary. The callback may be called in parallel, in which case the first argument provides a thread number. The function should return false, but may return true at any time to quit the iteration process. It may also throw exceptions, which will be forwarded to the caller of this function.

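Each call to the callback passes:

- a thread number (size_t), and
- a std::shared_ptr to an sframe_rows object containing rows of the array.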

The sframe_rows object looks like a vector<vector<flexible_type>>. i.e. to look at all the rows, you need to write:

sa.materialize_to_callback([&](size_t, const std::shared_ptr<sframe_rows>& rows) {
for(const auto& row: *rows) {
// each row looks like an std::vector<flexible_type>
// and can be cast to a vector<flexible_type> if necessary
// But since this is an sarray, the element you want is always in
// row[0]
}
return false; // keep iterating; return true to stop early
});
Parameters
callbackThe callback to call
nthreadsNumber of threads. If not specified, #cpus is used

◆ max()

flexible_type turi::gl_sarray::max ( ) const

Get maximum numeric value in gl_sarray. Returns FLEX_UNDEFINED on an empty gl_sarray. Raises an exception if called on an gl_sarray with non-numeric type.

Example:

std::cout << gl_sarray({14, 62, 83, 72, 77, 96, 5, 25, 69, 66}).max();

Produces output:

96
See also
min

◆ mean()

flexible_type turi::gl_sarray::mean ( ) const

Mean of all the values in the gl_sarray, or mean image. Returns FLEX_UNDEFINED on an empty gl_sarray. Raises an exception if called on an gl_sarray with non-numeric type or non-Image type.

◆ min()

flexible_type turi::gl_sarray::min ( ) const

Get minimum numeric value in gl_sarray. Returns FLEX_UNDEFINED on an empty gl_sarray. Raises an exception if called on an gl_sarray with non-numeric type.

Example:

std::cout << gl_sarray({14, 62, 83, 72, 77, 96, 5, 25, 69, 66}).min();

Produces output:

5
See also
max

◆ nnz()

size_t turi::gl_sarray::nnz ( ) const

Number of non-zero elements in the gl_sarray.

◆ num_missing()

size_t turi::gl_sarray::num_missing ( ) const

Number of missing elements in the gl_sarray.

◆ operator[]() [1/3]

flexible_type turi::gl_sarray::operator[] ( int64_t  i) const

Returns the value at a particular array index; generally inefficient.

This returns the value of the array at a particular index. Will raise an exception if the index is out of bounds. This operation is generally inefficient: the range_iterator() is preferred.

◆ operator[]() [2/3]

gl_sarray turi::gl_sarray::operator[] ( const gl_sarray &  slice) const

Performs a logical filter.

This function performs a logical filter: i.e. it subselects all the elements in this array where the corresponding value in the other array evaluates to true.

gl_sarray a{1,2,3,4,5};
auto ret = a[a > 1 && a <= 4];
// ret is now the array [2,3,4]

◆ operator[]() [3/3]

gl_sarray turi::gl_sarray::operator[] ( const std::initializer_list< int64_t > &  slice) const

Performs a slice Python style.

Parameters
sliceA list of 2 or 3 values. If 2 values, this is interpreted as {start, end} indices, with an implicit value of step = 1. If 3 values, this is interpreted as {start, step, end}. Values at the positions [start, start+step, start+2*step, ...] are returned until end (exclusive) is reached. Negative start and end values are interpreted as offsets from the end of the array.

Given a gl_sarray

gl_sarray a{1,2,3,4,5,6,7,8,9,10};

Slicing a consecutive range:

auto ret = a[{1,4}]; // start at index 1, end at index 4
// ret is the array [2,3,4]

Slicing a range with a step:

auto ret = a[{1,2,8}]; // start at index 1, end at index 8 with step size 2
// ret is the array [2,4,6,8]

Using negative indexing:

auto ret = a[{-3,-1}]; // start at end - 3, end at index end - 1
// ret is the array [8,9]

◆ pixel_array_to_image()

gl_sarray turi::gl_sarray::pixel_array_to_image ( size_t  width,
size_t  height,
size_t  channels = 3,
bool  undefined_on_failure = true 
) const

Create a new gl_sarray with all the values cast to turi::image_type of uniform size.

Parameters
widthint The width of the new images.
heightint The height of the new images.
channelsint. Number of channels of the new images.
undefined_on_failureoptional. defaults to true. If true, return FLEX_UNDEFINED type instead of Image type on failure. If false, raises error upon failure.
allow_roundingoptional. Default to false. If true, rounds non-integer values when converting to Image type. If false, raises error upon rounding.

◆ plot()

std::shared_ptr<visualization::Plot> turi::gl_sarray::plot ( const flexible_type title,
const flexible_type xlabel,
const flexible_type ylabel 
) const

Return a visualization of the SArray.

◆ range_iterator()

gl_sarray_range turi::gl_sarray::range_iterator ( size_t  start = 0,
size_t  end = (size_t)(-1) 
) const

Returns a one pass range object with begin() and end() iterators.

This will materialize the array.

See materialize_to_callback for a lazy version.

Parameters
startThe starting index of the range
endThe ending index of the range
// create a sequence of 1,000 integer values
gl_sarray sa = gl_sarray::from_sequence(0, 1000);
// get a range over the entire array
auto ra = sa.range_iterator();
auto iter = ra.begin();
while (iter != ra.end()) {
std::cout << *iter;
++iter;
}

Or more compactly with C++11 syntax:

for(const auto& val: sa.range_iterator()) {
std::cout << val << "\n";
}

The range returned only supports one pass. The outcome of a second call to begin() is undefined after any iterator is advanced.

See also
gl_sarray_range

◆ read_json()

static gl_sarray turi::gl_sarray::read_json ( const std::string &  url)
static

Constructs an SArray from a JSON record file.

A json record file contains an array of dictionaries. Resultant SArray is of dictionary type.

◆ sample() [1/2]

gl_sarray turi::gl_sarray::sample ( double  fraction) const

Create an gl_sarray which contains a subsample of the current gl_sarray.

Parameters
fractionThe fraction of the rows to fetch. Must be between 0 and 1.

Example:

auto sa = gl_sarray::from_sequence(0, 10);
std::cout << sa.sample(.3);

Produces output:

dtype: int
Rows: 3
[2, 6, 9]

◆ sample() [2/2]

gl_sarray turi::gl_sarray::sample ( double  fraction,
size_t  seed,
bool  exact = false 
) const

Create an gl_sarray which contains a subsample of the current gl_sarray.

Parameters
fractionThe fraction of the rows to fetch. Must be between 0 and 1.
seedThe random seed for the random number generator. Deterministic output is obtained if this is set to a constant.

Example:

auto sa = gl_sarray::from_sequence(0, 10);
std::cout << sa.sample(.3, 12345);

Produces output:

dtype: int
Rows: 4
[1,3,6,9]

◆ save()

void turi::gl_sarray::save ( const std::string &  directory,
const std::string &  format = "binary" 
) const

Saves the gl_sarray to file.

When format is "binary" (default), the saved SArray will be in a directory named by the directory parameter. When format is "text" or "csv", it is saved as a single human readable text file.

Parameters
directoryA local path or a remote URL. If format is 'text', it will be saved as a text file. If format is 'binary', a directory will be created at the location which will contain the SArray.
formatEither "binary", "text", "csv". Defaults to "binary". optional. Format in which to save the SArray. Binary saved SArrays can be loaded much faster and without any format conversion losses. 'text' and 'csv' are synonymous: Each SArray row will be written as a single line in an output text file. If not given, will try to infer the format from filename given. If file name ends with 'csv', 'txt' or '.csv.gz', then save as 'csv' format, otherwise save as 'binary' format.

◆ show()

void turi::gl_sarray::show ( const std::string &  path_to_client,
const flexible_type title,
const flexible_type xlabel,
const flexible_type ylabel 
) const

Show a visualization of the SArray.

◆ size()

size_t turi::gl_sarray::size ( ) const

The size of the SArray.

◆ sort()

gl_sarray turi::gl_sarray::sort ( bool  ascending = true) const

Sort all values in this gl_sarray. Sort only works for sarray of type str, int and float, otherwise TypeError will be raised. Creates a new, sorted gl_sarray.

Parameters
ascendingOptional. Defaults to True. If true, the sarray values are sorted in ascending order, otherwise, descending order.

Example:

auto sa = gl_sarray({3,2,1});
std::cout << sa.sort();

Produces output:

dtype: int
Rows: 3
[1, 2, 3]

◆ split_datetime()

gl_sframe turi::gl_sarray::split_datetime ( const std::string &  column_name_prefix = "X",
const std::vector< std::string > &  limit = {"year","month","day","hour","minute","second"},
bool  tzone = false 
) const

Splits a gl_sarray of datetime type into multiple columns, returning a new gl_sframe that contains the expanded columns. A gl_sarray of datetime will be split by default into a gl_sframe of 6 columns, one for each year/month/day/hour/minute/second element.

When splitting a gl_sarray of datetime type, new columns are named: prefix.year, prefix.month, etc. The prefix is set by the parameter "column_name_prefix" and defaults to 'X'. If column_name_prefix is FLEX_UNDEFINED or empty, then no prefix is used.

If the tzone parameter is true, then timezone information is represented as one additional column of type float that shows the offset from GMT (0.0) or from UTC.

Parameters
column_name_prefixOptional. If provided, expanded column names would start with the given prefix. Defaults to "X".
limitOptional. Limits the set of datetime elements to expand. Elements are 'year','month','day','hour','minute', and 'second'.
tzoneOptional. A boolean parameter that determines whether to show timezone column or not. Defaults to false.

Example:

auto sa = gl_sarray({"20-Oct-2011", "10-Jan-2012"});
auto date_sarray = sa.str_to_datetime("%d-%b-%Y");
auto split_sf = date_sarray.split_datetime("", {"day","year"});
std::cout << split_sf;

Produces output:

Columns:
day integer
year integer
+----------------+----------------+
| day | year |
+----------------+----------------+
| 20 | 2011 |
| 10 | 2012 |
+----------------+----------------+
[2 rows x 2 columns]

◆ std()

flexible_type turi::gl_sarray::std ( ) const

Standard deviation of all the values in the gl_sarray. Returns FLEX_UNDEFINED on an empty gl_sarray. Raises an exception if called on an gl_sarray with non-numeric type.

◆ str_to_datetime()

gl_sarray turi::gl_sarray::str_to_datetime ( const std::string &  str_format = "%Y-%m-%dT%H:%M:%S%ZP") const

Create a new gl_sarray with all the values cast to datetime. The string format is specified by the 'str_format' parameter.

Parameters
str_formatThe format to parse the string. Default format is "%Y-%m-%dT%H:%M:%S%ZP". See the strptime specification for details on the format string.

Example:

auto sa = gl_sarray({"20-Oct-2011 09:30:10 GMT-05:30"});
std::cout << sa.str_to_datetime("%d-%b-%Y %H:%M:%S %ZP");

Produces output:

dtype: datetime
Rows: 1
[20111020T093010]
See also
datetime_to_str

◆ subslice()

gl_sarray turi::gl_sarray::subslice ( flexible_type  start = FLEX_UNDEFINED,
flexible_type  stop = FLEX_UNDEFINED,
flexible_type  step = FLEX_UNDEFINED 
) const

This returns an SArray with each element sliced according to the slice specified.

Parameters
startThe start position of the slice
stopThe stop position of the slice
stepThe step size of the slice (default = 1)
Returns
an SArray with each individual vector/string/list sliced according to the arguments.

This is conceptually equivalent to the Python expression:

g.apply(lambda x: x[start:stop:step])

The SArray must be of type list, vector, or string.

For instance:

auto g = gl_sarray({"abcdef","qwerty"});
std::cout << g.subslice(0, 2);

Produces output:

dtype: str
Rows: 2
["ab", "qw"]

Negative indices:

std::cout << g.subslice(3,-1);

Produces output:

dtype: str
Rows: 2
["de", "rt"]

Arrays:

g = gl_sarray({flex_vec{1,2,3}, flex_vec{4,5,6}});
std::cout << g.subslice(0, 1);

Produces output:

dtype: array
Rows: 2
[[1], [4]]

◆ sum()

flexible_type turi::gl_sarray::sum ( ) const

Sum of all values in this gl_sarray.

Raises an exception if called on an gl_sarray of strings, lists, or dictionaries. If the gl_sarray contains numeric arrays (flex_vec) and all the arrays are the same length, the sum over all the arrays will be returned. Returns FLEX_UNDEFINED on an empty gl_sarray. For large values, this may overflow without warning.

◆ tail()

gl_sarray turi::gl_sarray::tail ( size_t  n) const

Returns an gl_sarray which contains the last n rows of this gl_sarray.

Parameters
nThe number of rows to fetch.
gl_sarray sa({0,1,2,3,4,5,6,7,8,9});
auto ret = sa.tail(5); // an array of values [5,6,7,8,9]

◆ topk_index()

gl_sarray turi::gl_sarray::topk_index ( size_t  topk = 10,
bool  reverse = false 
) const

Create an gl_sarray indicating which elements are in the top k. Entries are '1' if the corresponding element in the current gl_sarray is a part of the top k elements, and '0' if that corresponding element is not. Order is descending by default.

Parameters
topkOptional. Defaults to 10. The number of elements to determine if 'top'
reverseOptional. Defaults to false. If true, return the topk elements in ascending order

◆ unique()

gl_sarray turi::gl_sarray::unique ( ) const

Get all unique values in the current gl_sarray. Raises an error if the gl_sarray is of dictionary type. Will not necessarily preserve the order of the given gl_sarray in the new gl_sarray.

See also
gl_sframe::unique

◆ unpack()

gl_sframe turi::gl_sarray::unpack ( const std::string &  column_name_prefix = "X",
const std::vector< flex_type_enum > &  column_types = std::vector< flex_type_enum >(),
const flexible_type na_value = FLEX_UNDEFINED,
const std::vector< flexible_type > &  limit = std::vector< flexible_type >() 
) const

Convert an gl_sarray of list, array, or dict type to an gl_sframe with multiple columns.

"unpack" expands an gl_sarray using the values of each vector/list/dict as elements in a new gl_sframe of multiple columns. For example, an gl_sarray of lists each of length 4 will be expanded into an gl_sframe of 4 columns, one for each list element. An gl_sarray of lists/arrays of varying size will be expanded to a number of columns equal to the longest list/array. An gl_sarray of dictionaries will be expanded into as many columns as there are keys.

When unpacking an gl_sarray of list or vector type, new columns are named: "column_name_prefix".0, "column_name_prefix".1, etc. If unpacking a column of dict type, unpacked columns are named "column_name_prefix".key1, "column_name_prefix".key2, etc.

When unpacking an gl_sarray of list or dictionary types, missing values in the original element remain as missing values in the resultant columns. If the "na_value" parameter is specified, all values equal to this given value are also replaced with missing values. In an gl_sarray of vector type, NaN is interpreted as a missing value.

gl_sframe::pack_columns() is the reverse effect of unpack.

Parameters
column_name_prefixOptional. If provided, unpacked column names would start with the given prefix. Defaults to "X". If the empty string is used, no prefix is used.
column_typesOptional. Column types for the unpacked columns. If not provided, column types are automatically inferred from first 100 rows. Defaults to an empty vector.
na_valueOptional. Convert all values that are equal to "na_value" to missing value if specified.
limitoptional limits in the set of list/vector/dict keys to unpack. For list/vector gl_sarrays, "limit" must contain integer indices. For dict gl_sarrays, "limit" must contain dictionary keys.

Example:

auto sa = gl_sarray({flex_dict{{"word", "a"},{"count", 1}},
flex_dict{{"word", "cat"},{"count", 2}},
flex_dict{{"word", "is"},{"count", 3}},
flex_dict{{"word", "coming"},{"count", 4}}});
std::cout << sa.unpack("");

Produces output:

Columns:
count int
word str
Rows: 4
Data:
+-------+--------+
| count | word |
+-------+--------+
| 1 | a |
| 2 | cat |
| 3 | is |
| 4 | coming |
+-------+--------+
[4 rows x 2 columns]

Unpack only the key "word":

std::cout << sa.unpack("X", {}, FLEX_UNDEFINED, {"word"});

Produces output:

Columns:
X.word str
Rows: 4
Data:
+--------+
| X.word |
+--------+
| a |
| cat |
| is |
| coming |
+--------+
[4 rows x 1 columns]

Convert all zeros to missing values:

auto sa2 = gl_sarray({flex_vec{1, 0, 1},
flex_vec{1, 1, 1},
flex_vec{0, 1}});
std::cout << sa2.unpack("X", {flex_type_enum::INTEGER,
flex_type_enum::INTEGER,
flex_type_enum::INTEGER}, 0);

Produces output:

Columns:
X.0 int
X.1 int
X.2 int
Rows: 3
Data:
+------+------+------+
| X.0 | X.1 | X.2 |
+------+------+------+
| 1 | None | 1 |
| 1 | 1 | 1 |
| None | 1 | None |
+------+------+------+
[3 rows x 3 columns]

The documentation for this class was generated from the following file: