Turi Create  4.0
turi::gl_gframe Class Reference

#include <core/data/sframe/gl_gframe.hpp>

Public Member Functions

size_t size () const override
 
size_t num_columns () const override
 
std::vector< std::string > column_names () const override
 
std::vector< flex_type_enum > column_types () const override
 
void add_column (const flexible_type &data, const std::string &name) override
 
void add_column (const gl_sarray &data, const std::string &name) override
 
void add_columns (const gl_sframe &data) override
 
void remove_column (const std::string &name) override
 
void rename (const std::map< std::string, std::string > &old_to_new_names) override
 
void swap_columns (const std::string &column_1, const std::string &column_2) override
 
void construct_from_csvs (std::string csv_file, csv_parsing_config_map csv_config, str_flex_type_map column_type_hints)
 
void show (const std::string &path_to_client) const
 
std::shared_ptr< model_base > plot () const
 
gl_sframe operator[] (const gl_sarray &logical_filter) const
 
void materialize_to_callback (std::function< bool(size_t, const std::shared_ptr< sframe_rows > &)> callback, size_t nthreads=(size_t)(-1))
 
gl_sframe_range range_iterator (size_t start=0, size_t end=(size_t)(-1)) const
 
bool empty () const
 
bool is_materialized () const
 
bool has_size () const
 
void materialize ()
 
void save (const std::string &path, const std::string &format="") const
 
void save_reference (const std::string &path) const
 
bool contains_column (const std::string &col_name) const
 
gl_sframe head (size_t n) const
 
gl_sframe tail (size_t n) const
 
gl_sarray apply (std::function< flexible_type(const sframe_rows::row &)> fn, flex_type_enum dtype) const
 
gl_sframe sample (double fraction) const
 
gl_sframe sample (double fraction, size_t seed, bool exact=false) const
 
std::pair< gl_sframe, gl_sframe > random_split (double fraction) const
 
std::pair< gl_sframe, gl_sframe > random_split (double fraction, size_t seed, bool exact=false) const
 
gl_sframe topk (const std::string &column_name, size_t k=10, bool reverse=false) const
 
size_t column_index (const std::string &column_name) const
 
const std::string & column_name (size_t index) const
 
gl_sarray select_column (const std::string &colname) const
 
gl_sframe select_columns (const std::vector< std::string > &colnames) const
 
virtual void replace_add_column (const gl_sarray &data, const std::string &name="")
 
gl_sframe append (const gl_sframe &other) const
 
gl_sframe groupby (const std::vector< std::string > &groupkeys, const std::map< std::string, aggregate::groupby_descriptor_type > &operators=std::map< std::string, aggregate::groupby_descriptor_type >()) const
 
gl_sframe join (const gl_sframe &right, const std::vector< std::string > &joinkeys, const std::string &how="inner") const
 
gl_sframe join (const gl_sframe &right, const std::map< std::string, std::string > &joinkeys, const std::string &how="inner") const
 
gl_sframe filter_by (const gl_sarray &values, const std::string &column_name, bool exclude=false) const
 
gl_sframe pack_columns (const std::vector< std::string > &columns, const std::string &new_column_name, flex_type_enum dtype=flex_type_enum::LIST, flexible_type fill_na=FLEX_UNDEFINED) const
 
gl_sframe pack_columns (const std::string &column_prefix, const std::string &new_column_name, flex_type_enum dtype=flex_type_enum::LIST, flexible_type fill_na=FLEX_UNDEFINED) const
 
gl_sframe split_datetime (const std::string &expand_column, const std::string &column_name_prefix="X", const std::vector< std::string > &limit=std::vector< std::string >(), bool tzone=false) const
 
gl_sframe unpack (const std::string &unpack_column, const std::string &column_name_prefix="X", const std::vector< flex_type_enum > &column_types=std::vector< flex_type_enum >(), const flexible_type &na_value=FLEX_UNDEFINED, const std::vector< flexible_type > &limit=std::vector< flexible_type >()) const
 
gl_sframe stack (const std::string &column_name, const std::string &new_column_names, bool drop_na=false) const
 
gl_sframe stack (const std::string &column_name, const std::vector< std::string > &new_column_names, bool drop_na=false) const
 
gl_sframe unstack (const std::string &columns, const std::string &new_column_name="") const
 
gl_sframe unstack (const std::vector< std::string > &columns, const std::string &new_column_name="") const
 
gl_sframe unique () const
 
gl_sframe sort (const std::string &column, bool ascending=true) const
 
gl_sframe sort (const std::vector< std::string > &columns, bool ascending=true) const
 
gl_sframe sort (const std::initializer_list< std::string > &columns, bool ascending=true) const
 
gl_sframe sort (const std::vector< std::pair< std::string, bool >> &column_and_ascending) const
 
gl_sframe dropna (const std::vector< std::string > &columns=std::vector< std::string >(), std::string how="any", bool recursive=false) const
 
std::pair< gl_sframe, gl_sframe > dropna_split (const std::vector< std::string > &columns=std::vector< std::string >(), std::string how="any", bool recursive=false) const
 
gl_sframe fillna (const std::string &column, flexible_type value) const
 
gl_sframe add_row_number (const std::string &column_name="id", size_t start=0) const
 
std::vector< flexible_type > operator[] (int64_t i)
 
std::vector< flexible_type > operator[] (int64_t i) const
 
gl_sframe operator[] (const std::initializer_list< int64_t > &slice)
 
gl_sframe operator[] (const std::initializer_list< int64_t > &slice) const
 
Column Indexing

Selects a single column of the SFrame.

This returns an internal array reference object that can be used exactly like a gl_sarray. The design is quite similar to the reference object used by std::vector<bool> for indexing.

For instance:

gl_sframe sf{{"a", {1,2,3,4,5}},
{"b", {"1","2","3","4","5"}},
{"c", {1.0,2.0,3.0,4.0,5.0}}};
gl_sarray t = sf["a"]; // takes out column "a"

However, this operator can also be used for modifying existing columns, or creating new columns. For instance:

gl_sframe sf{{"a", {1,2,3,4,5}},
{"b", {"1","2","3","4","5"}},
{"c", {1.0,2.0,3.0,4.0,5.0}}};
sf["a"] = sf["a"] + 1; // sf["a"] is now {2,3,4,5,6}
sf["d"] = sf["c"] - 1; // sf["d"] is now {0.0,1.0,2.0,3.0,4.0}

Entire constant columns can also be created the same way:

gl_sframe sf{{"a", {1,2,3,4,5}},
{"b", {"1","2","3","4","5"}},
{"c", {1.0,2.0,3.0,4.0,5.0}}};
sf["ones"] = 1;

Since the returned object is meant to be a short-lived reference, the following is not permitted:

gl_sframe sf{{"a", {1,2,3,4,5}},
{"b", {"1","2","3","4","5"}},
{"c", {1.0,2.0,3.0,4.0,5.0}}};
auto a_col = sf["a"];

since "auto" resolves to gl_sarray_reference, which is intentionally not copy-constructible.

For functional alternatives, see replace_add_column and the add_column overloads, e.g. add_column(const gl_sarray&, const std::string&).

const_gl_sarray_reference operator[] (const std::string &column) const
 
gl_sarray_reference operator[] (const std::string &column)
 
Multi-Column Indexing

Subselects a subset of columns, returning an SFrame containing only those columns.

gl_sframe sf{{"a", {1,2,3,4,5}},
{"b", {"1","2","3","4","5"}},
{"c", {1.0,2.0,3.0,4.0,5.0}}};
gl_sframe ret = sf[{"a", "b"}];
// ret has 2 columns "a" and "b"
gl_sframe operator[] (const std::vector< std::string > &columns) const
 
gl_sframe operator[] (const std::initializer_list< std::string > &columns)
 
gl_sframe operator[] (const std::initializer_list< std::string > &columns) const
 

Detailed Description

A proxy for the gl_sframe for the vertex and edge data of the SGraph.

Definition at line 24 of file gl_gframe.hpp.

Member Function Documentation

◆ add_column() [1/2]

void turi::gl_gframe::add_column ( const flexible_type &  data,
const std::string &  name 
)
override virtual

Add a new column with constant value. If type is VERTEX_GFRAME, the column is added as a new vertex field (or edge field if type is EDGE_GFRAME) in the gl_sgraph.

Parameters
data: the constant value to fill the column
name: the name of the new column
See also
gl_sgraph::add_vertex_field(const flexible_type&, const std::string&)
gl_sgraph::add_edge_field(const flexible_type&, const std::string&)

Reimplemented from turi::gl_sframe.

◆ add_column() [2/2]

void turi::gl_gframe::add_column ( const gl_sarray &  data,
const std::string &  name 
)
override virtual

Add a new column with given column name and data. If type is VERTEX_GFRAME, the column is added as a new vertex field (or edge field if type is EDGE_GFRAME) in the gl_sgraph.

Parameters
data: the data of the new column
name: the name of the new column
See also
gl_sgraph::add_vertex_field(const gl_sarray&, const std::string&)
gl_sgraph::add_edge_field(const gl_sarray&, const std::string&)

Reimplemented from turi::gl_sframe.

◆ add_columns()

void turi::gl_gframe::add_columns ( const gl_sframe &  data)
override virtual

Batch version of add_column.

Parameters
data: a map from column name to column data

Reimplemented from turi::gl_sframe.

◆ add_row_number()

gl_sframe turi::gl_sframe::add_row_number ( const std::string &  column_name = "id",
size_t  start = 0 
) const
inherited

Returns a new gl_sframe with a new column that numbers each row sequentially. By default the count starts at 0, but this can be changed to a positive or negative number. The new column will be named with the given column name. An error will be raised if the given column name already exists in the gl_sframe.

Parameters
column_name: Optional. The name of the new column that will hold the row numbers.
start: Optional. The number used to start the row number count.

Example:

gl_sframe sf{{"a", {1, FLEX_UNDEFINED, FLEX_UNDEFINED}},
{"b", {"a", "b", FLEX_UNDEFINED}}};
std::cout << sf.add_row_number() << std::endl;

Produces output:

+----+------+------+
| id | a | b |
+----+------+------+
| 0 | 1 | a |
| 1 | None | b |
| 2 | None | None |
+----+------+------+
[3 rows x 3 columns]

◆ append()

gl_sframe turi::gl_sframe::append ( const gl_sframe &  other) const
inherited

Add the rows of a gl_sframe to the end of this gl_sframe. Both gl_sframe objects must have the same set of columns with the same column names and column types.

Parameters
other: Another gl_sframe whose rows are appended to the current gl_sframe.

Example:

auto sf = gl_sframe({{"id", {4, 6, 8}},
{"val", {"D", "F", "H"}}});
auto sf2 = gl_sframe({{"id", {1, 2, 3}},
{"val", {"A", "B", "C"}}});
sf = sf.append(sf2);
std::cout << sf;

Produces output:

+----+-----+
| id | val |
+----+-----+
| 4 | D |
| 6 | F |
| 8 | H |
| 1 | A |
| 2 | B |
| 3 | C |
+----+-----+
[6 rows x 2 columns]

◆ apply()

gl_sarray turi::gl_sframe::apply ( std::function< flexible_type(const sframe_rows::row &)>  fn,
flex_type_enum  dtype 
) const
inherited

Maps each row of the gl_sframe by a given function to a single value. The result gl_sarray is of type "dtype". "fn" should be a function that returns exactly one value which can be cast into the type specified by "dtype".

Parameters
fn: The function to transform each element. Must return exactly one value which can be cast into the type specified by "dtype".
dtype: The data type of the new gl_sarray.

Example:

gl_sframe sf{{"a", {1,2,3,4,5}},
{"c", {1.0,2.0,3.0,4.0,5.0}}};
std::cout << sf.apply([](const sframe_rows::row& x) {
return x[0] * x[1];
}, flex_type_enum::FLOAT) << std::endl;

Produces output:

dtype: float
Rows: 5
[1.0, 4.0, 9.0, 16.0, 25.0]
See also
gl_sarray::apply

◆ column_index()

size_t turi::gl_sframe::column_index ( const std::string &  column_name) const
inherited

Returns the index of column column_name.

◆ column_name()

const std::string& turi::gl_sframe::column_name ( size_t  index) const
inherited

Returns the name of column index.

◆ column_names()

std::vector<std::string> turi::gl_gframe::column_names ( ) const
override virtual

Returns a list of column names. If type is VERTEX_GFRAME, the value is also the names of the vertex fields (or edge fields if type is EDGE_GFRAME) in the gl_sgraph.

See also
gl_sgraph::get_vertex_fields
gl_sgraph::get_edge_fields

Reimplemented from turi::gl_sframe.

◆ column_types()

std::vector<flex_type_enum> turi::gl_gframe::column_types ( ) const
override virtual

Returns a list of column types. If type is VERTEX_GFRAME, the value is also the types of the vertex fields (or edge fields if type is EDGE_GFRAME) in the gl_sgraph.

See also
gl_sgraph::get_vertex_field_types
gl_sgraph::get_edge_field_types

Reimplemented from turi::gl_sframe.

◆ construct_from_csvs()

void turi::gl_sframe::construct_from_csvs ( std::string  csv_file,
csv_parsing_config_map  csv_config,
str_flex_type_map  column_type_hints 
)
inherited

Constructs a gl_sframe from a csv file

◆ contains_column()

bool turi::gl_sframe::contains_column ( const std::string &  col_name) const
inherited

Returns true if the column is present in the sframe, and false otherwise.

◆ dropna()

gl_sframe turi::gl_sframe::dropna ( const std::vector< std::string > &  columns = std::vector< std::string >(),
std::string  how = "any",
bool  recursive = false 
) const
inherited

Remove missing values from a gl_sframe. A missing value is either "FLEX_UNDEFINED" or "NaN". If "how" is "any", a row will be removed if any of the columns in the "columns" parameter contains at least one missing value. If "how" is "all", a row will be removed if all of the columns in the "columns" parameter are missing values. If the "columns" parameter is not specified, the default is to consider all columns when searching for missing values.

Parameters
columns: Optional. The columns to use when looking for missing values. By default, all columns are used.
how: Optional. Specifies whether a row should be dropped if at least one column has missing values, or if all columns have missing values. "any" is default.

For instance

gl_sframe sf{{"a", {1, FLEX_UNDEFINED, FLEX_UNDEFINED}},
{"b", {"a", "b", FLEX_UNDEFINED}}};
std::cout << sf.dropna() << std::endl;

Produces output:

+---+---+
| a | b |
+---+---+
| 1 | a |
+---+---+
[1 rows x 2 columns]
// Drop when all values are missing.
std::cout << sf.dropna({}, "all") << std::endl;

Produces output:

+------+---+
| a | b |
+------+---+
| 1 | a |
| None | b |
+------+---+
[2 rows x 2 columns]

Example:

// Drop rows where column "a" has a missing value.
std::cout << sf.dropna({"a"}) << std::endl;

Produces output:

+---+---+
| a | b |
+---+---+
| 1 | a |
+---+---+
[1 rows x 2 columns]
See also
dropna_split

◆ dropna_split()

std::pair<gl_sframe, gl_sframe> turi::gl_sframe::dropna_split ( const std::vector< std::string > &  columns = std::vector< std::string >(),
std::string  how = "any",
bool  recursive = false 
) const
inherited

Split rows with missing values from this gl_sframe. This function has the same functionality as dropna, but returns a tuple of two gl_sframe objects. The first item is the expected output from dropna, and the second item contains all the rows filtered out by the "dropna" algorithm.

Parameters
columns: Optional. The columns to use when looking for missing values. By default, all columns are used.
how: Optional. Specifies whether a row should be dropped if at least one column has missing values, or if all columns have missing values. "any" is default.
recursive: Optional. If true, recursively checks whether a cell contains nan. This is handy for nested data structures such as lists and dictionaries. For instance, {{FLEX_UNDEFINED, 1}, {1}} will be treated as nan and will be removed if recursive is set to true. Otherwise it won't be treated as a nan-value.

Example:

gl_sframe sf{{"a", {1, FLEX_UNDEFINED, FLEX_UNDEFINED}},
{"b", {"a", "b", FLEX_UNDEFINED}}};
gl_sframe good, bad;
std::tie(good, bad) = sf.dropna_split();
std::cout << good << std::endl;

Produces output:

+---+---+
| a | b |
+---+---+
| 1 | a |
+---+---+
[1 rows x 2 columns]

Example:

std::cout << bad << std::endl;

Produces output:

+------+------+
| a | b |
+------+------+
| None | b |
| None | None |
+------+------+
[2 rows x 2 columns]
See also
dropna

◆ empty()

bool turi::gl_sframe::empty ( ) const
inherited

True if size() == 0.

◆ fillna()

gl_sframe turi::gl_sframe::fillna ( const std::string &  column,
flexible_type  value 
) const
inherited

Fill all missing values with a given value in a given column. If the "value" is not the same type as the values in "column", this method attempts to convert the value to the original column's type. If this fails, an error is raised.

Parameters
column: The name of the column to modify.
value: The value used to replace all missing values.
recursive: Sets the manner of nan-value checking. If this value is true, a cell will be treated as a missing value iff it contains nan. For instance, {{FLEX_UNDEFINED, 1}, {0}} and {FLEX_UNDEFINED, 1} will both be treated as nan-values.

Example:

gl_sframe sf{{"a", {1, FLEX_UNDEFINED, FLEX_UNDEFINED}},
{"b", {"13.1", "17.2", FLEX_UNDEFINED}}};
sf = sf.fillna("a", 0);
std::cout << sf << std::endl;

Produces output:

+---+------+
| a | b |
+---+------+
| 1 | 13.1 |
| 0 | 17.2 |
| 0 | None |
+---+------+
[3 rows x 2 columns]
See also
dropna

◆ filter_by()

gl_sframe turi::gl_sframe::filter_by ( const gl_sarray &  values,
const std::string &  column_name,
bool  exclude = false 
) const
inherited

Filter a gl_sframe by values inside an iterable object. Result is a gl_sframe that only includes (or excludes) the rows that have a column with the given "column_name" which holds one of the values in the given "values" gl_sarray.

Parameters
values: The values to use to filter the gl_sframe. The resulting gl_sframe will only include rows that have one of these values in the given column.
column_name: The column of the gl_sframe to match with the given "values".
exclude: Optional. Defaults to false. If true, the result gl_sframe will contain all rows except those that have one of "values" in "column_name".

Example:

auto sf = gl_sframe({{"id", {1, 2, 3, 4}},
{"animal_type", {"dog", "cat", "cow", "horse"}},
{"name", {"bob", "jim", "jimbob", "bobjim"}}});
auto household_pets = {"cat", "hamster", "dog", "fish", "bird", "snake"};
std::cout << sf.filter_by(household_pets, "animal_type");
std::cout << sf.filter_by(household_pets, "animal_type", true);

Produces output:

+-------------+----+------+
| animal_type | id | name |
+-------------+----+------+
| dog | 1 | bob |
| cat | 2 | jim |
+-------------+----+------+
[2 rows x 3 columns]
+-------------+----+--------+
| animal_type | id | name |
+-------------+----+--------+
| horse | 4 | bobjim |
| cow | 3 | jimbob |
+-------------+----+--------+
[2 rows x 3 columns]

◆ groupby()

gl_sframe turi::gl_sframe::groupby ( const std::vector< std::string > &  groupkeys,
const std::map< std::string, aggregate::groupby_descriptor_type > &  operators = std::map< std::string, aggregate::groupby_descriptor_type >() 
) const
inherited

Perform a group on the groupkeys columns followed by aggregations on the columns listed in operators. The operators parameter is a map that indicates which aggregation operators to use and which columns to use them on. The available operators are SUM, MAX, MIN, COUNT, AVG, VAR, STDV, CONCAT, SELECT_ONE, ARGMIN, ARGMAX, and QUANTILE. For convenience, aggregators MEAN, STD, and VARIANCE are available as synonyms for AVG, STDV, and VAR. See turi::aggregate for more detail on the aggregators.

Parameters
groupkeys: Columns to group on. Type of key columns can be of any type other than dictionary.
operators: Map of columns and aggregation operations. Each key is an output column name and each value is an aggregator.

Suppose we have an SFrame (sf) with movie ratings by many users.

+---------+----------+--------+
| user_id | movie_id | rating |
+---------+----------+--------+
| 25904 | 1663 | 3 |
| 25907 | 1663 | 3 |
| 25923 | 1663 | 3 |
| 25924 | 1663 | 3 |
| 25928 | 1663 | 2 |
| 25933 | 1663 | 4 |
| 25934 | 1663 | 4 |
| 25935 | 1663 | 4 |
| 25936 | 1663 | 5 |
| 25937 | 1663 | 2 |
| ... | ... | ... |
+---------+----------+--------+
[10000 rows x 3 columns]

Compute the number of occurrences of each user.

auto user_count = sf.groupby({"user_id"},
{{"count", aggregate::COUNT()}});
std::cout << user_count;
+---------+-------+
| user_id | count |
+---------+-------+
| 62361 | 1 |
| 30727 | 1 |
| 40111 | 1 |
| 50513 | 1 |
| 35140 | 1 |
| 42352 | 1 |
| 29667 | 1 |
| 46242 | 1 |
| 58310 | 1 |
| 64614 | 1 |
| ... | ... |
+---------+-------+
[9852 rows x 2 columns]

Compute the mean and standard deviation of ratings per user.

auto user_rating_stats = sf.groupby({"user_id"},
{{ "mean_rating", aggregate::MEAN("rating")},
{"std_rating", aggregate::STD("rating")}});
std::cout << user_rating_stats;
+---------+-------------+------------+
| user_id | mean_rating | std_rating |
+---------+-------------+------------+
| 62361 | 5.0 | 0.0 |
| 30727 | 4.0 | 0.0 |
| 40111 | 2.0 | 0.0 |
| 50513 | 4.0 | 0.0 |
| 35140 | 4.0 | 0.0 |
| 42352 | 5.0 | 0.0 |
| 29667 | 4.0 | 0.0 |
| 46242 | 5.0 | 0.0 |
| 58310 | 2.0 | 0.0 |
| 64614 | 2.0 | 0.0 |
| ... | ... | ... |
+---------+-------------+------------+
[9852 rows x 3 columns]

Compute the movie with the minimum rating per user.

auto chosen_movies = sf.groupby({"user_id"},
{{ "worst_movies", aggregate::ARGMIN("rating","movie_id")}});
std::cout << chosen_movies;
+---------+-------------+
| user_id | worst_movies |
+---------+-------------+
| 62361 | 1663 |
| 30727 | 1663 |
| 40111 | 1663 |
| 50513 | 1663 |
| 35140 | 1663 |
| 42352 | 1663 |
| 29667 | 1663 |
| 46242 | 1663 |
| 58310 | 1663 |
| 64614 | 1663 |
| ... | ... |
+---------+-------------+
[9852 rows x 2 columns]

Compute the count, mean, and standard deviation of ratings per (user, time), automatically assigning output column names.

// make up some time column which is a combination of user id and movie id
sf["time"] = sf.apply([](const sframe_rows::row& x) {
return (x[0] + x[1]) % 11 + 2000;
}, flex_type_enum::INTEGER);
auto user_rating_stats = sf.groupby({"user_id", "time"},
{{"Count", aggregate::COUNT()},
{"Avg of rating", aggregate::AVG("rating")},
{"Stdv of rating", aggregate::STDV("rating")}});
std::cout << user_rating_stats;
+------+---------+-------+---------------+----------------+
| time | user_id | Count | Avg of rating | Stdv of rating |
+------+---------+-------+---------------+----------------+
| 2006 | 61285 | 1 | 4.0 | 0.0 |
| 2000 | 36078 | 1 | 4.0 | 0.0 |
| 2003 | 47158 | 1 | 3.0 | 0.0 |
| 2007 | 34446 | 1 | 3.0 | 0.0 |
| 2010 | 47990 | 1 | 3.0 | 0.0 |
| 2003 | 42120 | 1 | 5.0 | 0.0 |
| 2007 | 44940 | 1 | 4.0 | 0.0 |
| 2008 | 58240 | 1 | 4.0 | 0.0 |
| 2002 | 102 | 1 | 1.0 | 0.0 |
| 2009 | 52708 | 1 | 3.0 | 0.0 |
| ... | ... | ... | ... | ... |
+------+---------+-------+---------------+----------------+
[10000 rows x 5 columns]

The groupby function can take a variable length list of aggregation specifiers so if we want the count and the 0.25 and 0.75 quantiles of ratings:

auto user_rating_stats = sf.groupby({"user_id", "time"},
{{"Count", aggregate::COUNT()},
{"rating_quantiles", aggregate::QUANTILE("rating",{0.25, 0.75}) }});
std::cout << user_rating_stats;
+------+---------+-------+------------------------+
| time | user_id | Count | rating_quantiles |
+------+---------+-------+------------------------+
| 2006 | 61285 | 1 | [4.0, 4.0] |
| 2000 | 36078 | 1 | [4.0, 4.0] |
| 2003 | 47158 | 1 | [3.0, 3.0] |
| 2007 | 34446 | 1 | [3.0, 3.0] |
| 2010 | 47990 | 1 | [3.0, 3.0] |
| 2003 | 42120 | 1 | [5.0, 5.0] |
| 2007 | 44940 | 1 | [4.0, 4.0] |
| 2008 | 58240 | 1 | [4.0, 4.0] |
| 2002 | 102 | 1 | [1.0, 1.0] |
| 2009 | 52708 | 1 | [3.0, 3.0] |
| ... | ... | ... | ... |
+------+---------+-------+------------------------+
[10000 rows x 4 columns]

To put all items a user rated into one list value by their star rating:

auto user_rating_stats = sf.groupby({"user_id", "rating"},
{{"rated_movie_ids",aggregate::CONCAT("movie_id")}});
std::cout << user_rating_stats;
+--------+---------+----------------------+
| rating | user_id | rated_movie_ids |
+--------+---------+----------------------+
| 3 | 31434 | array("d", [1663.0]) |
| 5 | 25944 | array("d", [1663.0]) |
| 4 | 38827 | array("d", [1663.0]) |
| 4 | 51437 | array("d", [1663.0]) |
| 4 | 42549 | array("d", [1663.0]) |
| 4 | 49532 | array("d", [1663.0]) |
| 3 | 26124 | array("d", [1663.0]) |
| 4 | 46336 | array("d", [1663.0]) |
| 4 | 52133 | array("d", [1663.0]) |
| 5 | 62361 | array("d", [1663.0]) |
| ... | ... | ... |
+--------+---------+----------------------+
[9952 rows x 3 columns]

To put all items and rating of a given user together into a dictionary value:

auto user_rating_stats = sf.groupby({"user_id"},
{{"movie_rating",aggregate::CONCAT("movie_id", "rating")}});
std::cout << user_rating_stats;
+---------+--------------+
| user_id | movie_rating |
+---------+--------------+
| 62361 | {1663: 5} |
| 30727 | {1663: 4} |
| 40111 | {1663: 2} |
| 50513 | {1663: 4} |
| 35140 | {1663: 4} |
| 42352 | {1663: 5} |
| 29667 | {1663: 4} |
| 46242 | {1663: 5} |
| 58310 | {1663: 2} |
| 64614 | {1663: 2} |
| ... | ... |
+---------+--------------+
[9852 rows x 2 columns]
See also
aggregate

◆ has_size()

bool turi::gl_sframe::has_size ( ) const
inherited

Returns true if the size of the SFrame is known. If it is not known, calling size() may trigger materialization.

◆ head()

gl_sframe turi::gl_sframe::head ( size_t  n) const
inherited

Returns a gl_sframe which contains the first n rows of this gl_sframe.

Parameters
n: The number of rows to fetch.

◆ is_materialized()

bool turi::gl_sframe::is_materialized ( ) const
inherited

Returns whether or not the sframe has been materialized.

See also
materialize

◆ join() [1/2]

gl_sframe turi::gl_sframe::join ( const gl_sframe &  right,
const std::vector< std::string > &  joinkeys,
const std::string &  how = "inner" 
) const
inherited

Joins two gl_sframe objects. Merges the current (left) gl_sframe with the given (right) gl_sframe using a SQL-style equi-join operation by columns.

Parameters
right: The gl_sframe to join.
joinkeys: The column name(s) representing the set of join keys. Each row that has the same value in this set of columns will be merged together.
how: Optional. The type of join to perform. "inner" is default.
  • "inner" : Equivalent to a SQL inner join. Result consists of the rows from the two frames whose join key values match exactly, merged together into one gl_sframe.
  • "left" : Equivalent to a SQL left outer join. Result is the union between the result of an inner join and the rest of the rows from the left gl_sframe, merged with missing values.
  • "right" : Equivalent to a SQL right outer join. Result is the union between the result of an inner join and the rest of the rows from the right gl_sframe, merged with missing values.
  • "outer" : Equivalent to a SQL full outer join. Result is the union between the result of a left outer join and a right outer join.

Example:

auto animals = gl_sframe({{"id", {1, 2, 3, 4}},
{"name", {"dog", "cat", "sheep", "cow"}}});
auto sounds = gl_sframe({{"id", {1, 3, 4, 5}},
{"sound", {"woof", "baa", "moo", "oink"}}});
std::cout << animals.join(sounds, {"id"});
std::cout << animals.join(sounds, {"id"}, "left");
std::cout << animals.join(sounds, {"id"}, "right");
std::cout << animals.join(sounds, {"id"}, "outer");

Produces output:

+----+-------+-------+
| id | name | sound |
+----+-------+-------+
| 1 | dog | woof |
| 3 | sheep | baa |
| 4 | cow | moo |
+----+-------+-------+
[3 rows x 3 columns]
+----+-------+-------+
| id | name | sound |
+----+-------+-------+
| 1 | dog | woof |
| 3 | sheep | baa |
| 4 | cow | moo |
| 2 | cat | None |
+----+-------+-------+
[4 rows x 3 columns]
+----+-------+-------+
| id | name | sound |
+----+-------+-------+
| 1 | dog | woof |
| 3 | sheep | baa |
| 4 | cow | moo |
| 5 | None | oink |
+----+-------+-------+
[4 rows x 3 columns]
+----+-------+-------+
| id | name | sound |
+----+-------+-------+
| 1 | dog | woof |
| 3 | sheep | baa |
| 4 | cow | moo |
| 5 | None | oink |
| 2 | cat | None |
+----+-------+-------+
[5 rows x 3 columns]

◆ join() [2/2]

gl_sframe turi::gl_sframe::join ( const gl_sframe &  right,
const std::map< std::string, std::string > &  joinkeys,
const std::string &  how = "inner" 
) const
inherited

Joins two gl_sframe objects. Merges the current (left) gl_sframe with the given (right) gl_sframe using a SQL-style equi-join operation by columns.

Parameters
right: The gl_sframe to join.
joinkeys: A map of join keys from left to right. Each key is taken as a column name on the left gl_sframe and each value is taken as the column name in the right gl_sframe.
how: Optional. The type of join to perform. "inner" is default.
  • "inner" : Equivalent to a SQL inner join. Result consists of the rows from the two frames whose join key values match exactly, merged together into one gl_sframe.
  • "left" : Equivalent to a SQL left outer join. Result is the union between the result of an inner join and the rest of the rows from the left gl_sframe, merged with missing values.
  • "right" : Equivalent to a SQL right outer join. Result is the union between the result of an inner join and the rest of the rows from the right gl_sframe, merged with missing values.
  • "outer" : Equivalent to a SQL full outer join. Result is the union between the result of a left outer join and a right outer join.

Example:

auto animals = gl_sframe({{"id", {1, 2, 3, 4}},
{"name", {"dog", "cat", "sheep", "cow"}}});
auto sounds = gl_sframe({{"id", {1, 3, 4, 5}},
{"sound", {"woof", "baa", "moo", "oink"}}});
std::cout << animals.join(sounds, {{"id", "id"}});

Produces output:

+----+-------+-------+
| id | name | sound |
+----+-------+-------+
| 1 | dog | woof |
| 3 | sheep | baa |
| 4 | cow | moo |
+----+-------+-------+
[3 rows x 3 columns]

◆ materialize()

void turi::gl_sframe::materialize ( )
inherited

For a SFrame that is lazily evaluated, force persist this sframe to disk, committing all lazy evaluated operations.

See also
is_materialized

◆ materialize_to_callback()

void turi::gl_sframe::materialize_to_callback ( std::function< bool(size_t, const std::shared_ptr< sframe_rows > &)>  callback,
size_t  nthreads = (size_t)(-1) 
)
inherited

Calls a callback function passing each row of the SFrame.

This does not materialize the array if not necessary.

The callback may be called in parallel, in which case the first argument provides a thread number. The function should return false, but may return true at any time to quit the iteration process. It may also throw exceptions, which will be forwarded to the caller of this function.

Each call to the callback passes a thread number (size_t) and a shared pointer to a block of rows (sframe_rows).

The sframe_rows object looks like a vector<vector<flexible_type>>. i.e. to look at all the rows, you need to write:

sf.materialize_to_callback([&](size_t, const std::shared_ptr<sframe_rows>& rows) {
for(const auto& row: *rows) {
// each row looks like an std::vector<flexible_type>
// and can be cast to a vector<flexible_type> if necessary
}
});
Parameters
callback: The callback to call
nthreads: Number of threads. If not specified, #cpus is used

◆ num_columns()

size_t turi::gl_gframe::num_columns ( ) const
override virtual

Returns number of columns. If type is VERTEX_GFRAME, the value is also the number of vertex fields (or edge fields if type is EDGE_GFRAME) in the gl_sgraph.

Reimplemented from turi::gl_sframe.

◆ operator[]() [1/5]

std::vector<flexible_type> turi::gl_sframe::operator[] ( int64_t  i)
inherited

Returns the value at a particular array index; generally inefficient.

This returns the value of the array at a particular index. Will raise an exception if the index is out of bounds. This operation is generally inefficient: the range_iterator() is preferred.

◆ operator[]() [2/5]

std::vector<flexible_type> turi::gl_sframe::operator[] ( int64_t  i) const
inherited

Returns the value at a particular array index; generally inefficient.

This returns the value of the array at a particular index. Will raise an exception if the index is out of bounds. This operation is generally inefficient: the range_iterator() is preferred.

◆ operator[]() [3/5]

gl_sframe turi::gl_sframe::operator[] ( const std::initializer_list< int64_t > &  slice)
inherited

Performs a slice Python style.

Parameters
slice: A list of 2 or 3 values. If 2 values, this is interpreted as {start, end} indices, with an implicit value of step = 1. If 3 values, this is interpreted as {start, step, end}. Values at the positions [start, start+step, start+2*step, ...] are returned until end (exclusive) is reached. Negative start and end values are interpreted as offsets from the end of the array.

Given a gl_sframe

gl_sarray a{1,2,3,4,5,6,7,8,9,10};
gl_sframe sf{{"a", a}};

Slicing a consecutive range:

auto ret = a[{1,4}]; // start at index 1, end at index 4
// ret is a gl_sframe with one column a: [2,3,4]

Slicing a range with a step:

auto ret = a[{1,2,8}]; // start at index 1, end at index 8 with step size 2
// ret is a gl_sframe with one column a: [2,4,6,8]

Using negative indexing:

auto ret = a[{-3,-1}]; // start at end - 3, end at index end - 1
// ret a gl_sframe with one column a: [8,9]

◆ operator[]() [4/5]

gl_sframe turi::gl_sframe::operator[] ( const std::initializer_list< int64_t > &  slice) const
inherited

Performs a slice Python style.

Parameters
sliceA list of 2 or 3 values. If 2 values, this is interpreted as {start, end} indices, with an implicit value of step = 1. If 3 values, this is interpreted as {start, step, end}. Values at the positions [start, start+step, start+2*step, ...] are returned until end (exclusive) is reached. Negative start and end values are interpreted as offsets from the end of the array.

Given a gl_sframe

gl_sarray a{1,2,3,4,5,6,7,8,9,10};
gl_sframe sf{{"a", a}};

Slicing a consecutive range:

auto ret = sf[{1,4}]; // start at index 1, end at index 4
// ret is a gl_sframe with one column a: [2,3,4]

Slicing a range with a step:

auto ret = sf[{1,2,8}]; // start at index 1, end at index 8 with step size 2
// ret is a gl_sframe with one column a: [2,4,6,8]

Using negative indexing:

auto ret = sf[{-3,-1}]; // start at end - 3, end at index end - 1
// ret is a gl_sframe with one column a: [8,9]

◆ operator[]() [5/5]

gl_sframe turi::gl_sframe::operator[] ( const gl_sarray logical_filter) const
inherited

Performs a logical filter.

This function performs a logical filter: i.e. it subselects all the elements in this array where the corresponding value in the other array evaluates to true.

gl_sframe sf{{"a", {1,2,3,4,5}},
{"b", {"1","2","3","4","5"}},
{"c", {1.0,2.0,3.0,4.0,5.0}}};
auto ret = sf[sf["a"] > 1 && sf["a"] <= 4];
// ret is now the sframe with 3 columns:
// a: [2,3,4]
// b: ["2","3","4"]
// c: [2.0,3.0,4.0]

◆ pack_columns() [1/2]

gl_sframe turi::gl_sframe::pack_columns ( const std::vector< std::string > &  columns,
const std::string &  new_column_name,
flex_type_enum  dtype = flex_type_enum::LIST,
flexible_type  fill_na = FLEX_UNDEFINED 
) const
inherited

This is an overloaded member function, provided for convenience. It differs from the above function only in what argument(s) it accepts. Pack two or more columns of the current gl_sframe into one single column. The result is a new gl_sframe with the unaffected columns from the original gl_sframe plus the newly created column.

The type of the resulting column is decided by the "dtype" parameter. Allowed values for "dtype" are flex_type_enum::DICT , flex_type_enum::VECTOR or flex_type_enum::LIST

Parameters
columnsA list of column names to be packed. There must be at least two columns to pack.
new_column_namePacked column name.
dtypeOptional. The resulting packed column type. If not provided, dtype is list.
fill_naOptional. Value to fill into packed column if missing value is encountered. If packing to dictionary, "fill_na" is only applicable to dictionary values; missing keys are not replaced.

Example: Suppose 'sf' is an SFrame that maintains business category information.

auto sf = gl_sframe({{"business", {1,2,3,4}},
{"category.retail", {1, FLEX_UNDEFINED, 1, FLEX_UNDEFINED}},
{"category.food", {1, 1, FLEX_UNDEFINED, FLEX_UNDEFINED}},
{"category.service", {FLEX_UNDEFINED, 1, 1, FLEX_UNDEFINED}},
{"category.shop", {1, 1, FLEX_UNDEFINED, 1}}});
std::cout << sf;
+----------+-----------------+---------------+------------------+---------------+
| business | category.retail | category.food | category.service | category.shop |
+----------+-----------------+---------------+------------------+---------------+
| 1 | 1 | 1 | None | 1 |
| 2 | None | 1 | 1 | 1 |
| 3 | 1 | None | 1 | None |
| 4 | None | 1 | None | 1 |
+----------+-----------------+---------------+------------------+---------------+
[4 rows x 5 columns]

To pack all category columns into a list:

std::cout << sf.pack_columns({"category.retail", "category.food",
"category.service", "category.shop"},
"category");
+----------+--------------------+
| business | category |
+----------+--------------------+
| 1 | [1, 1, None, 1] |
| 2 | [None, 1, 1, 1] |
| 3 | [1, None, 1, None] |
| 4 | [None, 1, None, 1] |
+----------+--------------------+
[4 rows x 2 columns]

To pack all category columns into a dictionary:

std::cout << sf.pack_columns({"category.retail", "category.food",
"category.service", "category.shop"},
"category", flex_type_enum::DICT);
+----------+--------------------------------+
| business | X2 |
+----------+--------------------------------+
| 1 | {'category.retail': 1, 'ca ... |
| 2 | {'category.food': 1, 'cate ... |
| 3 | {'category.retail': 1, 'ca ... |
| 4 | {'category.food': 1, 'cate ... |
+----------+--------------------------------+
[4 rows x 2 columns]
See also
gl_sframe::unpack

◆ pack_columns() [2/2]

gl_sframe turi::gl_sframe::pack_columns ( const std::string &  column_prefix,
const std::string &  new_column_name,
flex_type_enum  dtype = flex_type_enum::LIST,
flexible_type  fill_na = FLEX_UNDEFINED 
) const
inherited

Pack two or more columns of the current gl_sframe with a common column name prefix into one single column. The result is a new gl_sframe with the unaffected columns from the original gl_sframe plus the newly created column.

The type of the resulting column is decided by the "dtype" parameter. Allowed values for "dtype" are flex_type_enum::DICT , flex_type_enum::VECTOR or flex_type_enum::LIST

Parameters
column_prefixPacks all columns with the given prefix.
new_column_namePacked column name.
dtypeOptional. The resulting packed column type. If not provided, dtype is list.
fill_naOptional. Value to fill into packed column if missing value is encountered. If packing to dictionary, "fill_na" is only applicable to dictionary values; missing keys are not replaced.

Example: Suppose 'sf' is an SFrame that maintains business category information.

auto sf = gl_sframe({{"business", {1,2,3,4}},
{"category.retail", {1, FLEX_UNDEFINED, 1, FLEX_UNDEFINED}},
{"category.food", {1, 1, FLEX_UNDEFINED, FLEX_UNDEFINED}},
{"category.service", {FLEX_UNDEFINED, 1, 1, FLEX_UNDEFINED}},
{"category.shop", {1, 1, FLEX_UNDEFINED, 1}}});
std::cout << sf;
+----------+-----------------+---------------+------------------+---------------+
| business | category.retail | category.food | category.service | category.shop |
+----------+-----------------+---------------+------------------+---------------+
| 1 | 1 | 1 | None | 1 |
| 2 | None | 1 | 1 | 1 |
| 3 | 1 | None | 1 | None |
| 4 | None | 1 | None | 1 |
+----------+-----------------+---------------+------------------+---------------+
[4 rows x 5 columns]

To pack all category columns into a list:

std::cout << sf.pack_columns("category", "category");
+----------------+----------------+
| business | category |
+----------------+----------------+
| 1 | [1,1,,1] |
| 2 | [,1,1,1] |
| 3 | [1,,1,] |
| 4 | [,,,1] |
+----------------+----------------+
[4 rows x 2 columns]

To pack all category columns into a dictionary:

std::cout << sf.pack_columns("category",
"category", flex_type_enum::DICT);
+----------+--------------------------------+
| business | X2 |
+----------+--------------------------------+
| 1 | {'category.retail': 1, 'ca ... |
| 2 | {'category.food': 1, 'cate ... |
| 3 | {'category.retail': 1, 'ca ... |
| 4 | {'category.food': 1, 'cate ... |
+----------+--------------------------------+
[4 rows x 2 columns]
See also
gl_sframe::unpack

◆ plot()

std::shared_ptr<model_base> turi::gl_sframe::plot ( ) const
inherited

Return a plot object of the SFrame (same visualization as show)

◆ random_split() [1/2]

std::pair<gl_sframe, gl_sframe> turi::gl_sframe::random_split ( double  fraction) const
inherited

Randomly split the rows of an gl_sframe into two gl_sframe objects. The first gl_sframe contains M rows, sampled uniformly (without replacement) from the original gl_sframe. M is approximately the fraction times the original number of rows. The second gl_sframe contains the remaining rows of the original gl_sframe.

Parameters
fractionApproximate fraction of the rows to fetch for the first returned gl_sframe. Must be between 0 and 1.
seedOptional. Seed for the random number generator used to split.

Example:

auto sf = gl_sframe({{"id", gl_sarray::from_sequence(0, 1024)}});
gl_sframe sf_train, sf_test;
std::tie(sf_train, sf_test) = sf.random_split(.95);
std::cout << sf_test.size() << " " << sf_train.size() << "\n";

Produces output:

102 922

◆ random_split() [2/2]

std::pair<gl_sframe, gl_sframe> turi::gl_sframe::random_split ( double  fraction,
size_t  seed,
bool  exact = false 
) const
inherited

Randomly split the rows of an gl_sframe into two gl_sframe objects. The first gl_sframe contains M rows, sampled uniformly (without replacement) from the original gl_sframe. M is approximately the fraction times the original number of rows. The second gl_sframe contains the remaining rows of the original gl_sframe.

Parameters
fractionApproximate fraction of the rows to fetch for the first returned gl_sframe. Must be between 0 and 1.
seedThe random seed for the random number generator. Deterministic output is obtained if this is set to a constant.

Example:

auto sf = gl_sframe({{"id", gl_sarray::from_sequence(0, 1024)}});
gl_sframe sf_train, sf_test;
std::tie(sf_train, sf_test) = sf.random_split(.95, 12345);
std::cout << sf_test.size() << " " << sf_train.size() << "\n";

Produces output:

44 980

◆ range_iterator()

gl_sframe_range turi::gl_sframe::range_iterator ( size_t  start = 0,
size_t  end = (size_t)(-1) 
) const
inherited

Returns a one pass range object with begin() and end() iterators.

This will materialize the array.

See materialize_to_callback for a lazy version.

Parameters
startThe starting index of the range
endThe ending index of the range
// create an SFrame
gl_sframe sf{{"a", {1,2,3,4,5}},
{"b", {"1","2","3","4","5"}},
{"c", {1.0,2.0,3.0,4.0,5.0}}};
// get a range over the entire frame
auto ra = sf.range_iterator();
auto iter = ra.begin();
while (iter != ra.end()) {
std::vector<flexible_type> val = *iter;
// do something to val
++iter;
}

Or more compactly with C++11 syntax:

for(const auto& val: sf.range_iterator()) {
std::cout << val[0] << " " << val[1] << " " << val[2] << "\n";
}

The range returned only supports one pass. The outcome of a second call to begin() is undefined after any iterator is advanced.

When iterating over a gl_sframe with many columns, if only a small number of columns are needed, there is a performance benefit to subselecting just those columns first before iterating.

i.e. if I only need columns "a" and "b" from the SFrame above:

for(const auto& val: sf[{"a","b"}].range_iterator()) {
std::cout << val[0] << " " << val[1] << "\n";
}
See also
gl_sframe_range

◆ remove_column()

void turi::gl_gframe::remove_column ( const std::string &  name)
overridevirtual

Remove a column with the given name. If type is VERTEX_GFRAME, the column is removed from vertex data (or edge data if type is EDGE_GFRAME) from the gl_sgraph.

Parameters
namethe column name to be removed
See also
gl_sgraph::remove_vertex_field
gl_sgraph::remove_edge_field

Reimplemented from turi::gl_sframe.

◆ rename()

void turi::gl_gframe::rename ( const std::map< std::string, std::string > &  old_to_new_names)
overridevirtual

Rename columns.

Parameters
old_to_new_namesmap from old column name to new column name.
See also
gl_sgraph::rename_vertex_fields
gl_sgraph::rename_edge_fields

Reimplemented from turi::gl_sframe.

◆ replace_add_column()

virtual void turi::gl_sframe::replace_add_column ( const gl_sarray data,
const std::string &  name = "" 
)
virtualinherited

Add a column to this gl_sframe, replacing a column with the same name if one already exists. The number of elements in the data given must match the length of every other column of the gl_sframe. This operation modifies the current gl_sframe in place. If no name is given, a default name is chosen.

Parameters
dataThe column of data to add.
nameOptional. The name of the column. If no name is given, a default name is chosen.

This is equivalent to using operator[] for column assignment.

sf[name] = data;

Example:

auto sf = gl_sframe({{"id", {1, 2, 3}},
{"val", {"A", "B", "C"}}});
auto sa = gl_sarray({"cat", "dog", "fossa"});
sf.replace_add_column(sa, "species");
std::cout << sf;

Produces output:

+----+-----+---------+
| id | val | species |
+----+-----+---------+
| 1 | A | cat |
| 2 | B | dog |
| 3 | C | fossa |
+----+-----+---------+
[3 rows x 3 columns]
See also
add_column(const gl_sarray&, const std::string&),

◆ sample() [1/2]

gl_sframe turi::gl_sframe::sample ( double  fraction) const
inherited

Create an gl_sframe which contains a subsample of the current gl_sframe.

Parameters
fractionThe fraction of the rows to fetch. Must be between 0 and 1.

Example:

gl_sframe sf{{"a", {1,2,3,4,5}},
{"b", {1.0,2.0,3.0,4.0,5.0}}};
std::cout << sf.sample(.3);

Produces output:

Columns:
a integer
b float
Rows: ?
Data:
+----------------+----------------+
| a | b |
+----------------+----------------+
| 4 | 4 |
| 5 | 5 |
+----------------+----------------+
[? rows x 2 columns]

◆ sample() [2/2]

gl_sframe turi::gl_sframe::sample ( double  fraction,
size_t  seed,
bool  exact = false 
) const
inherited

Create an gl_sframe which contains a subsample of the current gl_sframe.

Parameters
fractionThe fraction of the rows to fetch. Must be between 0 and 1.
seedThe random seed for the random number generator. Deterministic output is obtained if this is set to a constant.

Example:

gl_sframe sf{{"a", {1,2,3,4,5}},
{"b", {1.0,2.0,3.0,4.0,5.0}}};
std::cout << sf.sample(.3, 12345);

Produces output:

Columns:
a integer
b float
Rows: ?
Data:
+----------------+----------------+
| a | b |
+----------------+----------------+
| 4 | 4 |
| 5 | 5 |
+----------------+----------------+
[? rows x 2 columns]

◆ save()

void turi::gl_sframe::save ( const std::string &  path,
const std::string &  format = "" 
) const
inherited

Saves the SFrame to file.

When format is "binary", the saved SFrame will be in a directory named with the path parameter. When format is "text" or "csv", it is saved as a single human readable text file.

Parameters
pathA local path or a remote URL. If format is 'text', it will be saved as a text file. If format is 'binary', a directory will be created at the location which will contain the SFrame.
formatOptional. Either "binary", "csv" or "". Defaults to "". Format in which to save the SFrame. Binary saved SArrays can be loaded much faster and without any format conversion losses. If "csv", Each row will be written as a single line in an output text file. If format is an empty string (default), we will try to infer the format from filename given. If file name ends with "csv", or ".csv.gz", then the gl_sframe is saved as "csv" format, otherwise the gl_sframe is saved as 'binary' format.

◆ save_reference()

void turi::gl_sframe::save_reference ( const std::string &  path) const
inherited

Performs an incomplete save of an existing SFrame into a directory. This saved SFrame may reference SFrames in other locations in the same filesystem for certain columns/segments/etc.

Does not modify the current sframe.

◆ select_column()

gl_sarray turi::gl_sframe::select_column ( const std::string &  colname) const
inherited

Extracts one column of the gl_sframe.

This is equivalent to using operator[] for column indexing.

Equivalent to:

sf[colname];
See also
select_columns

◆ select_columns()

gl_sframe turi::gl_sframe::select_columns ( const std::vector< std::string > &  colnames) const
inherited

Extracts a collection of columns of the gl_sframe.

This is equivalent to using operator[] for selecting multiple columns

sf[colnames];
See also
select_column

◆ show()

void turi::gl_sframe::show ( const std::string &  path_to_client) const
inherited

Show a visualization of the SFrame.

◆ size()

size_t turi::gl_gframe::size ( ) const
overridevirtual

Returns number of rows. If type is VERTEX_GFRAME, the value is also the number of vertices (or edges if type is EDGE_GFRAME) in the gl_sgraph.

See also
gl_sgraph::num_vertices
gl_sgraph::num_edges

Reimplemented from turi::gl_sframe.

◆ sort() [1/4]

gl_sframe turi::gl_sframe::sort ( const std::string &  column,
bool  ascending = true 
) const
inherited

Sort current gl_sframe by a single column, using the given sort order.

Only columns of type str, int, or float can be sorted.

Parameters
columnThe name of the column to be sorted.
ascendingOptional. Sort all columns in the given order.

Example:

gl_sframe sf{ {"a", {1,3,2,1}},
{"b", {"a","c","b","b"}},
{"c", {"x","y","z","y"}} };
std::cout << sf.sort("a") << std::endl;

Produces output:

+---+---+---+
| a | b | c |
+---+---+---+
| 1 | a | x |
| 1 | b | y |
| 2 | b | z |
| 3 | c | y |
+---+---+---+
[4 rows x 3 columns]

Example:

// To sort by column "a", descending
std::cout << sf.sort("a", false) << std::endl;

Produces output:

+---+---+---+
| a | b | c |
+---+---+---+
| 3 | c | y |
| 2 | b | z |
| 1 | a | x |
| 1 | b | y |
+---+---+---+
[4 rows x 3 columns]
See also
topk

◆ sort() [2/4]

gl_sframe turi::gl_sframe::sort ( const std::vector< std::string > &  columns,
bool  ascending = true 
) const
inherited

This is an overloaded member function, provided for convenience. It differs from the above function only in what argument(s) it accepts.

Sort current gl_sframe by multiple columns, using the given sort order.

Parameters
columnsThe names of the columns to be sorted.
ascendingOptional. Sort all columns in the given order.

The result will be sorted first by first column, followed by second column, and so on. All columns will be sorted in the same order as governed by the "ascending" parameter.

Example:

// To sort by column "a" and "b", all ascending
std::cout << sf.sort({"a", "b"}) << std::endl;

Produces output:

+---+---+---+
| a | b | c |
+---+---+---+
| 1 | a | x |
| 1 | b | y |
| 2 | b | z |
| 3 | c | y |
+---+---+---+
[4 rows x 3 columns]
See also
topk

◆ sort() [3/4]

gl_sframe turi::gl_sframe::sort ( const std::initializer_list< std::string > &  columns,
bool  ascending = true 
) const
inherited

This is an overloaded member function, provided for convenience. It differs from the above function only in what argument(s) it accepts.

◆ sort() [4/4]

gl_sframe turi::gl_sframe::sort ( const std::vector< std::pair< std::string, bool >> &  column_and_ascending) const
inherited

This is an overloaded member function, provided for convenience. It differs from the above function only in what argument(s) it accepts. Sort current gl_sframe by multiple columns, using different sort order for each column.

Parameters
column_and_ascendingA map from column name to sort order (ascending is true)

To control the sort ordering for each column individually (for example, to sort by column "a" ascending, and then by column "c" descending), "column_and_ascending" must be a list of (str, bool) pairs. In this case, the first value of each pair is the column name and the second value is a boolean indicating whether the sort order is ascending.

Example:

// To sort by column "a" ascending, and then by column "c" descending
std::cout << sf.sort({{"a", true}, {"c", false}}) << std::endl;

Produces output:

+---+---+---+
| a | b | c |
+---+---+---+
| 1 | b | y |
| 1 | a | x |
| 2 | b | z |
| 3 | c | y |
+---+---+---+
[4 rows x 3 columns]

◆ split_datetime()

gl_sframe turi::gl_sframe::split_datetime ( const std::string &  expand_column,
const std::string &  column_name_prefix = "X",
const std::vector< std::string > &  limit = std::vector< std::string >(),
bool  tzone = false 
) const
inherited

Splits a datetime column of gl_sframe to multiple columns, with each value in a separate column. Returns a new gl_sframe with the column replaced with a list of new columns. The expanded column must be of datetime type. For more details regarding name generation and other, refer to gl_sarray::split_datetime

This function is a convenience function which is equivalent to calling gl_sarray::split_datetime on the column, deleting the column and adding the expanded columns back to the sframe.

Parameters
expand_columnName of the column to expand.
column_name_prefixOptional. If provided, expanded column names would start with the given prefix. If not provided, the default value is the name of the expanded column.
limitOptional. Limits the set of datetime elements to expand. Elements are 'year','month','day','hour','minute', and 'second'.
tzoneOptional. A boolean parameter that determines whether to show the timezone column or not. Defaults to false.

Example:

auto sa = gl_sarray({"20-Oct-2011", "10-Jan-2012"});
sf["date"] = sa.str_to_datetime("%d-%b-%Y");
auto split_sf = sf.split_datetime("date", "", {"day","year"});
std::cout << split_sf;

Produces output:

Columns:
day integer
year integer
+----------------+----------------+
| day | year |
+----------------+----------------+
| 20 | 2011 |
| 10 | 2012 |
+----------------+----------------+
[2 rows x 2 columns]

◆ stack() [1/2]

gl_sframe turi::gl_sframe::stack ( const std::string &  column_name,
const std::string &  new_column_names,
bool  drop_na = false 
) const
inherited

Convert a "wide" column of an gl_sframe to one or two "tall" columns by stacking all values.

The stack works only for columns of list, or array type (for the dict type, see the overload). One new column is created as a result of stacking, where each row holds one element of the array or list value, and the remaining columns from the same original row are repeated.

The new gl_sframe includes the newly created column and all columns other than the one that is stacked.

Parameters
column_namesThe column(s) to stack. This column must be of dict/list/array type
new_column_nameOptional. The new column name. If not given, column names are generated automatically.
drop_naOptional. Defaults to false. If true, missing values and empty list/array/dict are all dropped from the resulting column(s). If false, missing values are maintained in stacked column(s).

Suppose 'sf' is an SFrame that contains a user and his/her friends, where 'friends' columns is an array type. Stack on 'friends' column would create a user/friend list for each user/friend pair:

auto sf = gl_sframe({{"user",{1,2,3}},
{"friends",{{2,3,4}, {5,6}, {4,5,10,FLEX_UNDEFINED}}}
});
std::cout << sf;
std::cout << sf.stack("friends", "friend");

Produces output:

+------+------------------+
| user | friends |
+------+------------------+
| 1 | [2, 3, 4] |
| 2 | [5, 6] |
| 3 | [4, 5, 10, None] |
+------+------------------+
[3 rows x 2 columns]
+------+--------+
| user | friend |
+------+--------+
| 1 | 2 |
| 1 | 3 |
| 1 | 4 |
| 2 | 5 |
| 2 | 6 |
| 3 | 4 |
| 3 | 5 |
| 3 | 10 |
| 3 | None |
+------+--------+
[9 rows x 2 columns]
See also
gl_sframe::unstack(const std::vector<std::string>&, const std::string&) const
stack(const std::string&, const std::vector<std::string>&, bool)const

◆ stack() [2/2]

gl_sframe turi::gl_sframe::stack ( const std::string &  column_name,
const std::vector< std::string > &  new_column_names,
bool  drop_na = false 
) const
inherited

Convert a "wide" column of an gl_sframe to one or two "tall" columns by stacking all values.

The stack works only for columns of dictionary type (for the list or array types, see the overload). Two new columns are created as a result of stacking: one column holds the key and another column holds the value. The rest of the columns are repeated for each key/value pair.

The new gl_sframe includes the newly created columns and all columns other than the one that is stacked.

Parameters
column_namesThe column(s) to stack. This column must be of dict/list/array type
new_column_namesOptional. The new column names. Must be an vector of 2 values corresponding to the "key" column and the "value" column. If not given, column names are generated automatically.
drop_naOptional. Defaults to false. If true, missing values and empty list/array/dict are all dropped from the resulting column(s). If false, missing values are maintained in stacked column(s).

Suppose 'sf' is an SFrame that contains a column of dict type. Stack would stack all keys in one column and all values in another column:

auto sf = gl_sframe({{"topic",{1,2,3,4}},
{"words", {flex_dict{{"a",3},{"cat",2}},
flex_dict{{"a",1},{"the",2}},
flex_dict{{"the",1},{"dog",3}},
flex_dict{}}
}});
std::cout << sf;
std::cout << sf.stack("words", std::vector<std::string>{"word", "count"});

Produces output:

+-------+----------------------+
| topic | words |
+-------+----------------------+
| 1 | {'a': 3, 'cat': 2} |
| 2 | {'a': 1, 'the': 2} |
| 3 | {'the': 1, 'dog': 3} |
| 4 | {} |
+-------+----------------------+
[4 rows x 2 columns]
+-------+------+-------+
| topic | word | count |
+-------+------+-------+
| 1 | a | 3 |
| 1 | cat | 2 |
| 2 | a | 1 |
| 2 | the | 2 |
| 3 | the | 1 |
| 3 | dog | 3 |
| 4 | None | None |
+-------+------+-------+
[7 rows x 3 columns]
Observe that since topic 4 had no words, an empty row is inserted.
To drop that row, set drop_na to true in the parameters to stack.
See also
unstack(const std::string&, const std::string&) const
stack(const std::string&, const std::string&, bool)const

◆ swap_columns()

void turi::gl_gframe::swap_columns ( const std::string &  column_1,
const std::string &  column_2 
)
overridevirtual

Swap the order of two columns

Reimplemented from turi::gl_sframe.

◆ tail()

gl_sframe turi::gl_sframe::tail ( size_t  n) const
inherited

Returns a gl_sframe which contains the last n rows of this gl_sframe.

Parameters
nThe number of rows to fetch.

◆ topk()

gl_sframe turi::gl_sframe::topk ( const std::string &  column_name,
size_t  k = 10,
bool  reverse = false 
) const
inherited

Get top k rows according to the given column. Result is according to and sorted by "column_name" in the given order (default is descending). When "k" is small, "topk" is more efficient than "sort".

Parameters
column_nameThe column to sort on
kOptional. Defaults to 10. The number of rows to return.
reverseOptional. Defaults to False. If true, return the top k rows in ascending order, otherwise, in descending order.

Example:

auto sf = gl_sframe({{"id", gl_sarray::from_sequence(1000)}});
sf["value"] = 0 - sf["id"];
std::cout << sf.topk("id", 3);

Produces output:

+--------+--------+
| id | value |
+--------+--------+
| 999 | -999 |
| 998 | -998 |
| 997 | -997 |
+--------+--------+
[3 rows x 2 columns]

Example:

std::cout << sf.topk("value", 3);

Produces output:

+--------+--------+
| id | value |
+--------+--------+
| 1 | -1 |
| 2 | -2 |
| 3 | -3 |
+--------+--------+
[3 rows x 2 columns]
See also
sort

◆ unique()

gl_sframe turi::gl_sframe::unique ( ) const
inherited

Remove duplicate rows of the gl_sframe. Will not necessarily preserve the order of the given gl_sframe in the new gl_sframe.

Example:

gl_sframe sf{ {"id", {1,2,3,3,4}},
{"value", {1,2,3,3,4}} };
std::cout << sf.unique() << std::endl;

Produces output:

+----+-------+
| id | value |
+----+-------+
| 2 | 2 |
| 4 | 4 |
| 3 | 3 |
| 1 | 1 |
+----+-------+
[4 rows x 2 columns]
See also
gl_sarray::unique

◆ unpack()

gl_sframe turi::gl_sframe::unpack ( const std::string &  unpack_column,
const std::string &  column_name_prefix = "X",
const std::vector< flex_type_enum > &  column_types = std::vector< flex_type_enum >(),
const flexible_type na_value = FLEX_UNDEFINED,
const std::vector< flexible_type > &  limit = std::vector< flexible_type >() 
) const
inherited

Expand one column of this gl_sframe to multiple columns with each value in a separate column. Returns a new gl_sframe with the unpacked column replaced with a list of new columns. The column must be of list/array/dict type. For more details regarding name generation, missing value handling and other, refer to gl_sarray::unpack

Parameters
unpack_columnName of the unpacked column
column_name_prefixOptional. If provided, unpacked column names would start with the given prefix. Defaults to "X". If the empty string is used, no prefix is used.
column_typesOptional. Column types for the unpacked columns. If not provided, column types are automatically inferred from first 100 rows. Defaults to FLEX_UNDEFINED.
na_valueOptional. Convert all values that are equal to "na_value" to missing value if specified.
limitoptional limits in the set of list/vector/dict keys to unpack. For list/vector gl_sarrays, "limit" must contain integer indices. For dict gl_sarrays, "limit" must contain dictionary keys.

Example:

auto sf = gl_sframe({{"id", {1,2,3}},
{"wc", {flex_dict{{"a", 1}},
flex_dict{{"b", 2}},
flex_dict{{"a", 1},{"b", 2}}
}
}});
std::cout << sf;
+----+------------------+
| id | wc |
+----+------------------+
| 1 | {'a': 1} |
| 2 | {'b': 2} |
| 3 | {'a': 1, 'b': 2} |
+----+------------------+
[3 rows x 2 columns]

To unpack:

std::cout << sf.unpack("wc");
+----+------+------+
| id | wc.a | wc.b |
+----+------+------+
| 1 | 1 | None |
| 2 | None | 2 |
| 3 | 1 | 2 |
+----+------+------+
[3 rows x 3 columns]

To not have a prefix in the generated column names:

std::cout << sf.unpack("wc", "");
+----+------+------+
| id | a | b |
+----+------+------+
| 1 | 1 | None |
| 2 | None | 2 |
| 3 | 1 | 2 |
+----+------+------+
[3 rows x 3 columns]

To limit subset of keys to unpack:

std::cout << sf.unpack("wc", "", {}, FLEX_UNDEFINED, {"b"});
+----+------+
| id | b |
+----+------+
| 1 | None |
| 2 | 2 |
| 3 | 2 |
+----+------+
[3 rows x 2 columns]
See also
gl_sframe::pack_columns
gl_sarray::unpack

◆ unstack() [1/2]

gl_sframe turi::gl_sframe::unstack ( const std::string &  columns,
const std::string &  new_column_name = "" 
) const
inherited

Concatenate values from one column into one column, grouping by all other columns. The resulting column could be of type list or array. If "column" is a numeric column, the result will be of vector type. If "column" is a non-numeric column, the new column will be of list type.

Parameters
columnThe column that is to be concatenated. If str, then collapsed column type is either array or list.
new_column_nameOptional. New column name. If not given, a name is generated automatically.

Example:

auto sf = gl_sframe({{"friend", {2, 3, 4, 5, 6, 4, 5, 2, 3}},
{"user", {1, 1, 1, 2, 2, 2, 3, 4, 4}}});
std::cout << sf.unstack("friend", "friends");

Produces output:

+------+-----------------------------+
| user | friends |
+------+-----------------------------+
| 3 | [5.0] |
| 1 | [2.0, 4.0, 3.0] |
| 2 | [5.0, 6.0, 4.0] |
| 4 | [2.0, 3.0] |
+------+-----------------------------+
[4 rows x 2 columns]
See also
stack(const std::string&, const std::string&, bool)const
groupby

◆ unstack() [2/2]

gl_sframe turi::gl_sframe::unstack ( const std::vector< std::string > &  columns,
const std::string &  new_column_name = "" 
) const
inherited

Concatenate values from two columns into one column, grouping by all other columns. The new column will be of dict type where the keys are taken from the first column in the list, and the values taken from the second column in the list.

Parameters
columnThe columns that are to be concatenated.
new_column_nameOptional. New column name. If not given, a name is generated automatically.

Example:

auto sf = gl_sframe({{"count",{4, 2, 1, 1, 2, FLEX_UNDEFINED}},
{"topic",{"cat", "cat", "dog", "elephant", "elephant", "fish"}},
{"word", {"a", "c", "c", "a", "b", FLEX_UNDEFINED}}});
std::cout << sf.unstack({"word", "count"}, "words");

Produces output:

+----------+------------------+
| topic | words |
+----------+------------------+
| elephant | {'a': 1, 'b': 2} |
| dog | {'c': 1} |
| cat | {'a': 4, 'c': 2} |
| fish | None |
+----------+------------------+
[4 rows x 2 columns]
See also
stack
groupby

The documentation for this class was generated from the following file: