#include <core/data/sframe/gl_gframe.hpp>

Public Member Functions
size_t	size () const override

size_t	num_columns () const override

std::vector< std::string >	column_names () const override

std::vector< flex_type_enum >	column_types () const override

void	add_column (const flexible_type &data, const std::string &name) override

void	add_column (const gl_sarray &data, const std::string &name) override

void	add_columns (const gl_sframe &data) override

void	remove_column (const std::string &name) override

void	rename (const std::map< std::string, std::string > &old_to_new_names) override

void	swap_columns (const std::string &column_1, const std::string &column_2) override

void	construct_from_csvs (std::string csv_file, csv_parsing_config_map csv_config, str_flex_type_map column_type_hints)

void	show (const std::string &path_to_client) const

std::shared_ptr< model_base >	plot () const

gl_sframe	operator[] (const gl_sarray &logical_filter) const

void	materialize_to_callback (std::function< bool(size_t, const std::shared_ptr< sframe_rows > &)> callback, size_t nthreads=(size_t)(-1))

gl_sframe_range	range_iterator (size_t start=0, size_t end=(size_t)(-1)) const

bool	empty () const

bool	is_materialized () const

bool	has_size () const

void	materialize ()

void	save (const std::string &path, const std::string &format="") const

void	save_reference (const std::string &path) const

bool	contains_column (const std::string &col_name) const

gl_sframe	head (size_t n) const

gl_sframe	tail (size_t n) const

gl_sarray	apply (std::function< flexible_type(const sframe_rows::row &)> fn, flex_type_enum dtype) const

gl_sframe	sample (double fraction) const

gl_sframe	sample (double fraction, size_t seed, bool exact=false) const

std::pair< gl_sframe, gl_sframe >	random_split (double fraction) const

std::pair< gl_sframe, gl_sframe >	random_split (double fraction, size_t seed, bool exact=false) const

gl_sframe	topk (const std::string &column_name, size_t k=10, bool reverse=false) const

size_t	column_index (const std::string &column_name) const

const std::string &	column_name (size_t index) const

gl_sarray	select_column (const std::string &colname) const

gl_sframe	select_columns (const std::vector< std::string > &colnames) const

virtual void	replace_add_column (const gl_sarray &data, const std::string &name="")

gl_sframe	append (const gl_sframe &other) const

gl_sframe	groupby (const std::vector< std::string > &groupkeys, const std::map< std::string, aggregate::groupby_descriptor_type > &operators=std::map< std::string, aggregate::groupby_descriptor_type >()) const

gl_sframe	join (const gl_sframe &right, const std::vector< std::string > &joinkeys, const std::string &how="inner") const

gl_sframe	join (const gl_sframe &right, const std::map< std::string, std::string > &joinkeys, const std::string &how="inner") const

gl_sframe	filter_by (const gl_sarray &values, const std::string &column_name, bool exclude=false) const

gl_sframe	pack_columns (const std::vector< std::string > &columns, const std::string &new_column_name, flex_type_enum dtype=flex_type_enum::LIST, flexible_type fill_na=FLEX_UNDEFINED) const

gl_sframe	pack_columns (const std::string &column_prefix, const std::string &new_column_name, flex_type_enum dtype=flex_type_enum::LIST, flexible_type fill_na=FLEX_UNDEFINED) const

gl_sframe	split_datetime (const std::string &expand_column, const std::string &column_name_prefix="X", const std::vector< std::string > &limit=std::vector< std::string >(), bool tzone=false) const

gl_sframe	unpack (const std::string &unpack_column, const std::string &column_name_prefix="X", const std::vector< flex_type_enum > &column_types=std::vector< flex_type_enum >(), const flexible_type &na_value=FLEX_UNDEFINED, const std::vector< flexible_type > &limit=std::vector< flexible_type >()) const

gl_sframe	stack (const std::string &column_name, const std::string &new_column_names, bool drop_na=false) const

gl_sframe	stack (const std::string &column_name, const std::vector< std::string > &new_column_names, bool drop_na=false) const

gl_sframe	unstack (const std::string &columns, const std::string &new_column_name="") const

gl_sframe	unstack (const std::vector< std::string > &columns, const std::string &new_column_name="") const

gl_sframe	unique () const

gl_sframe	sort (const std::string &column, bool ascending=true) const

gl_sframe	sort (const std::vector< std::string > &columns, bool ascending=true) const

gl_sframe	sort (const std::initializer_list< std::string > &columns, bool ascending=true) const

gl_sframe	sort (const std::vector< std::pair< std::string, bool >> &column_and_ascending) const

gl_sframe	dropna (const std::vector< std::string > &columns=std::vector< std::string >(), std::string how="any", bool recursive=false) const

std::pair< gl_sframe, gl_sframe >	dropna_split (const std::vector< std::string > &columns=std::vector< std::string >(), std::string how="any", bool recursive=false) const

gl_sframe	fillna (const std::string &column, flexible_type value) const

gl_sframe	add_row_number (const std::string &column_name="id", size_t start=0) const


std::vector< flexible_type >	operator[] (int64_t i)

std::vector< flexible_type >	operator[] (int64_t i) const


gl_sframe	operator[] (const std::initializer_list< int64_t > &slice)

gl_sframe	operator[] (const std::initializer_list< int64_t > &slice) const

Column Indexing
Selects a single column of the SFrame. This returns an internal array reference object that can be used exactly like a gl_sarray. The design is quite similar to the reference object used by std::vector<bool> for indexing. For instance: gl_sframe sf{{"a", {1,2,3,4,5}}, {"b", {"1","2","3","4","5"}}, {"c", {1.0,2.0,3.0,4.0,5.0}}}; gl_sarray t = sf["a"]; // takes out column "a" However, this operator can also be used for modifying existing columns, or creating new columns. For instance: gl_sframe sf{{"a", {1,2,3,4,5}}, {"b", {"1","2","3","4","5"}}, {"c", {1.0,2.0,3.0,4.0,5.0}}}; sf["a"] = sf["a"] + 1; // sf["a"] is now {2,3,4,5,6} sf["d"] = sf["c"] - 1; // sf["d"] is now {0.0,1.0,2.0,3.0,4.0} Entire constant columns can also be created the same way: gl_sframe sf{{"a", {1,2,3,4,5}}, {"b", {"1","2","3","4","5"}}, {"c", {1.0,2.0,3.0,4.0,5.0}}}; sf["ones"] = 1; Since the returned object is meant to be a short-lived reference, the following is not permitted: gl_sframe sf{{"a", {1,2,3,4,5}}, {"b", {"1","2","3","4","5"}}, {"c", {1.0,2.0,3.0,4.0,5.0}}}; auto a_col = sf["a"]; since "auto" resolves to gl_sarray_reference, which is intentionally not copy-constructible. For functional alternatives, see replace_add_column, add_column(const flexible_type&, const std::string&), and add_column(const gl_sarray&, const std::string&).
const_gl_sarray_reference	operator[] (const std::string &column) const

gl_sarray_reference	operator[] (const std::string &column)

Multi-Column Indexing
Subselects a subset of columns, returning an SFrame containing only those columns. gl_sframe sf{{"a", {1,2,3,4,5}}, {"b", {"1","2","3","4","5"}}, {"c", {1.0,2.0,3.0,4.0,5.0}}}; gl_sframe ret = sf[{"a", "b"}]; // ret has 2 columns "a" and "b"
gl_sframe	operator[] (const std::vector< std::string > &columns) const

gl_sframe	operator[] (const std::initializer_list< std::string > &columns)

gl_sframe	operator[] (const std::initializer_list< std::string > &columns) const

Detailed Description

A proxy for the gl_sframe for the vertex and edge data of the SGraph

Definition at line 24 of file gl_gframe.hpp.

Member Function Documentation

◆ add_column() [1/2]

void turi::gl_gframe::add_column	(	const flexible_type &	data,
		const std::string &	name
	)

overridevirtual

Add a new column with constant value. If type is VERTEX_GFRAME, the column is added as a new vertex field (or edge field if type is EDGE_GFRAME) in the gl_sgraph.

Parameters

data	the constant value to fill the column
name	the name of the new column

See also: gl_sgraph::add_vertex_field(const flexible_type&, const std::string&); gl_sgraph::add_edge_field(const flexible_type&, const std::string&)

Reimplemented from turi::gl_sframe.

◆ add_column() [2/2]

void turi::gl_gframe::add_column	(	const gl_sarray &	data,
		const std::string &	name
	)

overridevirtual

Add a new column with given column name and data. If type is VERTEX_GFRAME, the column is added as a new vertex field (or edge field if type is EDGE_GFRAME) in the gl_sgraph.

Parameters

data	the gl_sarray holding the values of the new column
name	the name of the new column

See also: gl_sgraph::add_vertex_field(const gl_sarray&, const std::string&); gl_sgraph::add_edge_field(const gl_sarray&, const std::string&)

Reimplemented from turi::gl_sframe.

◆ add_columns()

void turi::gl_gframe::add_columns ( const gl_sframe & data )

overridevirtual

Batch version of add_column.

Parameters

data	a map from column name to column data

Reimplemented from turi::gl_sframe.

◆ add_row_number()

gl_sframe turi::gl_sframe::add_row_number	(	const std::string &	column_name = `"id"`,
		size_t	start = `0`
	)		const

inherited

Returns a new gl_sframe with a new column that numbers each row sequentially. By default the count starts at 0, but this can be changed to a positive or negative number. The new column will be named with the given column name. An error will be raised if the given column name already exists in the gl_sframe.

Parameters

column_name	Optional. The name of the new column that will hold the row numbers.
start	Optional. The number used to start the row number count.

Example:

auto sf = gl_sframe{{"a", {1, FLEX_UNDEFINED, FLEX_UNDEFINED}},
                    {"b", {"a", "b", FLEX_UNDEFINED}} };
std::cout << sf.add_row_number() << std::endl;

Produces output:

+----+------+------+
| id |  a   |  b   |
+----+------+------+
| 0  |  1   |  a   |
| 1  | None |  b   |
| 2  | None | None |
+----+------+------+
[3 rows x 3 columns]

◆ append()

gl_sframe turi::gl_sframe::append ( const gl_sframe & other ) const

inherited

Add the rows of a gl_sframe to the end of this gl_sframe. Both gl_sframe objects must have the same set of columns with the same column names and column types.

Parameters

other Another gl_sframe whose rows are appended to the current gl_sframe.

Example:

auto sf = gl_sframe({{"id", {4, 6, 8}},
                     {"val", {"D", "F", "H"}}});
auto sf2 = gl_sframe({{"id", {1, 2, 3}},
                      {"val", {"A", "B", "C"}}});
sf = sf.append(sf2);
std::cout <<  sf;

Produces output:

+----+-----+
| id | val |
+----+-----+
| 4  |  D  |
| 6  |  F  |
| 8  |  H  |
| 1  |  A  |
| 2  |  B  |
| 3  |  C  |
+----+-----+
[6 rows x 2 columns]

◆ apply()

gl_sarray turi::gl_sframe::apply	(	std::function< flexible_type(const sframe_rows::row &)>	fn,
		flex_type_enum	dtype
	)		const

inherited

Maps each row of the gl_sframe by a given function to a single value. The result gl_sarray is of type "dtype". "fn" should be a function that returns exactly one value which can be cast into the type specified by "dtype".

Parameters

fn	The function to transform each element. Must return exactly one value which can be cast into the type specified by "dtype".
dtype	The data type of the new gl_sarray.

Example:

gl_sframe sf{{"a", {1,2,3,4,5}},
             {"c", {1.0,2.0,3.0,4.0,5.0}}};
std::cout << sf.apply([](const sframe_rows::row& x) {
                        return x[0] * x[1];
                      }, flex_type_enum::FLOAT);

Produces output:

dtype: float
Rows: 5
[1.0, 4.0, 9.0, 16.0, 25.0]

See also: gl_sarray::apply

◆ column_index()

size_t turi::gl_sframe::column_index ( const std::string & column_name ) const

inherited

Returns the index of column column_name.

◆ column_name()

const std::string& turi::gl_sframe::column_name ( size_t index ) const

inherited

Returns the name of column index.

◆ column_names()

std::vector<std::string> turi::gl_gframe::column_names ( ) const

overridevirtual

Returns a list of column names. If type is VERTEX_GFRAME, the value is also the names of the vertex fields (or edge fields if type is EDGE_GFRAME) in the gl_sgraph.

See also: gl_sgraph::get_vertex_fields; gl_sgraph::get_edge_fields

Reimplemented from turi::gl_sframe.

◆ column_types()

std::vector<flex_type_enum> turi::gl_gframe::column_types ( ) const

overridevirtual

Returns a list of column types. If type is VERTEX_GFRAME, the value is also the types of the vertex fields (or edge fields if type is EDGE_GFRAME) in the gl_sgraph.

See also: gl_sgraph::get_vertex_field_types; gl_sgraph::get_edge_field_types

Reimplemented from turi::gl_sframe.

◆ construct_from_csvs()

void turi::gl_sframe::construct_from_csvs	(	std::string	csv_file,
		csv_parsing_config_map	csv_config,
		str_flex_type_map	column_type_hints
	)

inherited

Constructs a gl_sframe from a csv file

◆ contains_column()

bool turi::gl_sframe::contains_column ( const std::string & col_name ) const

inherited

Returns true if the column is present in the sframe, and false otherwise.

◆ dropna()

gl_sframe turi::gl_sframe::dropna	(	const std::vector< std::string > &	columns = `std::vector< std::string >()`,
		std::string	how = `"any"`,
		bool	recursive = `false`
	)		const

inherited

Remove missing values from a gl_sframe. A missing value is either "FLEX_UNDEFINED" or "NaN". If "how" is "any", a row will be removed if any of the columns in the "columns" parameter contains at least one missing value. If "how" is "all", a row will be removed if all of the columns in the "columns" parameter are missing values. If the "columns" parameter is not specified, the default is to consider all columns when searching for missing values.

Parameters

columns	Optional. The columns to use when looking for missing values. By default, all columns are used.
how	Optional. Specifies whether a row should be dropped if at least one column has missing values, or if all columns have missing values. "any" is default.

For instance

gl_sframe sf { {"a", {1, FLEX_UNDEFINED, FLEX_UNDEFINED}},
               {"b", {"a", "b", FLEX_UNDEFINED}} };
std::cout << sf.dropna() << std::endl;

Produces output:

+---+---+
| a | b |
+---+---+
| 1 | a |
+---+---+
[1 rows x 2 columns]

// Drop when all values are missing.

std::cout << sf.dropna({}, "all") << std::endl;

Produces output:

+------+---+
|  a   | b |
+------+---+
|  1   | a |
| None | b |
+------+---+
[2 rows x 2 columns]

Example:

// Drop rows where column "a" has a missing value.

std::cout << sf.dropna({"a"}) << std::endl;

Produces output:

+---+---+
| a | b |
+---+---+
| 1 | a |
+---+---+
[1 rows x 2 columns]

See also: dropna_split

◆ dropna_split()

std::pair<gl_sframe, gl_sframe> turi::gl_sframe::dropna_split	(	const std::vector< std::string > &	columns = `std::vector< std::string >()`,
		std::string	how = `"any"`,
		bool	recursive = `false`
	)		const

inherited

Split rows with missing values from this gl_sframe. This function has the same functionality as dropna, but returns a tuple of two gl_sframe objects. The first item is the expected output from dropna, and the second item contains all the rows filtered out by the "dropna" algorithm.

Parameters

columns	Optional. The columns to use when looking for missing values. By default, all columns are used.
how	Optional. Specifies whether a row should be dropped if at least one column has missing values, or if all columns have missing values. "any" is default.
recursive	Optional. It will recursively check whether a cell contains nan or not. This is handy for nested data structures like lists and dictionaries. For instance, {{FLEX_UNDEFINED, 1}, {1}} will be treated as nan and will be removed if recursive is set to true. Otherwise it won't be treated as a nan value.

Example:

gl_sframe sf { {"a", {1, FLEX_UNDEFINED, FLEX_UNDEFINED}},
               {"b", {"a", "b", FLEX_UNDEFINED}} };
gl_sframe good, bad;
std::tie(good, bad) = sf.dropna_split();
std::cout << good << std::endl;

Produces output:

+---+---+
| a | b |
+---+---+
| 1 | a |
+---+---+
[1 rows x 2 columns]

Example:

std::cout << bad << std::endl;

Produces output:

+------+------+
|  a   |  b   |
+------+------+
| None |  b   |
| None | None |
+------+------+
[2 rows x 2 columns]

See also: dropna

◆ empty()

bool turi::gl_sframe::empty ( ) const

inherited

True if size() == 0.

◆ fillna()

gl_sframe turi::gl_sframe::fillna	(	const std::string &	column,
		flexible_type	value
	)		const

inherited

Fill all missing values with a given value in a given column. If the "value" is not the same type as the values in "column", this method attempts to convert the value to the original column's type. If this fails, an error is raised.

Parameters

column	The name of the column to modify.
value	The value used to replace all missing values.
recursive	The recursive is used to set the manner of nan-value checking. If this value is true, a cell will be treated as missing value iff it contains nan. For instance, {{FLEX_UNDEFINED, 1}, {0}} and {FLEX_UNDEFINED, 1} will be all treated as nan-values.

Example:

gl_sframe sf {{"a", {1, FLEX_UNDEFINED, FLEX_UNDEFINED}},
              {"b", {"13.1", "17.2", FLEX_UNDEFINED}}};
sf = sf.fillna("a", 0);
std::cout << sf << std::endl;

Produces output:

+---+------+
| a |  b   |
+---+------+
| 1 | 13.1 |
| 0 | 17.2 |
| 0 | None |
+---+------+
[3 rows x 2 columns]

See also: dropna

◆ filter_by()

gl_sframe turi::gl_sframe::filter_by	(	const gl_sarray &	values,
		const std::string &	column_name,
		bool	exclude = `false`
	)		const

inherited

Filter a gl_sframe by values inside an iterable object. Result is a gl_sframe that only includes (or excludes) the rows that have a column with the given "column_name" which holds one of the values in the given "values" gl_sarray.

Parameters

values	The values to use to filter the gl_sframe. The resulting gl_sframe will only include rows that have one of these values in the given column.
column_name	The column of the gl_sframe to match with the given "values".
exclude	Optional. Defaults to false. If true, the result gl_sframe will contain all rows except those that have one of "values" in "column_name".

Example:

auto sf = gl_sframe({{"id", {1, 2, 3, 4}},
                     {"animal_type", {"dog", "cat", "cow", "horse"}},
                     {"name", {"bob", "jim", "jimbob", "bobjim"}}});
auto household_pets = {"cat", "hamster", "dog", "fish", "bird", "snake"};
std::cout << sf.filter_by(household_pets, "animal_type");
std::cout << sf.filter_by(household_pets, "animal_type", true);

Produces output:

+-------------+----+------+
| animal_type | id | name |
+-------------+----+------+
|     dog     | 1  | bob  |
|     cat     | 2  | jim  |
+-------------+----+------+
[2 rows x 3 columns]
+-------------+----+--------+
| animal_type | id |  name  |
+-------------+----+--------+
|    horse    | 4  | bobjim |
|     cow     | 3  | jimbob |
+-------------+----+--------+
[2 rows x 3 columns]

◆ groupby()

gl_sframe turi::gl_sframe::groupby	(	const std::vector< std::string > &	groupkeys,
		const std::map< std::string, aggregate::groupby_descriptor_type > &	operators = `std::map< std::string, aggregate::groupby_descriptor_type >()`
	)		const

inherited

Perform a group on the groupkeys columns followed by aggregations on the columns listed in operators. The operators parameter is a map that indicates which aggregation operators to use and which columns to use them on. The available operators are SUM, MAX, MIN, COUNT, AVG, VAR, STDV, CONCAT, SELECT_ONE, ARGMIN, ARGMAX, and QUANTILE. For convenience, aggregators MEAN, STD, and VARIANCE are available as synonyms for AVG, STDV, and VAR. See turi::aggregate for more detail on the aggregators.

Parameters

groupkeys	Columns to group on. Type of key columns can be of any type other than dictionary.
operators	Map of columns and aggregation operations. Each key is an output column name and each value is an aggregator.

Suppose we have an SFrame (sf) with movie ratings by many users.

+---------+----------+--------+
| user_id | movie_id | rating |
+---------+----------+--------+
|  25904  |   1663   |   3    |
|  25907  |   1663   |   3    |
|  25923  |   1663   |   3    |
|  25924  |   1663   |   3    |
|  25928  |   1663   |   2    |
|  25933  |   1663   |   4    |
|  25934  |   1663   |   4    |
|  25935  |   1663   |   4    |
|  25936  |   1663   |   5    |
|  25937  |   1663   |   2    |
|   ...   |   ...    |  ...   |
+---------+----------+--------+
[10000 rows x 3 columns]

Compute the number of occurrences of each user.

auto user_count = sf.groupby({"user_id"},
                             {{"count", aggregate::COUNT()}});
std::cout << user_count;

+---------+-------+
| user_id | count |
+---------+-------+
|  62361  |   1   |
|  30727  |   1   |
|  40111  |   1   |
|  50513  |   1   |
|  35140  |   1   |
|  42352  |   1   |
|  29667  |   1   |
|  46242  |   1   |
|  58310  |   1   |
|  64614  |   1   |
|   ...   |  ...  |
+---------+-------+
[9852 rows x 2 columns]

Compute the mean and standard deviation of ratings per user.

auto  user_rating_stats = sf.groupby({"user_id"},
                                     {{ "mean_rating", aggregate::MEAN("rating")},
                                      {"std_rating", aggregate::STD("rating")}});
std::cout << user_rating_stats;

+---------+-------------+------------+
| user_id | mean_rating | std_rating |
+---------+-------------+------------+
|  62361  |     5.0     |    0.0     |
|  30727  |     4.0     |    0.0     |
|  40111  |     2.0     |    0.0     |
|  50513  |     4.0     |    0.0     |
|  35140  |     4.0     |    0.0     |
|  42352  |     5.0     |    0.0     |
|  29667  |     4.0     |    0.0     |
|  46242  |     5.0     |    0.0     |
|  58310  |     2.0     |    0.0     |
|  64614  |     2.0     |    0.0     |
|   ...   |     ...     |    ...     |
+---------+-------------+------------+
[9852 rows x 3 columns]

Compute the movie with the minimum rating per user.

auto chosen_movies = sf.groupby({"user_id"},
                                {{ "worst_movies", aggregate::ARGMIN("rating","movie_id")}});
std::cout <<  chosen_movies;

+---------+--------------+
| user_id | worst_movies |
+---------+--------------+
|  62361  |     1663     |
|  30727  |     1663     |
|  40111  |     1663     |
|  50513  |     1663     |
|  35140  |     1663     |
|  42352  |     1663     |
|  29667  |     1663     |
|  46242  |     1663     |
|  58310  |     1663     |
|  64614  |     1663     |
|   ...   |     ...      |
+---------+--------------+
[9852 rows x 2 columns]

Compute the count, mean, and standard deviation of ratings per (user, time), automatically assigning output column names.

// make up some time column which is a combination of user id and movie id
sf["time"] = sf.apply([](const sframe_rows::row& x) {
                         return (x[0] + x[1]) % 11 + 2000;
                       }, flex_type_enum::INTEGER);
auto user_rating_stats = sf.groupby({"user_id", "time"},
                                    {{"Count", aggregate::COUNT()},
                                     {"Avg of rating", aggregate::AVG("rating")},
                                     {"Stdv of rating", aggregate::STDV("rating")}});
std::cout <<  user_rating_stats;

+------+---------+-------+---------------+----------------+
| time | user_id | Count | Avg of rating | Stdv of rating |
+------+---------+-------+---------------+----------------+
| 2006 |  61285  |   1   |      4.0      |      0.0       |
| 2000 |  36078  |   1   |      4.0      |      0.0       |
| 2003 |  47158  |   1   |      3.0      |      0.0       |
| 2007 |  34446  |   1   |      3.0      |      0.0       |
| 2010 |  47990  |   1   |      3.0      |      0.0       |
| 2003 |  42120  |   1   |      5.0      |      0.0       |
| 2007 |  44940  |   1   |      4.0      |      0.0       |
| 2008 |  58240  |   1   |      4.0      |      0.0       |
| 2002 |   102   |   1   |      1.0      |      0.0       |
| 2009 |  52708  |   1   |      3.0      |      0.0       |
| ...  |   ...   |  ...  |      ...      |      ...       |
+------+---------+-------+---------------+----------------+
[10000 rows x 5 columns]

The groupby function can take a variable length list of aggregation specifiers so if we want the count and the 0.25 and 0.75 quantiles of ratings:

auto user_rating_stats = sf.groupby({"user_id", "time"},
                                    {{"Count", aggregate::COUNT()},
                                     {"rating_quantiles", aggregate::QUANTILE("rating",{0.25, 0.75}) }});
std::cout <<  user_rating_stats;

+------+---------+-------+------------------------+
| time | user_id | Count |    rating_quantiles    |
+------+---------+-------+------------------------+
| 2006 |  61285  |   1   |      [4.0, 4.0]        |
| 2000 |  36078  |   1   |      [4.0, 4.0]        |
| 2003 |  47158  |   1   |      [3.0, 3.0]        |
| 2007 |  34446  |   1   |      [3.0, 3.0]        |
| 2010 |  47990  |   1   |      [3.0, 3.0]        |
| 2003 |  42120  |   1   |      [5.0, 5.0]        |
| 2007 |  44940  |   1   |      [4.0, 4.0]        |
| 2008 |  58240  |   1   |      [4.0, 4.0]        |
| 2002 |   102   |   1   |      [1.0, 1.0]        |
| 2009 |  52708  |   1   |      [3.0, 3.0]        |
| ...  |   ...   |  ...  |          ...           |
+------+---------+-------+------------------------+
[10000 rows x 4 columns]

To put all items a user rated into one list value by their star rating:

auto  user_rating_stats = sf.groupby({"user_id", "rating"},
                                     {{"rated_movie_ids",aggregate::CONCAT("movie_id")}});
std::cout <<  user_rating_stats;

+--------+---------+----------------------+
| rating | user_id |     rated_movie_ids  |
+--------+---------+----------------------+
|   3    |  31434  | array("d", [1663.0]) |
|   5    |  25944  | array("d", [1663.0]) |
|   4    |  38827  | array("d", [1663.0]) |
|   4    |  51437  | array("d", [1663.0]) |
|   4    |  42549  | array("d", [1663.0]) |
|   4    |  49532  | array("d", [1663.0]) |
|   3    |  26124  | array("d", [1663.0]) |
|   4    |  46336  | array("d", [1663.0]) |
|   4    |  52133  | array("d", [1663.0]) |
|   5    |  62361  | array("d", [1663.0]) |
|  ...   |   ...   |         ...          |
+--------+---------+----------------------+
[9952 rows x 3 columns]

To put all items and rating of a given user together into a dictionary value:

auto  user_rating_stats = sf.groupby({"user_id"},
                                     {{"movie_rating",aggregate::CONCAT("movie_id", "rating")}});
std::cout <<  user_rating_stats;

+---------+--------------+
| user_id | movie_rating |
+---------+--------------+
|  62361  |  {1663: 5}   |
|  30727  |  {1663: 4}   |
|  40111  |  {1663: 2}   |
|  50513  |  {1663: 4}   |
|  35140  |  {1663: 4}   |
|  42352  |  {1663: 5}   |
|  29667  |  {1663: 4}   |
|  46242  |  {1663: 5}   |
|  58310  |  {1663: 2}   |
|  64614  |  {1663: 2}   |
|   ...   |     ...      |
+---------+--------------+
[9852 rows x 2 columns]

See also: aggregate

◆ has_size()

bool turi::gl_sframe::has_size ( ) const

inherited

Returns true if the size of the SFrame is known. If it is not known, calling size() may trigger materialization.

◆ head()

gl_sframe turi::gl_sframe::head ( size_t n ) const

inherited

Returns a gl_sframe which contains the first n rows of this gl_sframe.

Parameters

n	The number of rows to fetch.

◆ is_materialized()

bool turi::gl_sframe::is_materialized ( ) const

inherited

Returns whether or not the SFrame has been materialized.

See also: materialize

◆ join() [1/2]

gl_sframe turi::gl_sframe::join	(	const gl_sframe &	right,
		const std::vector< std::string > &	joinkeys,
		const std::string &	how = `"inner"`
	)		const

inherited

Joins two gl_sframe objects. Merges the current (left) gl_sframe with the given (right) gl_sframe using a SQL-style equi-join operation by columns.

Parameters

right	The gl_sframe to join.
joinkeys	The column name(s) representing the set of join keys. Each row that has the same value in this set of columns will be merged together.
how	Optional. The type of join to perform. "inner" is default. "inner" : Equivalent to a SQL inner join. Result consists of the rows from the two frames whose join key values match exactly, merged together into one gl_sframe. "left" : Equivalent to a SQL left outer join. Result is the union between the result of an inner join and the rest of the rows from the left gl_sframe, merged with missing values. "right" : Equivalent to a SQL right outer join. Result is the union between the result of an inner join and the rest of the rows from the right gl_sframe, merged with missing values. "outer" : Equivalent to a SQL full outer join. Result is the union between the result of a left outer join and a right outer join.

Example:

auto animals = gl_sframe({{"id", {1, 2, 3, 4}},
                          {"name", {"dog", "cat", "sheep", "cow"}}});
auto sounds = gl_sframe({{"id", {1, 3, 4, 5}},
                         {"sound", {"woof", "baa", "moo", "oink"}}});
std::cout <<  animals.join(sounds, {"id"});
std::cout <<  animals.join(sounds, {"id"}, "left");
std::cout <<  animals.join(sounds, {"id"}, "right");
std::cout <<  animals.join(sounds, {"id"}, "outer");

Produces output:

+----+-------+-------+
| id |  name | sound |
+----+-------+-------+
| 1  |  dog  |  woof |
| 3  | sheep |  baa  |
| 4  |  cow  |  moo  |
+----+-------+-------+
[3 rows x 3 columns]
+----+-------+-------+
| id |  name | sound |
+----+-------+-------+
| 1  |  dog  |  woof |
| 3  | sheep |  baa  |
| 4  |  cow  |  moo  |
| 2  |  cat  |  None |
+----+-------+-------+
[4 rows x 3 columns]
+----+-------+-------+
| id |  name | sound |
+----+-------+-------+
| 1  |  dog  |  woof |
| 3  | sheep |  baa  |
| 4  |  cow  |  moo  |
| 5  |  None |  oink |
+----+-------+-------+
[4 rows x 3 columns]
+----+-------+-------+
| id |  name | sound |
+----+-------+-------+
| 1  |  dog  |  woof |
| 3  | sheep |  baa  |
| 4  |  cow  |  moo  |
| 5  |  None |  oink |
| 2  |  cat  |  None |
+----+-------+-------+
[5 rows x 3 columns]

◆ join() [2/2]

gl_sframe turi::gl_sframe::join	(	const gl_sframe &	right,
		const std::map< std::string, std::string > &	joinkeys,
		const std::string &	how = `"inner"`
	)		const

inherited

Joins two gl_sframe objects. Merges the current (left) gl_sframe with the given (right) gl_sframe using a SQL-style equi-join operation by columns.

Parameters

right	The gl_sframe to join.
joinkeys	The column name(s) representing a map of join keys from left to right. Each key is taken as a column name on the left gl_sframe and each value is taken as the column name in the right gl_sframe.
how	Optional. The type of join to perform. "inner" is default. "inner" : Equivalent to a SQL inner join. Result consists of the rows from the two frames whose join key values match exactly, merged together into one gl_sframe. "left" : Equivalent to a SQL left outer join. Result is the union between the result of an inner join and the rest of the rows from the left gl_sframe, merged with missing values. "right" : Equivalent to a SQL right outer join. Result is the union between the result of an inner join and the rest of the rows from the right gl_sframe, merged with missing values. "outer" : Equivalent to a SQL full outer join. Result is the union between the result of a left outer join and a right outer join.

Example:

auto animals = gl_sframe({{"id", {1, 2, 3, 4}},
                          {"name", {"dog", "cat", "sheep", "cow"}}});
auto sounds = gl_sframe({{"id", {1, 3, 4, 5}},
                         {"sound", {"woof", "baa", "moo", "oink"}}});
std::cout <<  animals.join(sounds, std::map<std::string, std::string>{{"id", "id"}});

Produces output:

+----+-------+-------+
| id |  name | sound |
+----+-------+-------+
| 1  |  dog  |  woof |
| 3  | sheep |  baa  |
| 4  |  cow  |  moo  |
+----+-------+-------+
[3 rows x 3 columns]

◆ materialize()

void turi::gl_sframe::materialize ( )

inherited

For an SFrame that is lazily evaluated, force this SFrame to be persisted to disk, committing all lazily evaluated operations.

See also: is_materialized

◆ materialize_to_callback()

void turi::gl_sframe::materialize_to_callback	(	std::function< bool(size_t, const std::shared_ptr< sframe_rows > &)>	callback,
		size_t	nthreads = `(size_t)(-1)`
	)

inherited

Calls a callback function passing each row of the SFrame.

This does not materialize the array if not necessary.

The callback may be called in parallel, in which case the first argument provides a thread number. The function should normally return false, but may return true at any time to quit the iteration process. It may also throw exceptions, which will be forwarded to the caller of this function.

Each call to the callback passes:

a thread id,
a shared_ptr to an sframe_rows object

The sframe_rows object looks like a vector<vector<flexible_type>>. i.e. to look at all the rows, you need to write:

sf.materialize_to_callback([&](size_t, const std::shared_ptr<sframe_rows>& rows) {
  for(const auto& row: *rows) {
     // each row looks like an std::vector<flexible_type>
     // and can be cast to a vector<flexible_type> if necessary
  }
});

Parameters

callback	The callback to call
nthreads	Number of threads. If not specified, #cpus is used

◆ num_columns()

size_t turi::gl_gframe::num_columns ( ) const

overridevirtual

Returns number of columns. If type is VERTEX_GFRAME, the value is also the number of vertex fields (or edge fields if type is EDGE_GFRAME) in the gl_sgraph.

Reimplemented from turi::gl_sframe.

◆ operator[]() [1/5]

std::vector<flexible_type> turi::gl_sframe::operator[] ( int64_t i )

inherited

Returns the value at a particular array index; generally inefficient.

This returns the value of the array at a particular index. Will raise an exception if the index is out of bounds. This operation is generally inefficient: the range_iterator() is preferred.

◆ operator[]() [2/5]

std::vector<flexible_type> turi::gl_sframe::operator[] ( int64_t i ) const

inherited

Returns the value at a particular array index; generally inefficient.

This returns the value of the array at a particular index. Will raise an exception if the index is out of bounds. This operation is generally inefficient: the range_iterator() is preferred.

◆ operator[]() [3/5]

gl_sframe turi::gl_sframe::operator[] ( const std::initializer_list< int64_t > & slice )

inherited

Performs a slice Python style.

Parameters

slice A list of 2 or 3 values. If 2 values, this is interpreted as {start, end} indices, with an implicit value of step = 1. If 3 values, this is interpreted as {start, step, end}. Values at the positions [start, start+step, start+2*step, ...] are returned until end (exclusive) is reached. Negative start and end values are interpreted as offsets from the end of the array.

Given a gl_sframe

gl_sarray a{1,2,3,4,5,6,7,8,9,10};

gl_sframe sf{{"a", a}}

Slicing a consecutive range:

auto ret = a[{1,4}]; // start at index 1, end at index 4

// ret is a gl_sframe with one column a: [2,3,4]

Slicing a range with a step:

auto ret = a[{1,2,8}]; // start at index 1, end at index 8 with step size 2

// ret is a gl_sframe with one column a: [2,4,6,8]

Using negative indexing:

auto ret = a[{-3,-1}]; // start at end - 3, end at index end - 1

// ret a gl_sframe with one column a: [8,9]

◆ operator[]() [4/5]

gl_sframe turi::gl_sframe::operator[] ( const std::initializer_list< int64_t > & slice ) const

inherited

Performs a slice Python style.

Parameters

slice A list of 2 or 3 values. If 2 values, this is interpreted as {start, end} indices, with an implicit value of step = 1. If 3 values, this is interpreted as {start, step, end}. Values at the positions [start, start+step, start+2*step, ...] are returned until end (exclusive) is reached. Negative start and end values are interpreted as offsets from the end of the array.

Given a gl_sframe

gl_sarray a{1,2,3,4,5,6,7,8,9,10};

gl_sframe sf{{"a", a}}

Slicing a consecutive range:

auto ret = a[{1,4}]; // start at index 1, end at index 4

// ret is a gl_sframe with one column a: [2,3,4]

Slicing a range with a step:

auto ret = a[{1,2,8}]; // start at index 1, end at index 8 with step size 2

// ret is a gl_sframe with one column a: [2,4,6,8]

Using negative indexing:

auto ret = a[{-3,-1}]; // start at end - 3, end at index end - 1

// ret a gl_sframe with one column a: [8,9]

◆ operator[]() [5/5]

gl_sframe turi::gl_sframe::operator[] ( const gl_sarray & logical_filter ) const

inherited

Performs a logical filter.

This function performs a logical filter: i.e. it subselects all the elements in this array where the corresponding value in the other array evaluates to true.

gl_sframe sf{{"a", {1,2,3,4,5}},
             {"b", {"1","2","3","4","5"}},
             {"c", {1.0,2.0,3.0,4.0,5.0}}};
auto ret = sf[sf["a"] > 1 && sf["a"] <= 4];
// ret is now the sframe with 3 columns:
// a: [2,3,4]
// b: ["2","3","4"]
// c: [2.0,3.0,4.0]

◆ pack_columns() [1/2]

gl_sframe turi::gl_sframe::pack_columns	(	const std::vector< std::string > &	columns,
		const std::string &	new_column_name,
		flex_type_enum	dtype = `flex_type_enum::LIST`,
		flexible_type	fill_na = `FLEX_UNDEFINED`
	)		const

inherited

This is an overloaded member function, provided for convenience. It differs from the above function only in what argument(s) it accepts. Pack two or more columns of the current gl_sframe into one single column. The result is a new gl_sframe with the unaffected columns from the original gl_sframe plus the newly created column.

The type of the resulting column is decided by the "dtype" parameter. Allowed values for "dtype" are flex_type_enum::DICT , flex_type_enum::VECTOR or flex_type_enum::LIST

flex_type_enum::DICT : pack to a dictionary gl_sarray where column name becomes dictionary key and column value becomes dictionary value
flex_type_enum::VECTOR : pack all values from the packing columns into an array
flex_type_enum::LIST : pack all values from the packing columns into a list.

Parameters

columns	A list of column names to be packed. There must be at least two columns to pack.
new_column_name	Packed column name.
dtype	Optional. The resulting packed column type. If not provided, dtype is list.
fill_na	Optional. Value to fill into packed column if missing value is encountered. If packing to dictionary, "fill_na" is only applicable to dictionary values; missing keys are not replaced.

Example: Suppose 'sf' is an SFrame that maintains business category information.

auto sf = gl_sframe({{"business", {1,2,3,4}},
                     {"category.retail", {1, FLEX_UNDEFINED, 1, FLEX_UNDEFINED}},
                     {"category.food", {1, 1, FLEX_UNDEFINED, FLEX_UNDEFINED}},
                     {"category.service", {FLEX_UNDEFINED, 1, 1, FLEX_UNDEFINED}},
                     {"category.shop", {1, 1, FLEX_UNDEFINED, 1}}});
std::cout <<  sf;

+----------+-----------------+---------------+------------------+---------------+
| business | category.retail | category.food | category.service | category.shop |
+----------+-----------------+---------------+------------------+---------------+
|    1     |        1        |       1       |       None       |       1       |
|    2     |       None      |       1       |        1         |       1       |
|    3     |        1        |      None     |        1         |      None     |
|    4     |       None      |      None     |       None       |       1       |
+----------+-----------------+---------------+------------------+---------------+
[4 rows x 5 columns]

To pack all category columns into a list:

std::cout <<  sf.pack_columns({"category.retail", "category.food",
                               "category.service", "category.shop"},
                               "category");

+----------+-----------------------+
| business |        category       |
+----------+-----------------------+
|    1     |    [1, 1, None, 1]    |
|    2     |    [None, 1, 1, 1]    |
|    3     |   [1, None, 1, None]  |
|    4     | [None, None, None, 1] |
+----------+-----------------------+
[4 rows x 2 columns]

To pack all category columns into a dictionary:

std::cout << sf.pack_columns({"category.retail", "category.food",
                              "category.service", "category.shop"},
                              "category",
                              flex_type_enum::DICT);

+----------+--------------------------------+
| business |               X2               |
+----------+--------------------------------+
|    1     | {'category.retail': 1, 'ca ... |
|    2     | {'category.food': 1, 'cate ... |
|    3     | {'category.retail': 1, 'ca ... |
|    4     | {'category.food': 1, 'cate ... |
+----------+--------------------------------+
[4 rows x 2 columns]

See also: gl_sframe::unpack

◆ pack_columns() [2/2]

gl_sframe turi::gl_sframe::pack_columns	(	const std::string &	column_prefix,
		const std::string &	new_column_name,
		flex_type_enum	dtype = `flex_type_enum::LIST`,
		flexible_type	fill_na = `FLEX_UNDEFINED`
	)		const

inherited

Pack two or more columns of the current gl_sframe with a common column name prefix into one single column. The result is a new gl_sframe with the unaffected columns from the original gl_sframe plus the newly created column.

The type of the resulting column is decided by the "dtype" parameter. Allowed values for "dtype" are flex_type_enum::DICT , flex_type_enum::VECTOR or flex_type_enum::LIST

flex_type_enum::DICT : pack to a dictionary gl_sarray where column name becomes dictionary key and column value becomes dictionary value
flex_type_enum::VECTOR : pack all values from the packing columns into an array
flex_type_enum::LIST : pack all values from the packing columns into a list.

Parameters

column_prefix	Packs all columns with the given prefix.
new_column_name	Packed column name.
dtype	Optional. The resulting packed column type. If not provided, dtype is list.
fill_na	Optional. Value to fill into packed column if missing value is encountered. If packing to dictionary, "fill_na" is only applicable to dictionary values; missing keys are not replaced.

Example: Suppose 'sf' is an SFrame that maintains business category information.

auto sf = gl_sframe({{"business", {1,2,3,4}},
                     {"category.retail", {1, FLEX_UNDEFINED, 1, FLEX_UNDEFINED}},
                     {"category.food", {1, 1, FLEX_UNDEFINED, FLEX_UNDEFINED}},
                     {"category.service", {FLEX_UNDEFINED, 1, 1, FLEX_UNDEFINED}},
                     {"category.shop", {1, 1, FLEX_UNDEFINED, 1}}});
std::cout <<  sf;

+----------+-----------------+---------------+------------------+---------------+
| business | category.retail | category.food | category.service | category.shop |
+----------+-----------------+---------------+------------------+---------------+
|    1     |        1        |       1       |       None       |       1       |
|    2     |       None      |       1       |        1         |       1       |
|    3     |        1        |      None     |        1         |      None     |
|    4     |       None      |      None     |       None       |       1       |
+----------+-----------------+---------------+------------------+---------------+
[4 rows x 5 columns]

To pack all category columns into a list:

std::cout << sf.pack_columns("category", "category");

+----------------+----------------+
| business       | category       |
+----------------+----------------+
| 1              | [1,1,,1]       |
| 2              | [,1,1,1]       |
| 3              | [1,,1,]        |
| 4              | [,,,1]         |
+----------------+----------------+
[4 rows x 2 columns]

To pack all category columns into a dictionary:

std::cout << sf.pack_columns("category",
                             "category",
                             flex_type_enum::DICT);

+----------+--------------------------------+
| business |               X2               |
+----------+--------------------------------+
|    1     | {'category.retail': 1, 'ca ... |
|    2     | {'category.food': 1, 'cate ... |
|    3     | {'category.retail': 1, 'ca ... |
|    4     | {'category.food': 1, 'cate ... |
+----------+--------------------------------+
[4 rows x 2 columns]

See also: gl_sframe::unpack

◆ plot()

std::shared_ptr<model_base> turi::gl_sframe::plot ( ) const

inherited

Return a plot object of the SFrame (same visualization as show)

◆ random_split() [1/2]

std::pair<gl_sframe, gl_sframe> turi::gl_sframe::random_split ( double fraction ) const

inherited

Randomly split the rows of a gl_sframe into two gl_sframe objects. The first gl_sframe contains M rows, sampled uniformly (without replacement) from the original gl_sframe. M is approximately the fraction times the original number of rows. The second gl_sframe contains the remaining rows of the original gl_sframe.

Parameters

fraction	Approximate fraction of the rows to fetch for the first returned gl_sframe. Must be between 0 and 1.
seed	Optional. Seed for the random number generator used to split.

Example:

auto sf = gl_sframe({{"id", gl_sarray::from_sequence(0, 1024)}});
gl_sframe sf_train, sf_test;
std::tie(sf_train, sf_test) = sf.random_split(.95);
std::cout <<  sf_test.size() << " " << sf_train.size() << "\n";

Produces output:

102 922

◆ random_split() [2/2]

std::pair<gl_sframe, gl_sframe> turi::gl_sframe::random_split	(	double	fraction,
		size_t	seed,
		bool	exact = `false`
	)		const

inherited

Randomly split the rows of a gl_sframe into two gl_sframe objects. The first gl_sframe contains M rows, sampled uniformly (without replacement) from the original gl_sframe. M is approximately the fraction times the original number of rows. The second gl_sframe contains the remaining rows of the original gl_sframe.

Parameters

fraction	Approximate fraction of the rows to fetch for the first returned gl_sframe. Must be between 0 and 1.
seed	The random seed for the random number generator. Deterministic output is obtained if this is set to a constant.

Example:

auto sf = gl_sframe({{"id", gl_sarray::from_sequence(0, 1024)}});
gl_sframe sf_train, sf_test;
std::tie(sf_train, sf_test) = sf.random_split(.95, 12345);
std::cout <<  sf_test.size() << " " << sf_train.size() << "\n";

Produces output:

44 980

◆ range_iterator()

gl_sframe_range turi::gl_sframe::range_iterator	(	size_t	start = `0`,
		size_t	end = `(size_t)(-1)`
	)		const

inherited

Returns a one pass range object with begin() and end() iterators.

This will materialize the array.

See materialize_to_callback for a lazy version.

Parameters

start	The starting index of the range
end	The ending index of the range

// create an SFrame
gl_sframe sf{{"a", {1,2,3,4,5}},
             {"b", {"1","2","3","4","5"}},
             {"c", {1.0,2.0,3.0,4.0,5.0}}};
// get a range over the entire frame
auto ra = sf.range_iterator();
auto iter = ra.begin();
while (iter != ra.end()) {
  std::vector<flexible_type> val = *iter;
  // do something to val
  ++iter;
}

Or more compactly with C++11 syntax:

for(const auto& val: sf.range_iterator()) {
  std::cout << val[0] << " " << val[1] << " " << val[2] << "\n";
}

The range returned only supports one pass. The outcome of a second call to begin() is undefined after any iterator is advanced.

When iterating over a gl_sframe with many columns, if only a small number of columns are needed, there is a performance benefit to subselecting just those columns first before iterating.

i.e. if I only need columns "a" and "b" from the SFrame above:

for(const auto& val: sf[{"a","b"}].range_iterator()) {
  std::cout << val[0] << " " << val[1] << "\n";
}

See also: gl_sframe_range

◆ remove_column()

void turi::gl_gframe::remove_column ( const std::string & name )

overridevirtual

Remove a column with the given name. If type is VERTEX_GFRAME, the column is removed from vertex data (or edge data if type is EDGE_GFRAME) from the gl_sgraph.

Parameters

name	the column name to be removed

See also: gl_sgraph::remove_vertex_field; gl_sgraph::remove_edge_field

Reimplemented from turi::gl_sframe.

◆ rename()

void turi::gl_gframe::rename ( const std::map< std::string, std::string > & old_to_new_names )

overridevirtual

Rename columns.

Parameters

old_to_new_names map from old column name to new column name.

See also: gl_sgraph::rename_vertex_fields; gl_sgraph::rename_edge_fields

Reimplemented from turi::gl_sframe.

◆ replace_add_column()

virtual void turi::gl_sframe::replace_add_column	(	const gl_sarray &	data,
		const std::string &	name = `""`
	)

virtualinherited

Add a column to this gl_sframe, replacing the column with the same name if it already exists. The number of elements in the data given must match the length of every other column of the gl_sframe. This operation modifies the current gl_sframe in place. If no name is given, a default name is chosen.

Parameters

data	The column of data to add.
name	Optional. The name of the column. If no name is given, a default name is chosen.

This is equivalent to using operator[] for column assignment.

sf[name] = data;

Example:

auto sf = gl_sframe({{"id", {1, 2, 3}},
                     {"val", {"A", "B", "C"}}});
auto sa = gl_sarray({"cat", "dog", "fossa"});
sf.replace_add_column(sa, "species");
std::cout <<  sf;

Produces output:

+----+-----+---------+
| id | val | species |
+----+-----+---------+
| 1  |  A  |   cat   |
| 2  |  B  |   dog   |
| 3  |  C  |  fossa  |
+----+-----+---------+
[3 rows x 3 columns]

See also: add_column(const gl_sarray&, const std::string&)

◆ sample() [1/2]

gl_sframe turi::gl_sframe::sample ( double fraction ) const

inherited

Create a gl_sframe which contains a subsample of the current gl_sframe.

Parameters

fraction The fraction of the rows to fetch. Must be between 0 and 1.

Example:

gl_sframe sf{{"a", {1,2,3,4,5}},
             {"b", {1.0,2.0,3.0,4.0,5.0}}};
std::cout <<  sf.sample(.3);

Produces output:

Columns:
    a  integer
    b  float
Rows: ?
Data:
+----------------+----------------+
| a              | b              |
+----------------+----------------+
| 4              | 4              |
| 5              | 5              |
+----------------+----------------+
[? rows x 2 columns]

◆ sample() [2/2]

gl_sframe turi::gl_sframe::sample	(	double	fraction,
		size_t	seed,
		bool	exact = `false`
	)		const

inherited

Create a gl_sframe which contains a subsample of the current gl_sframe.

Parameters

fraction	The fraction of the rows to fetch. Must be between 0 and 1.
seed	The random seed for the random number generator. Deterministic output is obtained if this is set to a constant.

Example:

gl_sframe sf{{"a", {1,2,3,4,5}},
             {"b", {1.0,2.0,3.0,4.0,5.0}}};
std::cout <<  sf.sample(.3, 12345);

Produces output:

Columns:
    a  integer
    b  float
Rows: ?
Data:
+----------------+----------------+
| a              | b              |
+----------------+----------------+
| 4              | 4              |
| 5              | 5              |
+----------------+----------------+
[? rows x 2 columns]

◆ save()

void turi::gl_sframe::save	(	const std::string &	path,
		const std::string &	format = `""`
	)		const

inherited

Saves the SFrame to file.

When format is "binary", the saved SFrame will be in a directory named by the path parameter. When format is "text" or "csv", it is saved as a single human readable text file.

Parameters

path A local path or a remote URL. If format is 'text', it will be saved as a text file. If format is 'binary', a directory will be created at the location which will contain the SFrame.

format Optional. Either "binary", "csv" or "". Defaults to "". Format in which to save the SFrame. Binary saved SFrames can be loaded much faster and without any format conversion losses. If "csv", each row will be written as a single line in an output text file. If format is an empty string (default), we will try to infer the format from the path given. If the path ends with "csv", or ".csv.gz", then the gl_sframe is saved in "csv" format; otherwise the gl_sframe is saved in "binary" format.

◆ save_reference()

void turi::gl_sframe::save_reference ( const std::string & path ) const

inherited

Performs an incomplete save of an existing SFrame into a directory. This saved SFrame may reference SFrames in other locations in the same filesystem for certain columns/segments/etc.

Does not modify the current sframe.

◆ select_column()

gl_sarray turi::gl_sframe::select_column ( const std::string & colname ) const

inherited

Extracts one column of the gl_sframe.

This is equivalent to using operator[] for column indexing.

Equivalent to:

sf[colname];

See also: select_columns

◆ select_columns()

gl_sframe turi::gl_sframe::select_columns ( const std::vector< std::string > & colnames ) const

inherited

Extracts a collection of columns of the gl_sframe.

This is equivalent to using operator[] for selecting multiple columns

sf[colnames];

See also: select_column

◆ show()

void turi::gl_sframe::show ( const std::string & path_to_client ) const

inherited

Show a visualization of the SFrame.

◆ size()

size_t turi::gl_gframe::size ( ) const

overridevirtual

Returns number of rows. If type is VERTEX_GFRAME, the value is also the number of vertices (or edges if type is EDGE_GFRAME) in the gl_sgraph.

See also: gl_sgraph::num_vertices; gl_sgraph::num_edges

Reimplemented from turi::gl_sframe.

◆ sort() [1/4]

gl_sframe turi::gl_sframe::sort	(	const std::string &	column,
		bool	ascending = `true`
	)		const

inherited

Sort current gl_sframe by a single column, using the given sort order.

Only columns of type str, int, or float can be sorted.

Parameters

column	The name of the column to be sorted.
ascending	Optional. Sort all columns in the given order.

Example:

gl_sframe sf{ {"a", {1,3,2,1}},
              {"b", {"a","c","b","b"}},
              {"c", {"x","y","z","y"}} };
std::cout << sf.sort("a") << std::endl;

Produces output:

+---+---+---+
| a | b | c |
+---+---+---+
| 1 | a | x |
| 1 | b | y |
| 2 | b | z |
| 3 | c | y |
+---+---+---+
[4 rows x 3 columns]

Example:

// To sort by column "a", descending

std::cout << sf.sort("a", false) << std::endl;

Produces output:

+---+---+---+
| a | b | c |
+---+---+---+
| 3 | c | y |
| 2 | b | z |
| 1 | a | x |
| 1 | b | y |
+---+---+---+
[4 rows x 3 columns]

See also: topk

◆ sort() [2/4]

gl_sframe turi::gl_sframe::sort	(	const std::vector< std::string > &	columns,
		bool	ascending = `true`
	)		const

inherited

This is an overloaded member function, provided for convenience. It differs from the above function only in what argument(s) it accepts.

Sort current gl_sframe by multiple columns, using the given sort order.

Parameters

columns	The names of the columns to be sorted.
ascending	Optional. Sort all columns in the given order.

The result will be sorted first by first column, followed by second column, and so on. All columns will be sorted in the same order as governed by the "ascending" parameter.

Example:

// To sort by column "a" and "b", all ascending

std::cout << sf.sort({"a", "b"}) << std::endl;

Produces output:

+---+---+---+
| a | b | c |
+---+---+---+
| 1 | a | x |
| 1 | b | y |
| 2 | b | z |
| 3 | c | y |
+---+---+---+
[4 rows x 3 columns]

See also: topk

◆ sort() [3/4]

gl_sframe turi::gl_sframe::sort	(	const std::initializer_list< std::string > &	columns,
		bool	ascending = `true`
	)		const

inherited

This is an overloaded member function, provided for convenience. It differs from the above function only in what argument(s) it accepts.

◆ sort() [4/4]

gl_sframe turi::gl_sframe::sort ( const std::vector< std::pair< std::string, bool >> & column_and_ascending ) const

inherited

This is an overloaded member function, provided for convenience. It differs from the above function only in what argument(s) it accepts. Sort current gl_sframe by multiple columns, using a different sort order for each column.

Parameters

column_and_ascending A list of (column name, ascending) pairs giving the sort order for each column (true means ascending).

To control the sort ordering for each column individually, "column_and_ascending" must be a list of (str, bool) pairs, where the first value is the column name and the second value is a boolean indicating whether the sort order is ascending. The example below sorts by column "a" ascending, and then by column "c" descending.

Example:

// To sort by column "a" ascending, and then by column "c" descending

std::cout << sf.sort({{"a", true}, {"c", false}}) << std::endl;

Produces output:

+---+---+---+
| a | b | c |
+---+---+---+
| 1 | b | y |
| 1 | a | x |
| 2 | b | z |
| 3 | c | y |
+---+---+---+
[4 rows x 3 columns]

◆ split_datetime()

gl_sframe turi::gl_sframe::split_datetime	(	const std::string &	expand_column,
		const std::string &	column_name_prefix = `"X"`,
		const std::vector< std::string > &	limit = `std::vector< std::string >()`,
		bool	tzone = `false`
	)		const

inherited

Splits a datetime column of gl_sframe to multiple columns, with each value in a separate column. Returns a new gl_sframe with the column replaced with a list of new columns. The expanded column must be of datetime type. For more details regarding name generation and other, refer to gl_sarray::split_datetime

This function is a convenience function which is equivalent to calling gl_sarray::split_datetime on the column, deleting the column and adding the expanded columns back to the sframe.

Parameters

expand_column	Name of the column to expand.
column_name_prefix	Optional. If provided, expanded column names would start with the given prefix. If not provided, the default value is the name of the expanded column.
limit	Optional. Limits the set of datetime elements to expand. Elements are 'year','month','day','hour','minute', and 'second'.
tzone	Optional. A boolean parameter that determines whether to show the timezone column or not. Defaults to false.

Example:

auto sa = gl_sarray({"20-Oct-2011", "10-Jan-2012"});
gl_sframe sf;
sf["date"] = sa.str_to_datetime("%d-%b-%Y");
auto split_sf = sf.split_datetime("date", "", {"day","year"});
std::cout << split_sf;

Produces output:

Columns:
    day  integer
    year integer
+----------------+----------------+
| day            | year           |
+----------------+----------------+
| 20             | 2011           |
| 10             | 2012           |
+----------------+----------------+
[2 rows x 2 columns]

◆ stack() [1/2]

gl_sframe turi::gl_sframe::stack	(	const std::string &	column_name,
		const std::string &	new_column_names,
		bool	drop_na = `false`
	)		const

inherited

Convert a "wide" column of an gl_sframe to one or two "tall" columns by stacking all values.

The stack works only for columns of list or array type (for the dict type, see the overload). One new column is created as a result of stacking, where each row holds one element of the array or list value, and the remaining columns from the same original row are repeated.

The new gl_sframe includes the newly created column and all columns other than the one that is stacked.

Parameters

column_name	The column to stack. This column must be of dict/list/array type
new_column_names	Optional. The new column name. If not given, column names are generated automatically.
drop_na	Optional. Defaults to false. If true, missing values and empty list/array/dict are all dropped from the resulting column(s). If false, missing values are maintained in stacked column(s).

Suppose 'sf' is an SFrame that contains a user and his/her friends, where the 'friends' column is of array type. Stacking on the 'friends' column creates one row for each user/friend pair:

auto  sf = gl_sframe({{"user",{1,2,3}},
                      {"friends",{{2,3,4}, {5,6}, {4,5,10,FLEX_UNDEFINED}}}
                     });
std::cout <<  sf;
std::cout <<  sf.stack("friends", "friend");

Produces output:

+------+------------------+
| user |     friends      |
+------+------------------+
|  1   |     [2, 3, 4]    |
|  2   |      [5, 6]      |
|  3   | [4, 5, 10, None] |
+------+------------------+
[3 rows x 2 columns]
+------+--------+
| user | friend |
+------+--------+
|  1   |  2     |
|  1   |  3     |
|  1   |  4     |
|  2   |  5     |
|  2   |  6     |
|  3   |  4     |
|  3   |  5     |
|  3   |  10    |
|  3   |  None  |
+------+--------+
[9 rows x 2 columns]

See also: gl_sframe::unstack(const std::vector<std::string>&, const std::string&) const; stack(const std::string&, const std::vector<std::string>&, bool)const

◆ stack() [2/2]

gl_sframe turi::gl_sframe::stack	(	const std::string &	column_name,
		const std::vector< std::string > &	new_column_names,
		bool	drop_na = `false`
	)		const

inherited

Convert a "wide" column of an gl_sframe to one or two "tall" columns by stacking all values.

The stack works only for columns of dictionary type (for the list or array types, see the overload). Two new columns are created as a result of stacking: one column holds the key and another column holds the value. The rest of the columns are repeated for each key/value pair.

The new gl_sframe includes the newly created columns and all columns other than the one that is stacked.

Parameters

column_name	The column to stack. This column must be of dict/list/array type
new_column_names	Optional. The new column names. Must be a vector of 2 values corresponding to the "key" column and the "value" column. If not given, column names are generated automatically.
drop_na	Optional. Defaults to false. If true, missing values and empty list/array/dict are all dropped from the resulting column(s). If false, missing values are maintained in stacked column(s).

Suppose 'sf' is an SFrame that contains a column of dict type. Stack would stack all keys in one column and all values in another column:

auto  sf = gl_sframe({{"topic",{1,2,3,4}},
                      {"words", {flex_dict{{"a",3},{"cat",2}},
                                 flex_dict{{"a",1},{"the",2}},
                                 flex_dict{{"the",1},{"dog",3}},
                                 flex_dict()}
                       }});
std::cout <<  sf.stack("words", {"word", "count"});

Produces output:

+-------+----------------------+
| topic |        words         |
+-------+----------------------+
|   1   |  {'a': 3, 'cat': 2}  |
|   2   |  {'a': 1, 'the': 2}  |
|   3   | {'the': 1, 'dog': 3} |
|   4   |          {}          |
+-------+----------------------+
[4 rows x 2 columns]
+-------+------+-------+
| topic | word | count |
+-------+------+-------+
|   1   |  a   |   3   |
|   1   | cat  |   2   |
|   2   |  a   |   1   |
|   2   | the  |   2   |
|   3   | the  |   1   |
|   3   | dog  |   3   |
|   4   | None |  None |
+-------+------+-------+
[7 rows x 3 columns]
Observe that since topic 4 had no words, an empty row is inserted.
To drop that row, set drop_na to true in the parameters to stack.

See also: unstack(const std::string&, const std::string&) const; stack(const std::string&, const std::string&, bool)const

◆ swap_columns()

void turi::gl_gframe::swap_columns	(	const std::string &	column_1,
		const std::string &	column_2
	)

overridevirtual

Swap the order of two columns

Reimplemented from turi::gl_sframe.

◆ tail()

gl_sframe turi::gl_sframe::tail ( size_t n ) const

inherited

Returns a gl_sframe which contains the last n rows of this gl_sframe.

Parameters

n	The number of rows to fetch.

◆ topk()

gl_sframe turi::gl_sframe::topk	(	const std::string &	column_name,
		size_t	k = `10`,
		bool	reverse = `false`
	)		const

inherited

Get top k rows according to the given column. Result is according to and sorted by "column_name" in the given order (default is descending). When "k" is small, "topk" is more efficient than "sort".

Parameters

column_name	The column to sort on
k	Optional. Defaults to 10. The number of rows to return.
reverse	Optional. Defaults to false. If true, return the top k rows in ascending order, otherwise, in descending order.

Example:

auto sf = gl_sframe({{"id", gl_sarray::from_sequence(1000)}});
sf["value"] = 0 - sf["id"];
std::cout <<  sf.topk("id", 3);

Produces output:

+--------+--------+
|   id   |  value |
+--------+--------+
|   999  |  -999  |
|   998  |  -998  |
|   997  |  -997  |
+--------+--------+
[3 rows x 2 columns]

Example:

std::cout << sf.topk("value", 3);

Produces output:

+--------+--------+
|   id   |  value |
+--------+--------+
|   1    |  -1    |
|   2    |  -2    |
|   3    |  -3    |
+--------+--------+
[3 rows x 2 columns]

See also: sort

◆ unique()

gl_sframe turi::gl_sframe::unique ( ) const

inherited

Remove duplicate rows of the gl_sframe. Will not necessarily preserve the order of the given gl_sframe in the new gl_sframe.

Example:

gl_sframe sf{ {"id", {1,2,3,3,4}},
              {"value", {1,2,3,3,4}} };
std::cout << sf.unique() << std::endl;

Produces output:

+----+-------+
| id | value |
+----+-------+
| 2  |   2   |
| 4  |   4   |
| 3  |   3   |
| 1  |   1   |
+----+-------+
[4 rows x 2 columns]

See also: gl_sarray::unique

◆ unpack()

gl_sframe turi::gl_sframe::unpack	(	const std::string &	unpack_column,
		const std::string &	column_name_prefix = `"X"`,
		const std::vector< flex_type_enum > &	column_types = `std::vector< flex_type_enum >()`,
		const flexible_type &	na_value = `FLEX_UNDEFINED`,
		const std::vector< flexible_type > &	limit = `std::vector< flexible_type >()`
	)		const

inherited

Expand one column of this gl_sframe to multiple columns with each value in a separate column. Returns a new gl_sframe with the unpacked column replaced with a list of new columns. The column must be of list/array/dict type. For more details regarding name generation, missing value handling and other behavior, refer to gl_sarray::unpack.

Parameters

unpack_column	Name of the unpacked column
column_name_prefix	Optional. If provided, unpacked column names would start with the given prefix. Defaults to "X". If the empty string is used, no prefix is used.
column_types	Optional. Column types for the unpacked columns. If not provided, column types are automatically inferred from the first 100 rows. Defaults to an empty vector.
na_value	Optional. If specified, all values equal to "na_value" are converted to missing values.
limit	Optional. Limits the set of list/vector/dict keys to unpack. For list/vector gl_sarrays, "limit" must contain integer indices. For dict gl_sarrays, "limit" must contain dictionary keys.

Example:

auto sf = gl_sframe({{"id", {1,2,3}},
                {"wc", {flex_dict{{"a", 1}},
                        flex_dict{{"b", 2}},
                        flex_dict{{"a", 1},{"b", 2}}
                        }
                }});
std::cout << sf;

+----+------------------+
| id |        wc        |
+----+------------------+
| 1  |     {'a': 1}     |
| 2  |     {'b': 2}     |
| 3  | {'a': 1, 'b': 2} |
+----+------------------+
[3 rows x 2 columns]

To unpack:

std::cout << sf.unpack("wc");

+----+------+------+
| id | wc.a | wc.b |
+----+------+------+
| 1  |  1   | None |
| 2  | None |  2   |
| 3  |  1   |  2   |
+----+------+------+
[3 rows x 3 columns]

To omit the prefix from the generated column names:

std::cout << sf.unpack("wc", "");

+----+------+------+
| id |  a   |  b   |
+----+------+------+
| 1  |  1   | None |
| 2  | None |  2   |
| 3  |  1   |  2   |
+----+------+------+
[3 rows x 3 columns]

To limit subset of keys to unpack:

std::cout << sf.unpack("wc", "", {}, FLEX_UNDEFINED, {"b"});

+----+------+
| id |   b  |
+----+------+
| 1  | None |
| 2  |  2   |
| 3  |  2   |
+----+------+
[3 rows x 2 columns]

See also: gl_sframe::pack_columns; gl_sarray::unpack

◆ unstack() [1/2]

gl_sframe turi::gl_sframe::unstack	(	const std::string &	columns,
		const std::string &	new_column_name = `""`
	)		const

inherited

Concatenate values from one column into one column, grouping by all other columns. The resulting column could be of type list or array. If "column" is a numeric column, the result will be of vector type. If "column" is a non-numeric column, the new column will be of list type.

Parameters

column	The column that is to be concatenated. If str, then collapsed column type is either array or list.
new_column_name	Optional. New column name. If not given, a name is generated automatically.

Example:

auto  sf = gl_sframe({{"friend", {2, 3, 4, 5, 6, 4, 5, 2, 3}},
                      {"user", {1, 1, 1, 2, 2, 2, 3, 4, 4}}});
std::cout <<  sf.unstack("friend", "friends");

Produces output:

+------+-----------------------------+
| user |           friends           |
+------+-----------------------------+
|  3   |            [5.0]            |
|  1   |       [2.0, 4.0, 3.0]       |
|  2   |       [5.0, 6.0, 4.0]       |
|  4   |          [2.0, 3.0]         |
+------+-----------------------------+
[4 rows x 2 columns]

See also: stack(const std::string&, const std::string&, bool)const; groupby

◆ unstack() [2/2]

gl_sframe turi::gl_sframe::unstack	(	const std::vector< std::string > &	columns,
		const std::string &	new_column_name = `""`
	)		const

inherited

Concatenate values from two columns into one column, grouping by all other columns. The new column will be of dict type where the keys are taken from the first column in the list, and the values taken from the second column in the list.

Parameters

column	The columns that are to be concatenated.
new_column_name	Optional. New column name. If not given, a name is generated automatically.

Example:

auto  sf = gl_sframe({{"count",{4, 2, 1, 1, 2, FLEX_UNDEFINED}},
                      {"topic",{"cat", "cat", "dog", "elephant", "elephant", "fish"}},
                      {"word", {"a", "c", "c", "a", "b", FLEX_UNDEFINED}}});
std::cout <<  sf.unstack({"word", "count"}, "words");

Produces output:

+----------+------------------+
|  topic   |      words       |
+----------+------------------+
| elephant | {'a': 1, 'b': 2} |
|   dog    |     {'c': 1}     |
|   cat    | {'a': 4, 'c': 2} |
|   fish   |       None       |
+----------+------------------+
[4 rows x 2 columns]

See also: stack; groupby

The documentation for this class was generated from the following file:

core/data/sframe/gl_gframe.hpp

Public Member Functions

Detailed Description

Member Function Documentation

◆ add_column() [1/2]

◆ add_column() [2/2]

◆ add_columns()

◆ add_row_number()

◆ append()

◆ apply()

◆ column_index()

◆ column_name()

◆ column_names()

◆ column_types()

◆ construct_from_csvs()

◆ contains_column()

◆ dropna()

◆ dropna_split()

◆ empty()

◆ fillna()

◆ filter_by()

◆ groupby()

◆ has_size()

◆ head()

◆ is_materialized()

◆ join() [1/2]

◆ join() [2/2]

◆ materialize()

◆ materialize_to_callback()

◆ num_columns()

◆ operator[]() [1/5]

◆ operator[]() [2/5]

◆ operator[]() [3/5]

◆ operator[]() [4/5]

◆ operator[]() [5/5]

◆ pack_columns() [1/2]

◆ pack_columns() [2/2]

◆ plot()

◆ random_split() [1/2]

◆ random_split() [2/2]

◆ range_iterator()

◆ remove_column()

◆ rename()

◆ replace_add_column()

◆ sample() [1/2]

◆ sample() [2/2]

◆ save()

◆ save_reference()

◆ select_column()

◆ select_columns()

◆ show()

◆ size()

◆ sort() [1/4]

◆ sort() [2/4]

◆ sort() [3/4]

◆ sort() [4/4]

◆ split_datetime()

◆ stack() [1/2]

◆ stack() [2/2]

◆ swap_columns()

◆ tail()

◆ topk()

◆ unique()

◆ unpack()

◆ unstack() [1/2]

◆ unstack() [2/2]