Turi Create
4.0
|
#include <core/data/sframe/gl_gframe.hpp>
Public Member Functions | |
size_t | size () const override |
size_t | num_columns () const override |
std::vector< std::string > | column_names () const override |
std::vector< flex_type_enum > | column_types () const override |
void | add_column (const flexible_type &data, const std::string &name) override |
void | add_column (const gl_sarray &data, const std::string &name) override |
void | add_columns (const gl_sframe &data) override |
void | remove_column (const std::string &name) override |
void | rename (const std::map< std::string, std::string > &old_to_new_names) override |
void | swap_columns (const std::string &column_1, const std::string &column_2) override |
void | construct_from_csvs (std::string csv_file, csv_parsing_config_map csv_config, str_flex_type_map column_type_hints) |
void | show (const std::string &path_to_client) const |
std::shared_ptr< model_base > | plot () const |
gl_sframe | operator[] (const gl_sarray &logical_filter) const |
void | materialize_to_callback (std::function< bool(size_t, const std::shared_ptr< sframe_rows > &)> callback, size_t nthreads=(size_t)(-1)) |
gl_sframe_range | range_iterator (size_t start=0, size_t end=(size_t)(-1)) const |
bool | empty () const |
bool | is_materialized () const |
bool | has_size () const |
void | materialize () |
void | save (const std::string &path, const std::string &format="") const |
void | save_reference (const std::string &path) const |
bool | contains_column (const std::string &col_name) const |
gl_sframe | head (size_t n) const |
gl_sframe | tail (size_t n) const |
gl_sarray | apply (std::function< flexible_type(const sframe_rows::row &)> fn, flex_type_enum dtype) const |
gl_sframe | sample (double fraction) const |
gl_sframe | sample (double fraction, size_t seed, bool exact=false) const |
std::pair< gl_sframe, gl_sframe > | random_split (double fraction) const |
std::pair< gl_sframe, gl_sframe > | random_split (double fraction, size_t seed, bool exact=false) const |
gl_sframe | topk (const std::string &column_name, size_t k=10, bool reverse=false) const |
size_t | column_index (const std::string &column_name) const |
const std::string & | column_name (size_t index) const |
gl_sarray | select_column (const std::string &colname) const |
gl_sframe | select_columns (const std::vector< std::string > &colnames) const |
virtual void | replace_add_column (const gl_sarray &data, const std::string &name="") |
gl_sframe | append (const gl_sframe &other) const |
gl_sframe | groupby (const std::vector< std::string > &groupkeys, const std::map< std::string, aggregate::groupby_descriptor_type > &operators=std::map< std::string, aggregate::groupby_descriptor_type >()) const |
gl_sframe | join (const gl_sframe &right, const std::vector< std::string > &joinkeys, const std::string &how="inner") const |
gl_sframe | join (const gl_sframe &right, const std::map< std::string, std::string > &joinkeys, const std::string &how="inner") const |
gl_sframe | filter_by (const gl_sarray &values, const std::string &column_name, bool exclude=false) const |
gl_sframe | pack_columns (const std::vector< std::string > &columns, const std::string &new_column_name, flex_type_enum dtype=flex_type_enum::LIST, flexible_type fill_na=FLEX_UNDEFINED) const |
gl_sframe | pack_columns (const std::string &column_prefix, const std::string &new_column_name, flex_type_enum dtype=flex_type_enum::LIST, flexible_type fill_na=FLEX_UNDEFINED) const |
gl_sframe | split_datetime (const std::string &expand_column, const std::string &column_name_prefix="X", const std::vector< std::string > &limit=std::vector< std::string >(), bool tzone=false) const |
gl_sframe | unpack (const std::string &unpack_column, const std::string &column_name_prefix="X", const std::vector< flex_type_enum > &column_types=std::vector< flex_type_enum >(), const flexible_type &na_value=FLEX_UNDEFINED, const std::vector< flexible_type > &limit=std::vector< flexible_type >()) const |
gl_sframe | stack (const std::string &column_name, const std::string &new_column_names, bool drop_na=false) const |
gl_sframe | stack (const std::string &column_name, const std::vector< std::string > &new_column_names, bool drop_na=false) const |
gl_sframe | unstack (const std::string &columns, const std::string &new_column_name="") const |
gl_sframe | unstack (const std::vector< std::string > &columns, const std::string &new_column_name="") const |
gl_sframe | unique () const |
gl_sframe | sort (const std::string &column, bool ascending=true) const |
gl_sframe | sort (const std::vector< std::string > &columns, bool ascending=true) const |
gl_sframe | sort (const std::initializer_list< std::string > &columns, bool ascending=true) const |
gl_sframe | sort (const std::vector< std::pair< std::string, bool >> &column_and_ascending) const |
gl_sframe | dropna (const std::vector< std::string > &columns=std::vector< std::string >(), std::string how="any", bool recursive=false) const |
std::pair< gl_sframe, gl_sframe > | dropna_split (const std::vector< std::string > &columns=std::vector< std::string >(), std::string how="any", bool recursive=false) const |
gl_sframe | fillna (const std::string &column, flexible_type value) const |
gl_sframe | add_row_number (const std::string &column_name="id", size_t start=0) const |
std::vector< flexible_type > | operator[] (int64_t i) |
std::vector< flexible_type > | operator[] (int64_t i) const |
gl_sframe | operator[] (const std::initializer_list< int64_t > &slice) |
gl_sframe | operator[] (const std::initializer_list< int64_t > &slice) const |
Column Indexing | |
Selects a single column of the SFrame. This returns an internal array reference object that can be used exactly like a gl_sarray. The design is quite similar to the reference object used by std::vector<bool> for indexing. For instance: gl_sframe sf{{"a", {1,2,3,4,5}}, {"b", {"1","2","3","4","5"}}, {"c", {1.0,2.0,3.0,4.0,5.0}}}; gl_sarray t = sf["a"]; // takes out column "a" However, this operator can also be used for modifying existing columns, or creating new columns. For instance: gl_sframe sf{{"a", {1,2,3,4,5}}, {"b", {"1","2","3","4","5"}}, {"c", {1.0,2.0,3.0,4.0,5.0}}}; sf["a"] = sf["a"] + 1; // sf["a"] is now {2,3,4,5,6} sf["d"] = sf["c"] - 1; // sf["d"] is now {0.0,1.0,2.0,3.0,4.0} Entire constant columns can also be created the same way: gl_sframe sf{{"a", {1,2,3,4,5}}, {"b", {"1","2","3","4","5"}}, {"c", {1.0,2.0,3.0,4.0,5.0}}}; sf["ones"] = 1; Since the returned object is meant to be a short-lived reference, the following is not permitted: gl_sframe sf{{"a", {1,2,3,4,5}}, {"b", {"1","2","3","4","5"}}, {"c", {1.0,2.0,3.0,4.0,5.0}}}; auto a_col = sf["a"]; since "auto" resolves to gl_sarray_reference which is intentionally not copy-constructible. For functional alternatives, see replace_add_column, add_column, add_column(const gl_sarray&, const std::string&), "add_column overload". | |
const_gl_sarray_reference | operator[] (const std::string &column) const |
gl_sarray_reference | operator[] (const std::string &column) |
Multi-Column Indexing | |
gl_sframe | operator[] (const std::vector< std::string > &columns) const |
gl_sframe | operator[] (const std::initializer_list< std::string > &columns) |
gl_sframe | operator[] (const std::initializer_list< std::string > &columns) const |
A proxy for the gl_sframe for the vertex and edge data of the SGraph.
Definition at line 24 of file gl_gframe.hpp.
|
overridevirtual |
Add a new column with constant value. If type is VERTEX_GFRAME, the column is added as a new vertex field (or edge field if type is EDGE_GFRAME) in the gl_sgraph.
data | the constant value to fill the column |
name | the name of the new column |
Reimplemented from turi::gl_sframe.
|
overridevirtual |
Add a new column with given column name and data. If type is VERTEX_GFRAME, the column is added as a new vertex field (or edge field if type is EDGE_GFRAME) in the gl_sgraph.
data | the data for the new column |
name | the name of the new column |
Reimplemented from turi::gl_sframe.
|
overridevirtual |
Batch version of add_column.
data | a map from column name to column data |
Reimplemented from turi::gl_sframe.
|
inherited |
Returns a new gl_sframe with a new column that numbers each row sequentially. By default the count starts at 0, but this can be changed to a positive or negative number. The new column will be named with the given column name. An error will be raised if the given column name already exists in the gl_sframe.
column_name | Optional. The name of the new column that will hold the row numbers. |
start | Optional. The number used to start the row number count. |
Example:
Produces output:
Add the rows of a gl_sframe to the end of this gl_sframe. Both gl_sframe objects must have the same set of columns with the same column names and column types.
Example:
Produces output:
|
inherited |
Maps each row of the gl_sframe by a given function to a single value. The result gl_sarray is of type "dtype". "fn" should be a function that returns exactly one value which can be cast into the type specified by "dtype".
fn | The function to transform each element. Must return exactly one value which can be cast into the type specified by "dtype". |
dtype | The data type of the new gl_sarray. |
Example:
Produces output:
|
inherited |
Returns the index of column column_name.
|
inherited |
Returns the name of column index.
|
overridevirtual |
Returns a list of column names. If type is VERTEX_GFRAME, the value is also the names of the vertex fields (or edge fields if type is EDGE_GFRAME) in the gl_sgraph.
Reimplemented from turi::gl_sframe.
|
overridevirtual |
Returns a list of column types. If type is VERTEX_GFRAME, the value is also the types of the vertex fields (or edge fields if type is EDGE_GFRAME) in the gl_sgraph.
Reimplemented from turi::gl_sframe.
|
inherited |
Constructs a gl_sframe from a csv file
|
inherited |
Returns true if the column is present in the sframe, and false otherwise.
|
inherited |
Remove missing values from an gl_sframe. A missing value is either "FLEX_UNDEFINED" or "NaN". If "how" is "any", a row will be removed if any of the columns in the "columns" parameter contains at least one missing value. If "how" is "all", a row will be removed if all of the columns in the "columns" parameter are missing values. If the "columns" parameter is not specified, the default is to consider all columns when searching for missing values.
columns | Optional. The columns to use when looking for missing values. By default, all columns are used. |
how | Optional. Specifies whether a row should be dropped if at least one column has missing values, or if all columns have missing values. "any" is default. |
For instance
Produces output:
Produces output:
Example:
Produces output:
|
inherited |
Split rows with missing values from this gl_sframe. This function has the same functionality as dropna, but returns a tuple of two gl_sframe objects. The first item is the expected output from dropna, and the second item contains all the rows filtered out by the "dropna" algorithm.
columns | Optional. The columns to use when looking for missing values. By default, all columns are used. |
how | Optional. Specifies whether a row should be dropped if at least one column has missing values, or if all columns have missing values. "any" is default. |
recursive | Optional. It will recursively check whether a cell contains nan or not. This is handy for nested data structure like list, dictionary. For instance, {{FLEX_UNDEFINED, 1}, {1}} will be treated as nan and will be removed if recursive is set to be true. Otherwise it won't be treated as nan-value. |
Example:
Produces output:
Example:
Produces output:
|
inherited |
True if size() == 0.
|
inherited |
Fill all missing values with a given value in a given column. If the "value" is not the same type as the values in "column", this method attempts to convert the value to the original column's type. If this fails, an error is raised.
column | The name of the column to modify. |
value | The value used to replace all missing values. |
recursive | The recursive is used to set the manner of nan-value checking. If this value is true, a cell will be treated as missing value iff it contains nan. For instance, {{FLEX_UNDEFINED, 1}, {0}} and {FLEX_UNDEFINED, 1} will be all treated as nan-values. |
Example:
Produces output:
|
inherited |
Filter an gl_sframe by values inside an iterable object. Result is an gl_sframe that only includes (or excludes) the rows that have a column with the given "column_name" which holds one of the values in the given "values" gl_sarray.
values | The values to use to filter the gl_sframe. The resulting gl_sframe will only include rows that have one of these values in the given column. |
column_name | The column of the gl_sframe to match with the given "values". |
exclude | Optional. Defaults to false. If true, the result gl_sframe will contain all rows except those that have one of "values" in "column_name". |
Example:
Produces output:
|
inherited |
Perform a group on the key_columns followed by aggregations on the columns listed in operations. The operations parameter is a dictionary that indicates which aggregation operators to use and which columns to use them on. The available operators are SUM, MAX, MIN, COUNT, AVG, VAR, STDV, CONCAT, SELECT_ONE, ARGMIN, ARGMAX, and QUANTILE. For convenience, aggregators MEAN, STD, and VARIANCE are available as synonyms for AVG, STDV, and VAR. See turi::aggregate for more detail on the aggregators.
groupkeys | Columns to group on. Type of key columns can be of any type other than dictionary. |
operations | Map of columns and aggregation operations. Each key is an output column name and each value is an aggregator. |
Suppose we have an SFrame (sf) with movie ratings by many users.
Compute the number of occurrences of each user.
Compute the mean and standard deviation of ratings per user.
Compute the movie with the minimum rating per user.
Compute the count, mean, and standard deviation of ratings per (user, time), automatically assigning output column names.
The groupby function can take a variable length list of aggregation specifiers so if we want the count and the 0.25 and 0.75 quantiles of ratings:
To put all items a user rated into one list value by their star rating:
To put all items and rating of a given user together into a dictionary value:
|
inherited |
Returns true if the size of the SFrame is known. If it is not known, calling size() may trigger materialization.
|
inherited |
|
inherited |
Returns whether or not the sarray has been materialized.
|
inherited |
Joins two gl_sframe objects. Merges the current (left) gl_sframe with the given (right) gl_sframe using a SQL-style equi-join operation by columns.
right | The gl_sframe to join. |
on | The column name(s) representing the set of join keys. Each row that has the same value in this set of columns will be merged together. |
how | Optional. The type of join to perform. "inner" is default.
|
Example:
Produces output:
|
inherited |
Joins two gl_sframe objects. Merges the current (left) gl_sframe with the given (right) gl_sframe using a SQL-style equi-join operation by columns.
right | The gl_sframe to join. |
on | The column name(s) representing a map of join keys from left to right. Each key is taken as a column name on the left gl_sframe and each value is taken as the column name in the right gl_sframe. |
how | Optional. The type of join to perform. "inner" is default.
|
Example:
Produces output:
|
inherited |
For a SFrame that is lazily evaluated, force persist this sframe to disk, committing all lazy evaluated operations.
|
inherited |
Calls a callback function passing each row of the SArray.
This does not materialize the array if not necessary.
The callback may be called in parallel in which case the argument provides a thread number. The function should return false, but may return true at any time to quit the iteration process. It may also throw exceptions which will be forwarded to the caller of this function.
Each call to the callback passes:
The sframe_rows object looks like a vector<vector<flexible_type>>. i.e. to look at all the rows, you need to write:
callback | The callback to call |
nthreads | Number of threads. If not specified, #cpus is used |
|
overridevirtual |
Returns number of columns. If type is VERTEX_GFRAME, the value is also the number of vertex fields (or edge fields if type is EDGE_GFRAME) in the gl_sgraph.
Reimplemented from turi::gl_sframe.
|
inherited |
Returns the value at a particular array index; generally inefficient.
This returns the value of the array at a particular index. Will raise an exception if the index is out of bounds. This operation is generally inefficient: the range_iterator() is preferred.
|
inherited |
Returns the value at a particular array index; generally inefficient.
This returns the value of the array at a particular index. Will raise an exception if the index is out of bounds. This operation is generally inefficient: the range_iterator() is preferred.
|
inherited |
Performs a slice Python style.
slice | A list of 2 or 3 values. If 2 values, this is interpreted as {start, end} indices, with an implicit value of step = 1. If 3 values, this is interpreted as {start, step, end}. Values at the positions [start, start+step, start+2*step, ...] are returned until end (exclusive) is reached. Negative start and end values are interpreted as offsets from the end of the array. |
Given a gl_sframe
Slicing a consecutive range:
Slicing a range with a step:
Using negative indexing:
|
inherited |
Performs a slice Python style.
slice | A list of 2 or 3 values. If 2 values, this is interpreted as {start, end} indices, with an implicit value of step = 1. If 3 values, this is interpreted as {start, step, end}. Values at the positions [start, start+step, start+2*step, ...] are returned until end (exclusive) is reached. Negative start and end values are interpreted as offsets from the end of the array. |
Given a gl_sframe
Slicing a consecutive range:
Slicing a range with a step:
Using negative indexing:
Performs a logical filter.
This function performs a logical filter: i.e. it subselects all the elements in this array where the corresponding value in the other array evaluates to true.
|
inherited |
This is an overloaded member function, provided for convenience. It differs from the above function only in what argument(s) it accepts. Pack two or more columns of the current gl_sframe into one single column. The result is a new gl_sframe with the unaffected columns from the original gl_sframe plus the newly created column.
The type of the resulting column is decided by the "dtype" parameter. Allowed values for "dtype" are flex_type_enum::DICT , flex_type_enum::VECTOR or flex_type_enum::LIST
columns | A list of column names to be packed. There must be at least two columns to pack. |
new_column_name | Packed column name. |
dtype | Optional. The resulting packed column type. If not provided, dtype is list. |
fill_na | Optional. Value to fill into packed column if missing value is encountered. If packing to dictionary, "fill_na" is only applicable to dictionary values; missing keys are not replaced. |
Example: Suppose 'sf' is an SFrame that maintains business category information.
To pack all category columns into a list:
To pack all category columns into a dictionary:
|
inherited |
Pack two or more columns of the current gl_sframe with a common column name prefix into one single column. The result is a new gl_sframe with the unaffected columns from the original gl_sframe plus the newly created column.
The type of the resulting column is decided by the "dtype" parameter. Allowed values for "dtype" are flex_type_enum::DICT , flex_type_enum::VECTOR or flex_type_enum::LIST
column_prefix | Packs all columns with the given prefix. |
new_column_name | Packed column name. |
dtype | Optional. The resulting packed column type. If not provided, dtype is list. |
fill_na | Optional. Value to fill into packed column if missing value is encountered. If packing to dictionary, "fill_na" is only applicable to dictionary values; missing keys are not replaced. |
Example: Suppose 'sf' is an SFrame that maintains business category information.
To pack all category columns into a list:
To pack all category columns into a dictionary:
|
inherited |
Return a plot object of the SFrame (same visualization as show).
Randomly split the rows of an gl_sframe into two gl_sframe objects. The first gl_sframe contains M rows, sampled uniformly (without replacement) from the original gl_sframe. M is approximately the fraction times the original number of rows. The second gl_sframe contains the remaining rows of the original gl_sframe.
fraction | Approximate fraction of the rows to fetch for the first returned gl_sframe. Must be between 0 and 1. |
seed | Optional. Seed for the random number generator used to split. |
Example:
Produces output:
|
inherited |
Randomly split the rows of an gl_sframe into two gl_sframe objects. The first gl_sframe contains M rows, sampled uniformly (without replacement) from the original gl_sframe. M is approximately the fraction times the original number of rows. The second gl_sframe contains the remaining rows of the original gl_sframe.
fraction | Approximate fraction of the rows to fetch for the first returned gl_sframe. Must be between 0 and 1. |
seed | The random seed for the random number generator. Deterministic output is obtained if this is set to a constant. |
Example:
Produces output:
|
inherited |
Returns a one pass range object with begin() and end() iterators.
This will materialize the array.
See materialize_to_callback for a lazy version.
start | The starting index of the range |
end | The ending index of the range |
Or more compactly with C++11 syntax:
The range returned only supports one pass. The outcome of a second call to begin() is undefined after any iterator is advanced.
When iterating over a gl_sframe with many columns, if only a small number of columns are needed, there is a performance benefit to subselecting just those columns first before iterating.
i.e. if I only need columns "a" and "b" from the SFrame above:
|
overridevirtual |
Remove a column with the given name. If type is VERTEX_GFRAME, the column is removed from vertex data (or edge data if type is EDGE_GFRAME) from the gl_sgraph.
name | the column name to be removed |
Reimplemented from turi::gl_sframe.
|
overridevirtual |
Rename columns.
old_to_new_names | map from old column name to new column name. |
Reimplemented from turi::gl_sframe.
|
virtualinherited |
Add a column to this gl_sframe, replacing a column with the same name if one already exists. The number of elements in the data given must match the length of every other column of the gl_sframe. This operation modifies the current gl_sframe in place. If no name is given, a default name is chosen.
data | The column of data to add. |
name | Optional. The name of the column. If no name is given, a default name is chosen. |
This is equivalent to using operator[] for column assignment.
Example:
Produces output:
|
inherited |
Create a gl_sframe which contains a subsample of the current gl_sframe.
fraction | The fraction of the rows to fetch. Must be between 0 and 1. |
Example:
Produces output:
|
inherited |
Create a gl_sframe which contains a subsample of the current gl_sframe.
fraction | The fraction of the rows to fetch. Must be between 0 and 1. |
seed | The random seed for the random number generator. Deterministic output is obtained if this is set to a constant. |
Example:
Produces output:
|
inherited |
Saves the SFrame to file.
When format is "binary", the saved SArray will be in a directory named with the targetfile
parameter. When format is "text" or "csv", it is saved as a single human readable text file.
filename | A local path or a remote URL. If format is 'text', it will be saved as a text file. If format is 'binary', a directory will be created at the location which will contain the SArray. |
format | Optional. Either "binary", "csv" or "". Defaults to "". Format in which to save the SFrame. Binary saved SArrays can be loaded much faster and without any format conversion losses. If "csv", Each row will be written as a single line in an output text file. If format is an empty string (default), we will try to infer the format from filename given. If file name ends with "csv", or ".csv.gz", then the gl_sframe is saved as "csv" format, otherwise the gl_sframe is saved as 'binary' format. |
|
inherited |
Performs an incomplete save of an existing SFrame into a directory. This saved SFrame may reference SFrames in other locations in the same filesystem for certain columns/segments/etc.
Does not modify the current sframe.
|
inherited |
Extracts one column of the gl_sframe.
This is equivalent to using operator[] for column indexing.
Equivalent to:
|
inherited |
Extracts a collection of columns of the gl_sframe.
This is equivalent to using operator[] for selecting multiple columns
|
inherited |
Show a visualization of the SFrame.
|
overridevirtual |
Returns number of rows. If type is VERTEX_GFRAME, the value is also the number of vertices (or edges if type is EDGE_GFRAME) in the gl_sgraph.
Reimplemented from turi::gl_sframe.
|
inherited |
Sort current gl_sframe by a single column, using the given sort order.
Only columns that are of type str, int, or float can be sorted.
column | The name of the column to be sorted. |
ascending | Optional. Sort all columns in the given order. |
Example:
Produces output:
Example:
Produces output:
|
inherited |
This is an overloaded member function, provided for convenience. It differs from the above function only in what argument(s) it accepts.
Sort current gl_sframe by multiple columns, using the given sort order.
columns | The names of the columns to be sorted. |
ascending | Optional. Sort all columns in the given order. |
The result will be sorted first by first column, followed by second column, and so on. All columns will be sorted in the same order as governed by the "ascending" parameter.
Example:
Produces output:
|
inherited |
This is an overloaded member function, provided for convenience. It differs from the above function only in what argument(s) it accepts.
|
inherited |
This is an overloaded member function, provided for convenience. It differs from the above function only in what argument(s) it accepts. Sort current gl_sframe by multiple columns, using a different sort order for each column.
column_and_ascending | A map from column name to sort order (ascending is true) |
To sort by column "a" ascending, and then by column "c" descending To control the sort ordering for each column individually, "sort_columns" must be a list of (str, bool) pairs. Given this case, the first value is the column name and the second value is a boolean indicating whether the sort order is ascending.
Example:
Produces output:
|
inherited |
Splits a datetime column of gl_sframe to multiple columns, with each value in a separate column. Returns a new gl_sframe with the column replaced with a list of new columns. The expanded column must be of datetime type. For more details regarding name generation and other, refer to gl_sarray::split_datetime
This function is a convenience function which is equivalent to calling gl_sarray::split_datetime on the column, deleting the column and adding the expanded columns back to the sframe.
expand_column | Name of the column to expand. |
column_name_prefix | Optional. If provided, expanded column names would start with the given prefix. If not provided, the default value is the name of the expanded column. |
limit | Optional. Limits the set of datetime elements to expand. Elements are 'year','month','day','hour','minute', and 'second'. |
tzone | Optional. A boolean parameter that determines whether to show the timezone column or not. Defaults to false. |
Example:
Produces output:
|
inherited |
Convert a "wide" column of an gl_sframe to one or two "tall" columns by stacking all values.
The stack works only for columns of list or array type (for the dict type, see the overload). One new column is created as a result of stacking, where each row holds one element of the array or list value, and the remaining columns from the same original row are repeated.
The new gl_sframe includes the newly created column and all columns other than the one that is stacked.
column_names | The column(s) to stack. This column must be of dict/list/array type |
new_column_name | Optional. The new column name. If not given, column names are generated automatically. |
drop_na | Optional. Defaults to false. If true, missing values and empty list/array/dict are all dropped from the resulting column(s). If false, missing values are maintained in stacked column(s). |
Suppose 'sf' is an SFrame that contains a user and his/her friends, where 'friends' columns is an array type. Stack on 'friends' column would create a user/friend list for each user/friend pair:
Produces output:
|
inherited |
Convert a "wide" column of an gl_sframe to one or two "tall" columns by stacking all values.
The stack works only for columns of dictionary type (for the list or array types, see the overload). Two new columns are created as a result of stacking: one column holds the key and another column holds the value. The rest of the columns are repeated for each key/value pair.
The new gl_sframe includes the newly created columns and all columns other than the one that is stacked.
column_names | The column(s) to stack. This column must be of dict/list/array type |
new_column_names | Optional. The new column names. Must be a vector of 2 values corresponding to the "key" column and the "value" column. If not given, column names are generated automatically. |
drop_na | Optional. Defaults to false. If true, missing values and empty list/array/dict are all dropped from the resulting column(s). If false, missing values are maintained in stacked column(s). |
Suppose 'sf' is an SFrame that contains a column of dict type. Stack would stack all keys in one column and all values in another column:
Produces output:
|
overridevirtual |
Swap the order of two columns
Reimplemented from turi::gl_sframe.
|
inherited |
|
inherited |
Get top k rows according to the given column. Result is according to and sorted by "column_name" in the given order (default is descending). When "k" is small, "topk" is more efficient than "sort".
column_name | The column to sort on |
k | Optional. Defaults to 10. The number of rows to return. |
reverse | Optional. Defaults to False. If true, return the top k rows in ascending order, otherwise, in descending order. |
Example:
Produces output:
Example:
Produces output:
|
inherited |
Remove duplicate rows of the gl_sframe. Will not necessarily preserve the order of the given gl_sframe in the new gl_sframe.
Example:
Produces output:
|
inherited |
Expand one column of this gl_sframe to multiple columns with each value in a separate column. Returns a new gl_sframe with the unpacked column replaced with a list of new columns. The column must be of list/array/dict type. For more details regarding name generation, missing value handling and other, refer to gl_sarray::unpack
unpack_column | Name of the unpacked column |
column_name_prefix | Optional. If provided, unpacked column names would start with the given prefix. Defaults to "X". If the empty string is used, no prefix is used. |
column_types | Optional. Column types for the unpacked columns. If not provided, column types are automatically inferred from first 100 rows. Defaults to FLEX_UNDEFINED. |
na_value | Optional. Convert all values that are equal to "na_value" to missing value if specified. |
limit | optional limits in the set of list/vector/dict keys to unpack. For list/vector gl_sarrays, "limit" must contain integer indices. For dict gl_sarrays, "limit" must contain dictionary keys. |
Example:
To unpack:
To not have a prefix in the generated column name:
To limit subset of keys to unpack:
|
inherited |
Concatenate values from one column into one column, grouping by all other columns. The resulting column could be of type list or array. If "column" is a numeric column, the result will be of vector type. If "column" is a non-numeric column, the new column will be of list type.
column | The column that is to be concatenated. If str, then collapsed column type is either array or list. |
new_column_name | Optional. New column name. If not given, a name is generated automatically. |
Example:
Produces output:
|
inherited |
Concatenate values from two columns into one column, grouping by all other columns. The new column will be of dict type where the keys are taken from the first column in the list, and the values taken from the second column in the list.
column | The columns that are to be concatenated. |
new_column_name | Optional. New column name. If not given, a name is generated automatically. |
Example:
Produces output: