Turi Create
4.0
|
#include <core/data/sframe/gl_sframe.hpp>
Public Member Functions | |
gl_sframe () | |
Constructs an empty gl_sframe. | |
gl_sframe (const gl_sframe &) | |
Copy Constructor. | |
gl_sframe (gl_sframe &&) | |
Move Constructor. | |
gl_sframe (const std::string &directory) | |
void | construct_from_csvs (std::string csv_file, csv_parsing_config_map csv_config, str_flex_type_map column_type_hints) |
gl_sframe & | operator= (const gl_sframe &) |
Copy assignment. | |
gl_sframe & | operator= (gl_sframe &&) |
Move assignment. | |
void | show (const std::string &path_to_client) const |
std::shared_ptr< model_base > | plot () const |
gl_sframe (const std::map< std::string, std::vector< flexible_type > > &data) | |
gl_sframe (const std::map< std::string, gl_sarray > &data) | |
gl_sframe (std::initializer_list< std::pair< std::string, gl_sarray >>) | |
gl_sframe | operator[] (const gl_sarray &logical_filter) const |
void | materialize_to_callback (std::function< bool(size_t, const std::shared_ptr< sframe_rows > &)> callback, size_t nthreads=(size_t)(-1)) |
gl_sframe_range | range_iterator (size_t start=0, size_t end=(size_t)(-1)) const |
virtual size_t | size () const |
bool | empty () const |
bool | is_materialized () const |
bool | has_size () const |
void | materialize () |
void | save (const std::string &path, const std::string &format="") const |
void | save_reference (const std::string &path) const |
virtual std::vector< flex_type_enum > | column_types () const |
virtual size_t | num_columns () const |
virtual std::vector< std::string > | column_names () const |
bool | contains_column (const std::string &col_name) const |
gl_sframe | head (size_t n) const |
gl_sframe | tail (size_t n) const |
gl_sarray | apply (std::function< flexible_type(const sframe_rows::row &)> fn, flex_type_enum dtype) const |
gl_sframe | sample (double fraction) const |
gl_sframe | sample (double fraction, size_t seed, bool exact=false) const |
std::pair< gl_sframe, gl_sframe > | random_split (double fraction) const |
std::pair< gl_sframe, gl_sframe > | random_split (double fraction, size_t seed, bool exact=false) const |
gl_sframe | topk (const std::string &column_name, size_t k=10, bool reverse=false) const |
size_t | column_index (const std::string &column_name) const |
const std::string & | column_name (size_t index) const |
gl_sarray | select_column (const std::string &colname) const |
gl_sframe | select_columns (const std::vector< std::string > &colnames) const |
virtual void | replace_add_column (const gl_sarray &data, const std::string &name="") |
virtual void | add_column (const flexible_type &data, const std::string &name="") |
virtual void | add_column (const gl_sarray &data, const std::string &name="") |
virtual void | add_columns (const gl_sframe &data) |
virtual void | remove_column (const std::string &name) |
virtual void | swap_columns (const std::string &column_1, const std::string &column_2) |
virtual void | rename (const std::map< std::string, std::string > &old_to_new_names) |
gl_sframe | append (const gl_sframe &other) const |
gl_sframe | groupby (const std::vector< std::string > &groupkeys, const std::map< std::string, aggregate::groupby_descriptor_type > &operators=std::map< std::string, aggregate::groupby_descriptor_type >()) const |
gl_sframe | join (const gl_sframe &right, const std::vector< std::string > &joinkeys, const std::string &how="inner") const |
gl_sframe | join (const gl_sframe &right, const std::map< std::string, std::string > &joinkeys, const std::string &how="inner") const |
gl_sframe | filter_by (const gl_sarray &values, const std::string &column_name, bool exclude=false) const |
gl_sframe | pack_columns (const std::vector< std::string > &columns, const std::string &new_column_name, flex_type_enum dtype=flex_type_enum::LIST, flexible_type fill_na=FLEX_UNDEFINED) const |
gl_sframe | pack_columns (const std::string &column_prefix, const std::string &new_column_name, flex_type_enum dtype=flex_type_enum::LIST, flexible_type fill_na=FLEX_UNDEFINED) const |
gl_sframe | split_datetime (const std::string &expand_column, const std::string &column_name_prefix="X", const std::vector< std::string > &limit=std::vector< std::string >(), bool tzone=false) const |
gl_sframe | unpack (const std::string &unpack_column, const std::string &column_name_prefix="X", const std::vector< flex_type_enum > &column_types=std::vector< flex_type_enum >(), const flexible_type &na_value=FLEX_UNDEFINED, const std::vector< flexible_type > &limit=std::vector< flexible_type >()) const |
gl_sframe | stack (const std::string &column_name, const std::string &new_column_names, bool drop_na=false) const |
gl_sframe | stack (const std::string &column_name, const std::vector< std::string > &new_column_names, bool drop_na=false) const |
gl_sframe | unstack (const std::string &columns, const std::string &new_column_name="") const |
gl_sframe | unstack (const std::vector< std::string > &columns, const std::string &new_column_name="") const |
gl_sframe | unique () const |
gl_sframe | sort (const std::string &column, bool ascending=true) const |
gl_sframe | sort (const std::vector< std::string > &columns, bool ascending=true) const |
gl_sframe | sort (const std::initializer_list< std::string > &columns, bool ascending=true) const |
gl_sframe | sort (const std::vector< std::pair< std::string, bool >> &column_and_ascending) const |
gl_sframe | dropna (const std::vector< std::string > &columns=std::vector< std::string >(), std::string how="any", bool recursive=false) const |
std::pair< gl_sframe, gl_sframe > | dropna_split (const std::vector< std::string > &columns=std::vector< std::string >(), std::string how="any", bool recursive=false) const |
gl_sframe | fillna (const std::string &column, flexible_type value) const |
gl_sframe | add_row_number (const std::string &column_name="id", size_t start=0) const |
std::vector< flexible_type > | operator[] (int64_t i) |
std::vector< flexible_type > | operator[] (int64_t i) const |
gl_sframe | operator[] (const std::initializer_list< int64_t > &slice) |
gl_sframe | operator[] (const std::initializer_list< int64_t > &slice) const |
Column Indexing | |
Selects a single column of the SFrame. This returns an internal array reference object that can be used exactly like a gl_sarray. The design is quite similar to the reference object used by std::vector<bool> for indexing. For instance: gl_sframe sf{{"a", {1,2,3,4,5}}, {"b", {"1","2","3","4","5"}}, {"c", {1.0,2.0,3.0,4.0,5.0}}}; gl_sarray t = sf["a"]; // takes out column "a" However, this operator can also be used for modifying existing columns, or creating new columns. For instance: gl_sframe sf{{"a", {1,2,3,4,5}}, {"b", {"1","2","3","4","5"}}, {"c", {1.0,2.0,3.0,4.0,5.0}}}; sf["a"] = sf["a"] + 1; // sf["a"] is now {2,3,4,5,6} sf["d"] = sf["c"] - 1; // sf["d"] is now {0.0,1.0,2.0,3.0,4.0} Entire constant columns can also be created the same way: gl_sframe sf{{"a", {1,2,3,4,5}}, {"b", {"1","2","3","4","5"}}, {"c", {1.0,2.0,3.0,4.0,5.0}}}; sf["ones"] = 1; Since the returned object is meant to be a short-lived reference, the following is not permitted: gl_sframe sf{{"a", {1,2,3,4,5}}, {"b", {"1","2","3","4","5"}}, {"c", {1.0,2.0,3.0,4.0,5.0}}}; auto a_col = sf["a"]; since "auto" resolves to gl_sarray_reference, which is intentionally not copy-constructible. For functional alternatives, see replace_add_column, add_column, add_column(const gl_sarray&, const std::string&), "add_column overload". | |
const_gl_sarray_reference | operator[] (const std::string &column) const |
gl_sarray_reference | operator[] (const std::string &column) |
Multi-Column Indexing | |
gl_sframe | operator[] (const std::vector< std::string > &columns) const |
gl_sframe | operator[] (const std::initializer_list< std::string > &columns) |
gl_sframe | operator[] (const std::initializer_list< std::string > &columns) const |
Friends | |
std::ostream & | operator<< (std::ostream &out, const gl_sframe &other) |
A tabular, column-mutable dataframe object that can scale to big data.
The data in gl_sframe is stored column-wise on persistent storage (e.g. disk) to avoid being constrained by memory size. Each column in a gl_sframe is an immutable gl_sarray, but gl_sframe objects are mutable in that columns can be added and removed with ease. A gl_sframe essentially acts as an ordered dictionary of gl_sarray objects. Usage:
The gl_sframe API is designed to very closely mimic the Python SFrame API and supports much of the Python-like capabilities, but in C++.
Column Creation And Referencing
Logical Filter:
Python Range Slicing:
And many others.
The gl_sframe can be read inefficiently using operator[]
Or iterated efficiently using the range_iterator
Note that using "auto" above is more efficient than using vector<flexible_type>
The range_iterator materializes the SFrame if not already materialized, but materialize_to_callback can be used to read the SFrame without materialization.
The gl_sframe can be constructed in a variety of ways:
When used as an input argument in an SDK function, it permits a Python SFrame to be passed as an argument. When used in an output argument, it will return a Python SFrame.
For instance:
Will allow this to be done in Python:
The gl_sframe is also lazily evaluated behind the scenes to minimize disk access. This may have the unfortunate effect of hiding errors until materialization is forced to occur, i.e. errors might only be triggered much later in your code.
However, not all operations are lazy and certain operations will force materialization, and that is a constant target for optimization.
If you want to force materialization yourself, use materialize()
Definition at line 492 of file gl_sframe.hpp.
|
explicit |
turi::gl_sframe::gl_sframe | ( | const std::map< std::string, std::vector< flexible_type > > & | data | ) |
Constructs a gl_sframe from an in-memory map of values
Or, more compactly using C++11 initializer lists:
turi::gl_sframe::gl_sframe | ( | const std::map< std::string, gl_sarray > & | data | ) |
turi::gl_sframe::gl_sframe | ( | std::initializer_list< std::pair< std::string, gl_sarray >> | ) |
|
virtual |
Add a column of identical values to this gl_sframe, raising an exception if a column with the same name already exists. This operation modifies the current gl_sframe in place. If no name is given, a default name is chosen.
data | The value to assign to each entry in the new column |
name | Optional. The name of the column. If no name is given, a default name is chosen. |
This is almost equivalent to using operator[] for column assignment, but raises an exception if overwriting a column with the same name.
Example:
Produces output:
Reimplemented in turi::gl_gframe.
|
virtual |
Add a column to this gl_sframe, raising an exception if a column with the same name already exists. The number of elements in the data given must match the length of every other column of the gl_sframe. This operation modifies the current gl_sframe in place. If no name is given, a default name is chosen.
data | The column of data to add. |
name | Optional. The name of the column. If no name is given, a default name is chosen. |
This is almost equivalent to using operator[] for column assignment, but raises an exception if overwriting a column with the same name.
Example:
Produces output:
Reimplemented in turi::gl_gframe.
|
virtual |
Adds multiple columns to this gl_sframe. The number of elements in all columns must match the length of every other column of the gl_sframe. This operation modifies the current gl_sframe in place.
data | The columns to add. |
Example:
Produces output:
Reimplemented in turi::gl_gframe.
gl_sframe turi::gl_sframe::add_row_number | ( | const std::string & | column_name = "id" , |
size_t | start = 0 |
||
) | const |
Returns a new gl_sframe with a new column that numbers each row sequentially. By default the count starts at 0, but this can be changed to a positive or negative number. The new column will be named with the given column name. An error will be raised if the given column name already exists in the gl_sframe.
column_name | Optional. The name of the new column that will hold the row numbers. |
start | Optional. The number used to start the row number count. |
Example:
Produces output:
Add the rows of a gl_sframe to the end of this gl_sframe. Both gl_sframe objects must have the same set of columns with the same column names and column types.
Example:
Produces output:
gl_sarray turi::gl_sframe::apply | ( | std::function< flexible_type(const sframe_rows::row &)> | fn, |
flex_type_enum | dtype | ||
) | const |
Maps each row of the gl_sframe by a given function to a single value. The result gl_sarray is of type "dtype". "fn" should be a function that returns exactly one value which can be cast into the type specified by "dtype".
fn | The function to transform each element. Must return exactly one value which can be cast into the type specified by "dtype". |
dtype | The data type of the new gl_sarray. |
Example:
Produces output:
size_t turi::gl_sframe::column_index | ( | const std::string & | column_name | ) | const |
Returns the index of column column_name.
const std::string& turi::gl_sframe::column_name | ( | size_t | index | ) | const |
Returns the name of column index.
|
virtual |
Returns the column names of the SFrame.
Reimplemented in turi::gl_gframe.
|
virtual |
Returns an array of types of each column.
Reimplemented in turi::gl_gframe.
void turi::gl_sframe::construct_from_csvs | ( | std::string | csv_file, |
csv_parsing_config_map | csv_config, | ||
str_flex_type_map | column_type_hints | ||
) |
Constructs a gl_sframe from a csv file
bool turi::gl_sframe::contains_column | ( | const std::string & | col_name | ) | const |
Returns true if the column is present in the sframe, and false otherwise.
gl_sframe turi::gl_sframe::dropna | ( | const std::vector< std::string > & | columns = std::vector< std::string >() , |
std::string | how = "any" , |
||
bool | recursive = false |
||
) | const |
Remove missing values from a gl_sframe. A missing value is either "FLEX_UNDEFINED" or "NaN". If "how" is "any", a row will be removed if any of the columns in the "columns" parameter contains at least one missing value. If "how" is "all", a row will be removed if all of the columns in the "columns" parameter are missing values. If the "columns" parameter is not specified, the default is to consider all columns when searching for missing values.
columns | Optional. The columns to use when looking for missing values. By default, all columns are used. |
how | Optional. Specifies whether a row should be dropped if at least one column has missing values, or if all columns have missing values. "any" is default. |
For instance
Produces output:
Produces output:
Example:
Produces output:
std::pair<gl_sframe, gl_sframe> turi::gl_sframe::dropna_split | ( | const std::vector< std::string > & | columns = std::vector< std::string >() , |
std::string | how = "any" , |
||
bool | recursive = false |
||
) | const |
Split rows with missing values from this gl_sframe. This function has the same functionality as dropna, but returns a tuple of two gl_sframe objects. The first item is the expected output from dropna, and the second item contains all the rows filtered out by the "dropna" algorithm.
columns | Optional. The columns to use when looking for missing values. By default, all columns are used. |
how | Optional. Specifies whether a row should be dropped if at least one column has missing values, or if all columns have missing values. "any" is default. |
recursive | Optional. If true, recursively checks whether a cell contains nan. This is handy for nested data structures such as lists and dictionaries. For instance, {{FLEX_UNDEFINED, 1}, {1}} will be treated as nan and will be removed if recursive is set to true. Otherwise it won't be treated as a nan-value. |
Example:
Produces output:
Example:
Produces output:
bool turi::gl_sframe::empty | ( | ) | const |
True if size() == 0.
gl_sframe turi::gl_sframe::fillna | ( | const std::string & | column, |
flexible_type | value | ||
) | const |
Fill all missing values with a given value in a given column. If the "value" is not the same type as the values in "column", this method attempts to convert the value to the original column's type. If this fails, an error is raised.
column | The name of the column to modify. |
value | The value used to replace all missing values. |
recursive | The recursive is used to set the manner of nan-value checking. If this value is true, a cell will be treated as missing value iff it contains nan. For instance, {{FLEX_UNDEFINED, 1}, {0}} and {FLEX_UNDEFINED, 1} will be all treated as nan-values. |
Example:
Produces output:
gl_sframe turi::gl_sframe::filter_by | ( | const gl_sarray & | values, |
const std::string & | column_name, | ||
bool | exclude = false |
||
) | const |
Filter a gl_sframe by values inside an iterable object. The result is a gl_sframe that only includes (or excludes) the rows that have a column with the given "column_name" which holds one of the values in the given "values" gl_sarray.
values | The values to use to filter the gl_sframe. The resulting gl_sframe will only include rows that have one of these values in the given column. |
column_name | The column of the gl_sframe to match with the given "values". |
exclude | Optional. Defaults to false. If true, the result gl_sframe will contain all rows except those that have one of "values" in "column_name". |
Example:
Produces output:
gl_sframe turi::gl_sframe::groupby | ( | const std::vector< std::string > & | groupkeys, |
const std::map< std::string, aggregate::groupby_descriptor_type > & | operators = std::map< std::string, aggregate::groupby_descriptor_type >() |
||
) | const |
Perform a group on the groupkeys columns followed by aggregations on the columns listed in operators. The operators parameter is a map that indicates which aggregation operators to use and which columns to use them on. The available operators are SUM, MAX, MIN, COUNT, AVG, VAR, STDV, CONCAT, SELECT_ONE, ARGMIN, ARGMAX, and QUANTILE. For convenience, aggregators MEAN, STD, and VARIANCE are available as synonyms for AVG, STDV, and VAR. See turi::aggregate for more detail on the aggregators.
groupkeys | Columns to group on. Type of key columns can be of any type other than dictionary. |
operators | Map of columns and aggregation operations. Each key is an output column name and each value is an aggregator. |
Suppose we have an SFrame (sf) with movie ratings by many users.
Compute the number of occurrences of each user.
Compute the mean and standard deviation of ratings per user.
Compute the movie with the minimum rating per user.
Compute the count, mean, and standard deviation of ratings per (user, time), automatically assigning output column names.
The groupby function can take a variable length list of aggregation specifiers so if we want the count and the 0.25 and 0.75 quantiles of ratings:
To put all items a user rated into one list value by their star rating:
To put all items and rating of a given user together into a dictionary value:
bool turi::gl_sframe::has_size | ( | ) | const |
Returns true if the size of the SFrame is known. If it is not known, calling size() may trigger materialization.
gl_sframe turi::gl_sframe::head | ( | size_t | n | ) | const |
bool turi::gl_sframe::is_materialized | ( | ) | const |
Returns whether or not the sframe has been materialized.
gl_sframe turi::gl_sframe::join | ( | const gl_sframe & | right, |
const std::vector< std::string > & | joinkeys, | ||
const std::string & | how = "inner" |
||
) | const |
Joins two gl_sframe objects. Merges the current (left) gl_sframe with the given (right) gl_sframe using a SQL-style equi-join operation by columns.
right | The gl_sframe to join. |
joinkeys | The column name(s) representing the set of join keys. Each row that has the same value in this set of columns will be merged together. |
how | Optional. The type of join to perform. "inner" is default.
|
Example:
Produces output:
gl_sframe turi::gl_sframe::join | ( | const gl_sframe & | right, |
const std::map< std::string, std::string > & | joinkeys, | ||
const std::string & | how = "inner" |
||
) | const |
Joins two gl_sframe objects. Merges the current (left) gl_sframe with the given (right) gl_sframe using a SQL-style equi-join operation by columns.
right | The gl_sframe to join. |
joinkeys | The column name(s) representing a map of join keys from left to right. Each key is taken as a column name on the left gl_sframe and each value is taken as the column name in the right gl_sframe. |
how | Optional. The type of join to perform. "inner" is default.
|
Example:
Produces output:
void turi::gl_sframe::materialize | ( | ) |
For an SFrame that is lazily evaluated, force this sframe to be persisted to disk, committing all lazily evaluated operations.
void turi::gl_sframe::materialize_to_callback | ( | std::function< bool(size_t, const std::shared_ptr< sframe_rows > &)> | callback, |
size_t | nthreads = (size_t)(-1) |
||
) |
Calls a callback function passing each row of the SFrame.
This does not materialize the array if not necessary.
The callback may be called in parallel, in which case the argument provides a thread number. The function should return false, but may return true at any time to quit the iteration process. It may also throw exceptions which will be forwarded to the caller of this function.
Each call to the callback passes:
The sframe_rows object looks like a vector<vector<flexible_type>>. i.e. to look at all the rows, you need to write:
callback | The callback to call |
nthreads | Number of threads. If not specified, #cpus is used |
|
virtual |
Returns the number of columns of the SFrame.
Reimplemented in turi::gl_gframe.
std::vector<flexible_type> turi::gl_sframe::operator[] | ( | int64_t | i | ) |
Returns the value at a particular array index; generally inefficient.
This returns the value of the array at a particular index. Will raise an exception if the index is out of bounds. This operation is generally inefficient: the range_iterator() is preferred.
std::vector<flexible_type> turi::gl_sframe::operator[] | ( | int64_t | i | ) | const |
Returns the value at a particular array index; generally inefficient.
This returns the value of the array at a particular index. Will raise an exception if the index is out of bounds. This operation is generally inefficient: the range_iterator() is preferred.
gl_sframe turi::gl_sframe::operator[] | ( | const std::initializer_list< int64_t > & | slice | ) |
Performs a slice Python style.
slice | A list of 2 or 3 values. If 2 values, this is interpreted as {start, end} indices, with an implicit value of step = 1. If 3 values, this is interpreted as {start, step, end}. Values at the positions [start, start+step, start+2*step, ...] are returned until end (exclusive) is reached. Negative start and end values are interpreted as offsets from the end of the array. |
Given a gl_sframe
Slicing a consecutive range:
Slicing a range with a step:
Using negative indexing:
gl_sframe turi::gl_sframe::operator[] | ( | const std::initializer_list< int64_t > & | slice | ) | const |
Performs a slice Python style.
slice | A list of 2 or 3 values. If 2 values, this is interpreted as {start, end} indices, with an implicit value of step = 1. If 3 values, this is interpreted as {start, step, end}. Values at the positions [start, start+step, start+2*step, ...] are returned until end (exclusive) is reached. Negative start and end values are interpreted as offsets from the end of the array. |
Given a gl_sframe
Slicing a consecutive range:
Slicing a range with a step:
Using negative indexing:
Performs a logical filter.
This function performs a logical filter: i.e. it subselects all the elements in this array where the corresponding value in the other array evaluates to true.
gl_sframe turi::gl_sframe::pack_columns | ( | const std::vector< std::string > & | columns, |
const std::string & | new_column_name, | ||
flex_type_enum | dtype = flex_type_enum::LIST , |
||
flexible_type | fill_na = FLEX_UNDEFINED |
||
) | const |
This is an overloaded member function, provided for convenience. It differs from the above function only in what argument(s) it accepts. Pack two or more columns of the current gl_sframe into one single column. The result is a new gl_sframe with the unaffected columns from the original gl_sframe plus the newly created column.
The type of the resulting column is decided by the "dtype" parameter. Allowed values for "dtype" are flex_type_enum::DICT , flex_type_enum::VECTOR or flex_type_enum::LIST
columns | A list of column names to be packed. There must be at least two columns to pack. |
new_column_name | Packed column name. |
dtype | Optional. The resulting packed column type. If not provided, dtype is list. |
fill_na | Optional. Value to fill into packed column if missing value is encountered. If packing to dictionary, "fill_na" is only applicable to dictionary values; missing keys are not replaced. |
Example: Suppose 'sf' is an SFrame that maintains business category information.
To pack all category columns into a list:
To pack all category columns into a dictionary:
gl_sframe turi::gl_sframe::pack_columns | ( | const std::string & | column_prefix, |
const std::string & | new_column_name, | ||
flex_type_enum | dtype = flex_type_enum::LIST , |
||
flexible_type | fill_na = FLEX_UNDEFINED |
||
) | const |
Pack two or more columns of the current gl_sframe with a common column name prefix into one single column. The result is a new gl_sframe with the unaffected columns from the original gl_sframe plus the newly created column.
The type of the resulting column is decided by the "dtype" parameter. Allowed values for "dtype" are flex_type_enum::DICT , flex_type_enum::VECTOR or flex_type_enum::LIST
column_prefix | Packs all columns with the given prefix. |
new_column_name | Packed column name. |
dtype | Optional. The resulting packed column type. If not provided, dtype is list. |
fill_na | Optional. Value to fill into packed column if missing value is encountered. If packing to dictionary, "fill_na" is only applicable to dictionary values; missing keys are not replaced. |
Example: Suppose 'sf' is an SFrame that maintains business category information.
To pack all category columns into a list:
To pack all category columns into a dictionary:
std::shared_ptr<model_base> turi::gl_sframe::plot | ( | ) | const |
Return a plot object of the SFrame (same visualization as show).
Randomly split the rows of a gl_sframe into two gl_sframe objects. The first gl_sframe contains M rows, sampled uniformly (without replacement) from the original gl_sframe. M is approximately the fraction times the original number of rows. The second gl_sframe contains the remaining rows of the original gl_sframe.
fraction | Approximate fraction of the rows to fetch for the first returned gl_sframe. Must be between 0 and 1. |
seed | Optional. Seed for the random number generator used to split. |
Example:
Produces output:
std::pair<gl_sframe, gl_sframe> turi::gl_sframe::random_split | ( | double | fraction, |
size_t | seed, | ||
bool | exact = false |
||
) | const |
Randomly split the rows of a gl_sframe into two gl_sframe objects. The first gl_sframe contains M rows, sampled uniformly (without replacement) from the original gl_sframe. M is approximately the fraction times the original number of rows. The second gl_sframe contains the remaining rows of the original gl_sframe.
fraction | Approximate fraction of the rows to fetch for the first returned gl_sframe. Must be between 0 and 1. |
seed | The random seed for the random number generator. Deterministic output is obtained if this is set to a constant. |
Example:
Produces output:
gl_sframe_range turi::gl_sframe::range_iterator | ( | size_t | start = 0 , |
size_t | end = (size_t)(-1) |
||
) | const |
Returns a one pass range object with begin() and end() iterators.
This will materialize the array.
See materialize_to_callback for a lazy version.
start | The starting index of the range |
end | The ending index of the range |
Or more compactly with C++11 syntax:
The range returned only supports one pass. The outcome of a second call to begin() is undefined after any iterator is advanced.
When iterating over a gl_sframe with many columns, if only a small number of columns are needed, there is a performance benefit to subselecting just those columns first before iterating.
i.e. if I only need columns "a" and "b" from the SFrame above:
|
virtual |
Remove a column from this gl_sframe. This operation modifies the current gl_sframe in place. Raises an exception if the column does not exist.
name | The name of the column to remove. |
Example:
Produces output:
Reimplemented in turi::gl_gframe.
|
virtual |
Rename the given columns. "old_to_new_names" is expected to be a map from old names to new names. This changes the names of the columns given as the keys and replaces them with the names given as the values. This operation modifies the current gl_sframe in place.
old_to_new_names | A map of {old-name, new-name} pairs |
Example:
Produces output:
Reimplemented in turi::gl_gframe.
|
virtual |
Add a column to this gl_sframe, replacing any column with the same name that already exists. The number of elements in the data given must match the length of every other column of the gl_sframe. This operation modifies the current gl_sframe in place. If no name is given, a default name is chosen.
data | The column of data to add. |
name | Optional. The name of the column. If no name is given, a default name is chosen. |
This is equivalent to using operator[] for column assignment.
Example:
Produces output:
gl_sframe turi::gl_sframe::sample | ( | double | fraction | ) | const |
Create a gl_sframe which contains a subsample of the current gl_sframe.
fraction | The fraction of the rows to fetch. Must be between 0 and 1. |
Example:
Produces output:
gl_sframe turi::gl_sframe::sample | ( | double | fraction, |
size_t | seed, | ||
bool | exact = false |
||
) | const |
Create a gl_sframe which contains a subsample of the current gl_sframe.
fraction | The fraction of the rows to fetch. Must be between 0 and 1. |
seed | The random seed for the random number generator. Deterministic output is obtained if this is set to a constant. |
Example:
Produces output:
void turi::gl_sframe::save | ( | const std::string & | path, |
const std::string & | format = "" |
||
) | const |
Saves the SFrame to file.
When format is "binary", the saved SFrame will be in a directory named with the path parameter. When format is "text" or "csv", it is saved as a single human-readable text file.
path | A local path or a remote URL. If format is 'text', it will be saved as a text file. If format is 'binary', a directory will be created at the location which will contain the SFrame. |
format | Optional. Either "binary", "csv" or "". Defaults to "". Format in which to save the SFrame. Binary saved SFrames can be loaded much faster and without any format conversion losses. If "csv", each row will be written as a single line in an output text file. If format is an empty string (default), we will try to infer the format from the given path. If the file name ends with "csv", or ".csv.gz", then the gl_sframe is saved in "csv" format; otherwise the gl_sframe is saved in 'binary' format. |
void turi::gl_sframe::save_reference | ( | const std::string & | path | ) | const |
Performs an incomplete save of an existing SFrame into a directory. This saved SFrame may reference SFrames in other locations in the same filesystem for certain columns/segments/etc.
Does not modify the current sframe.
gl_sarray turi::gl_sframe::select_column | ( | const std::string & | colname | ) | const |
Extracts one column of the gl_sframe.
This is equivalent to using operator[] for column indexing.
Equivalent to:
gl_sframe turi::gl_sframe::select_columns | ( | const std::vector< std::string > & | colnames | ) | const |
Extracts a collection of columns of the gl_sframe.
This is equivalent to using operator[] for selecting multiple columns
void turi::gl_sframe::show | ( | const std::string & | path_to_client | ) | const |
Show a visualization of the SFrame.
|
virtual |
Returns the number of rows of the SFrame.
This may trigger materialization in situations in which the size of the SFrame is not known, for instance, after a logical filter.
Reimplemented in turi::gl_gframe.
gl_sframe turi::gl_sframe::sort | ( | const std::string & | column, |
bool | ascending = true |
||
) | const |
Sort current gl_sframe by a single column, using the given sort order.
Only columns of type str, int, or float can be sorted.
column | The name of the column to be sorted. |
ascending | Optional. Sort all columns in the given order. |
Example:
Produces output:
Example:
Produces output:
gl_sframe turi::gl_sframe::sort | ( | const std::vector< std::string > & | columns, |
bool | ascending = true |
||
) | const |
This is an overloaded member function, provided for convenience. It differs from the above function only in what argument(s) it accepts.
Sort the current gl_sframe by multiple columns, using the given sort order.
columns | The names of the columns to be sorted. |
ascending | Optional. Sort all columns in the given order. |
The result will be sorted first by the first column, followed by the second column, and so on. All columns will be sorted in the same order as governed by the "ascending" parameter.
Example:
Produces output:
gl_sframe turi::gl_sframe::sort | ( | const std::initializer_list< std::string > & | columns, |
bool | ascending = true |
||
) | const |
This is an overloaded member function, provided for convenience. It differs from the above function only in what argument(s) it accepts.
gl_sframe turi::gl_sframe::sort | ( | const std::vector< std::pair< std::string, bool >> & | column_and_ascending | ) | const |
This is an overloaded member function, provided for convenience. It differs from the above function only in what argument(s) it accepts. Sort the current gl_sframe by multiple columns, using a different sort order for each column.
column_and_ascending | A list of (column name, sort order) pairs, where a sort order of true means ascending. |
To sort by column "a" ascending, and then by column "c" descending, pass the pairs ("a", true) and ("c", false). To control the sort ordering for each column individually, "column_and_ascending" must be a list of (str, bool) pairs. In this case, the first value is the column name and the second value is a boolean indicating whether the sort order is ascending.
Example:
Produces output:
gl_sframe turi::gl_sframe::split_datetime | ( | const std::string & | expand_column, |
const std::string & | column_name_prefix = "X" , |
||
const std::vector< std::string > & | limit = std::vector< std::string >() , |
||
bool | tzone = false |
||
) | const |
Splits a datetime column of a gl_sframe into multiple columns, with each value in a separate column. Returns a new gl_sframe with the column replaced by a list of new columns. The expanded column must be of datetime type. For more details regarding name generation and other behavior, refer to gl_sarray::split_datetime.
This function is a convenience function which is equivalent to calling gl_sarray::split_datetime on the column, deleting the column and adding the expanded columns back to the sframe.
expand_column | Name of the column to expand. |
column_name_prefix | Optional. If provided, expanded column names would start with the given prefix. If not provided, the default value is the name of the expanded column. |
limit | Optional. Limits the set of datetime elements to expand. Elements are 'year','month','day','hour','minute', and 'second'. |
tzone | Optional. A boolean parameter that determines whether to show the timezone column or not. Defaults to false. |
Example:
Produces output:
gl_sframe turi::gl_sframe::stack | ( | const std::string & | column_name, |
const std::string & | new_column_names, | ||
bool | drop_na = false |
||
) | const |
Convert a "wide" column of a gl_sframe to one or two "tall" columns by stacking all values.
The stack works only for columns of list or array type (for the dict type, see the overload). One new column is created as a result of stacking, where each row holds one element of the array or list value, and the remaining columns from the same original row are repeated.
The new gl_sframe includes the newly created column and all columns other than the one that is stacked.
column_name | The column to stack. For this overload, the column must be of list or array type. |
new_column_name | Optional. The new column name. If not given, column names are generated automatically. |
drop_na | Optional. Defaults to false. If true, missing values and empty list/array/dict are all dropped from the resulting column(s). If false, missing values are maintained in stacked column(s). |
Suppose 'sf' is an SFrame that contains a user and his/her friends, where the 'friends' column is of array type. Stacking on the 'friends' column would create a row for each user/friend pair:
Produces output:
gl_sframe turi::gl_sframe::stack | ( | const std::string & | column_name, |
const std::vector< std::string > & | new_column_names, | ||
bool | drop_na = false |
||
) | const |
Convert a "wide" column of a gl_sframe to one or two "tall" columns by stacking all values.
The stack works only for columns of dictionary type (for the list or array types, see the overload). Two new columns are created as a result of stacking: one column holds the key and another column holds the value. The rest of the columns are repeated for each key/value pair.
The new gl_sframe includes the newly created columns and all columns other than the one that is stacked.
column_name | The column to stack. For this overload, the column must be of dict type. |
new_column_names | Optional. The new column names. Must be a vector of 2 values corresponding to the "key" column and the "value" column. If not given, column names are generated automatically. |
drop_na | Optional. Defaults to false. If true, missing values and empty list/array/dict are all dropped from the resulting column(s). If false, missing values are maintained in stacked column(s). |
Suppose 'sf' is an SFrame that contains a column of dict type. Stack would stack all keys in one column and all values in another column:
Produces output:
|
virtual |
Swap the columns with the given names. This operation modifies the current gl_sframe in place. Raises an exception if the columns do not exist.
column_1 | Name of column to swap |
column_2 | Name of other column to swap |
Example:
Produces output:
Reimplemented in turi::gl_gframe.
gl_sframe turi::gl_sframe::tail | ( | size_t | n | ) | const |
gl_sframe turi::gl_sframe::topk | ( | const std::string & | column_name, |
size_t | k = 10 , |
||
bool | reverse = false |
||
) | const |
Get the top k rows according to the given column. The result is selected by, and sorted on, "column_name" in the given order (default is descending). When "k" is small, "topk" is more efficient than "sort".
column_name | The column to sort on |
k | Optional. Defaults to 10. The number of rows to return. |
reverse | Optional. Defaults to False. If true, return the top k rows in ascending order, otherwise, in descending order. |
Example:
Produces output:
Example:
Produces output:
gl_sframe turi::gl_sframe::unique | ( | ) | const |
Remove duplicate rows of the gl_sframe. Will not necessarily preserve the order of the given gl_sframe in the new gl_sframe.
Example:
Produces output:
gl_sframe turi::gl_sframe::unpack | ( | const std::string & | unpack_column, |
const std::string & | column_name_prefix = "X" , |
||
const std::vector< flex_type_enum > & | column_types = std::vector< flex_type_enum >() , |
||
const flexible_type & | na_value = FLEX_UNDEFINED , |
||
const std::vector< flexible_type > & | limit = std::vector< flexible_type >() |
||
) | const |
Expand one column of this gl_sframe to multiple columns with each value in a separate column. Returns a new gl_sframe with the unpacked column replaced by a list of new columns. The column must be of list/array/dict type. For more details regarding name generation, missing value handling, and other behavior, refer to gl_sarray::unpack.
unpack_column | Name of the unpacked column |
column_name_prefix | Optional. If provided, unpacked column names would start with the given prefix. Defaults to "X". If the empty string is used, no prefix is used. |
column_types | Optional. Column types for the unpacked columns. If not provided (the default is an empty vector), column types are automatically inferred from the first 100 rows. |
na_value | Optional. Convert all values that are equal to "na_value" to missing value if specified. |
limit | Optional. Limits the set of list/vector/dict keys to unpack. For list/vector gl_sarrays, "limit" must contain integer indices. For dict gl_sarrays, "limit" must contain dictionary keys. |
Example:
To unpack:
To not have a prefix in the generated column names:
To limit the subset of keys to unpack:
gl_sframe turi::gl_sframe::unstack | ( | const std::string & | columns, |
const std::string & | new_column_name = "" |
||
) | const |
Concatenate values from one column into one column, grouping by all other columns. The resulting column could be of type list or array. If "columns" names a numeric column, the result will be of vector type. If "columns" names a non-numeric column, the new column will be of list type.
columns | The column that is to be concatenated. If str, then collapsed column type is either array or list. |
new_column_name | Optional. New column name. If not given, a name is generated automatically. |
Example:
Produces output:
gl_sframe turi::gl_sframe::unstack | ( | const std::vector< std::string > & | columns, |
const std::string & | new_column_name = "" |
||
) | const |
Concatenate values from two columns into one column, grouping by all other columns. The new column will be of dict type where the keys are taken from the first column in the list, and the values are taken from the second column in the list.
columns | The columns that are to be concatenated. |
new_column_name | Optional. New column name. If not given, a name is generated automatically. |
Example:
Produces output:
|
friend |
Provides printing of the gl_sframe.