Turi Create
4.0
|
#include <core/storage/sframe_interface/unity_sframe.hpp>
Public Member Functions | |
unity_sframe () | |
~unity_sframe () | |
void | construct_from_dataframe (const dataframe_t &df) override |
void | construct_from_sframe (const sframe &sf) |
void | construct_from_sframe_index (std::string index_file) override |
std::map< std::string, std::shared_ptr< unity_sarray_base > > | construct_from_csvs (std::string url, std::map< std::string, flexible_type > parsing_config, std::map< std::string, flex_type_enum > column_type_hints) override |
void | save_frame (std::string target_directory) override |
void | save_frame_reference (std::string target_directory) override |
void | save_frame_by_index_file (std::string index_file) |
void | clear () override |
size_t | size () override |
size_t | num_columns () override |
std::vector< flex_type_enum > | dtype () override |
flex_type_enum | dtype (size_t column_index) |
flex_type_enum | dtype (const std::string &column_name) |
std::vector< std::string > | column_names () override |
std::shared_ptr< unity_sframe_base > | head (size_t nrows) override |
size_t | column_index (const std::string &name) override |
const std::string & | column_name (size_t index) |
bool | contains_column (const std::string &name) |
dataframe_t | _head (size_t nrows) override |
std::shared_ptr< unity_sframe_base > | tail (size_t nrows) override |
dataframe_t | _tail (size_t nrows) override |
std::shared_ptr< unity_sarray_base > | select_column (const std::string &name) override |
std::shared_ptr< unity_sarray_base > | select_column (size_t idx) |
std::shared_ptr< unity_sframe_base > | logical_filter (std::shared_ptr< unity_sarray_base > index) override |
std::shared_ptr< unity_sframe_base > | select_columns (const std::vector< std::string > &names) override |
std::shared_ptr< unity_sframe_base > | select_columns (const std::vector< size_t > &indices) |
std::shared_ptr< unity_sframe_base > | copy () |
void | add_column (std::shared_ptr< unity_sarray_base >data, const std::string &name) override |
void | add_columns (std::list< std::shared_ptr< unity_sarray_base >> data_list, std::vector< std::string > name_vec) override |
std::shared_ptr< unity_sarray_base > | transform (const std::string &lambda, flex_type_enum type, bool skip_undefined, uint64_t seed) override |
std::shared_ptr< unity_sarray_base > | transform_native (const function_closure_info &lambda, flex_type_enum type, bool skip_undefined, uint64_t seed) override |
std::shared_ptr< unity_sarray_base > | transform_lambda (std::function< flexible_type(const sframe_rows::row &)> lambda, flex_type_enum type, uint64_t seed) |
std::shared_ptr< unity_sframe_base > | flat_map (const std::string &lambda, std::vector< std::string > output_column_names, std::vector< flex_type_enum > output_column_types, bool skip_undefined, uint64_t seed) override |
void | set_column_name (size_t i, std::string name) override |
void | remove_column (size_t i) override |
void | swap_columns (size_t i, size_t j) override |
std::shared_ptr< sframe > | get_underlying_sframe () |
std::shared_ptr< query_eval::planner_node > | get_planner_node () |
void | set_sframe (const std::shared_ptr< sframe > &sf_ptr) |
void | begin_iterator () override |
std::vector< std::vector< flexible_type > > | iterator_get_next (size_t len) override |
void | save_as_csv (const std::string &url, std::map< std::string, flexible_type > writing_config) override |
std::list< std::shared_ptr< unity_sframe_base > > | random_split (float percent, uint64_t random_seed, bool exact=false) override |
std::shared_ptr< unity_sframe_base > | shuffle () override |
std::shared_ptr< unity_sframe_base > | sample (float percent, uint64_t random_seed, bool exact=false) override |
void | materialize () override |
bool | is_materialized () override |
std::string | query_plan_string () override |
bool | has_size () override |
std::shared_ptr< unity_sframe_base > | groupby_aggregate (const std::vector< std::string > &key_columns, const std::vector< std::vector< std::string >> &group_columns, const std::vector< std::string > &group_output_columns, const std::vector< std::string > &group_operations) override |
std::shared_ptr< unity_sframe_base > | groupby_aggregate (const std::vector< std::string > &key_columns, const std::vector< std::vector< std::string >> &group_columns, const std::vector< std::string > &group_output_columns, const std::vector< std::shared_ptr< group_aggregate_value >> &group_operations) |
std::shared_ptr< unity_sframe_base > | append (std::shared_ptr< unity_sframe_base > other) override |
std::shared_ptr< unity_sarray_base > | pack_columns (const std::vector< std::string > &pack_column_names, const std::vector< std::string > &dict_key_names, flex_type_enum dtype, const flexible_type &fill_na) override |
std::shared_ptr< unity_sframe_base > | stack (const std::string &column_name, const std::vector< std::string > &new_column_names, const std::vector< flex_type_enum > &new_column_types, bool drop_na) override |
std::shared_ptr< unity_sframe_base > | copy_range (size_t start, size_t step, size_t end) override |
std::list< std::shared_ptr< unity_sframe_base > > | drop_missing_values (const std::vector< std::string > &column_names, bool all, bool split, bool recursive) override |
std::list< std::shared_ptr< unity_sframe_base > > | logical_filter_split (std::shared_ptr< unity_sarray_base > logical_filter_array) |
This is the SFrame object exposed to Python. It stores internally an sframe object which is a collection of named columns, each of flexible type. The SFrame represents a complete immutable collection of columns. Once created, it cannot be modified. However, shallow copies or sub-selection of columns can be created cheaply.
Internally it is simply a single shared_ptr to an sframe object. The sframe construction is delayed until one of the construct calls is made.
The SFrame may require temporary on disk storage which will be deleted on program termination. Temporary file names are obtained from turi::get_temp_name
Definition at line 52 of file unity_sframe.hpp.
turi::unity_sframe::unity_sframe | ( | ) |
Default constructor. Does nothing
turi::unity_sframe::~unity_sframe | ( | ) |
Destructor. Calls clear().
|
override |
Same as head, returning dataframe.
|
override |
Same as tail, returning dataframe.
|
override |
Mutates the current SFrame by adding the given column.
Throws an exception if:
|
override |
Mutates the current SFrame by adding the given columns.
Throws an exception if ANY given column cannot be added (for one of the reasons that add_column can fail).
|
override |
Returns a new SFrame which contains all rows combined from the current SFrame and "other". The "other" SFrame has to have the same number of columns with the same column names and same column types as "this" SFrame.
|
override |
Begin iteration through the SFrame.
Works together with iterator_get_next(). The usage pattern is as follows:
Note that use of pretty much any of the other data-dependent SArray functions will invalidate the iterator.
|
override |
Clears the contents of the SFrame.
|
override |
Returns the index of the column name
const std::string& turi::unity_sframe::column_name | ( | size_t | index | ) |
Returns the name of the column in position index.
|
override |
Returns an array containing the name of each column. The length of the return array is equal to num_columns(). If the sframe is empty, this returns an empty array.
|
override |
Constructs an SFrame from one or more csv files. To keep the interface stable, the CSV parsing configuration is read from a map of string->flexible_type called parsing_config. The URL can be a single filename or a directory name. When passing in a directory and the pattern is non-empty, we will attempt to treat it as a glob pattern.
The default parsing configuration is the following:
The fields in parsing config are:
|
override |
Constructs an SFrame using a dataframe as input. Dataframe must not contain NaN values.
void turi::unity_sframe::construct_from_sframe | ( | const sframe & | sf | ) |
Constructs an SFrame using an sframe as input.
|
override |
Constructs an SFrame from an existing directory on disk saved with save_frame() or an on-disk sarray prefix (saved with save_frame_by_index_file()). This function will automatically detect if the location is a directory, or a file. The files will not be deleted on destruction. If the current object is already storing a frame, it is cleared (clear()). May throw an exception on failure. If an exception occurs, the contents of the SFrame are empty.
bool turi::unity_sframe::contains_column | ( | const std::string & | name | ) |
Returns true if the column is present in the sframe, and false otherwise.
std::shared_ptr<unity_sframe_base> turi::unity_sframe::copy | ( | ) |
Returns a lazy sframe which is a copy of the current one.
|
override |
Extracts a range of rows from an SFrame as a new SFrame. This will extract rows beginning at start (inclusive) and ending at end(exclusive) in steps of "step". step must be at least 1.
|
override |
Returns a new SFrame with missing values dropped.
Missing values are only searched for in the columns specified in the 'column_names'. If this vector is empty, all columns will be considered. If 'all' is true, a row is only dropped if all specified columns contain a missing value. If false, the row is dropped if any of the specified columns contain a missing value.
If 'split' is true, this function returns two SFrames, the first being the SFrame with missing values dropped, and the second consisting of all the rows removed.
If 'recursive' is true, the nan element check will be performed in a recursive manner to check each unit in a container-like flexible-typed cell in the SFrame.
Throws if the column names are not in this SFrame, or if too many are given.
|
override |
Returns an array containing the datatype of each column. The length of the return array is equal to num_columns(). If the sframe is empty, this returns an empty array.
flex_type_enum turi::unity_sframe::dtype | ( | size_t | column_index | ) |
Returns the dtype of a particular column.
flex_type_enum turi::unity_sframe::dtype | ( | const std::string & | column_name | ) |
Returns the dtype of a particular column.
|
override |
Returns a new sarray which is a transform of each row in the sframe using a Python lambda function pickled into a string.
std::shared_ptr<query_eval::planner_node> turi::unity_sframe::get_planner_node | ( | ) |
Returns the underlying planner pointer
std::shared_ptr<sframe> turi::unity_sframe::get_underlying_sframe | ( | ) |
Returns the underlying shared_ptr to the sframe object.
|
override |
Returns unity_sframe* where there is one row for each unique value of the key_column. group_operations is a collection of pairs of {column_name, operation_name} where operation_name is a builtin operator.
std::shared_ptr<unity_sframe_base> turi::unity_sframe::groupby_aggregate | ( | const std::vector< std::string > & | key_columns, |
const std::vector< std::vector< std::string >> & | group_columns, | ||
const std::vector< std::string > & | group_output_columns, | ||
const std::vector< std::shared_ptr< group_aggregate_value >> & | group_operations | ||
) |
This is an overloaded member function, provided for convenience. It differs from the above function only in what argument(s) it accepts.
|
override |
Return true if the sframe size is known.
|
override |
|
override |
Returns whether or not this sframe is materialized
|
override |
Obtains the next block of elements of size len from the SFrame. Works together with begin_iterator(). See the code example in begin_iterator() for details.
This function will always return a vector of length 'len' unless at the end of the array, or if an error has occurred.
len | The number of elements to return |
|
override |
Returns a new SFrame which is filtered by a given logical column. The index array must be the same length as the current SFrame. An output SFrame is returned containing only the rows of the current SFrame for which the corresponding element in the index array evaluates to true.
std::list<std::shared_ptr<unity_sframe_base> > turi::unity_sframe::logical_filter_split | ( | std::shared_ptr< unity_sarray_base > | logical_filter_array | ) |
Similar to logical filter, but return both positive and negative rows.
logical_filter_array | is an sarray of the same size, and has only zeros and ones as value. |
Returns a list of two sframes: all positive rows go to the first one and all negative rows go to the second one.
|
override |
Materializes the sframe. This is different from save(), as this is a temporary persist of all sarrays underneath the sframe to speed up some computation (for example, lambda). This will NOT create a new unity_sframe.
|
override |
Returns the number of columns in the SFrame. Returns 0 if the sframe is empty.
|
override |
Pack a subset of the columns of the current SFrame into one dictionary column, using column name as key in the dictionary, and value of the column as value in the dictionary, returns a new SFrame that includes other non-packed columns plus the newly generated dict column. Missing values in the original columns will not show up in the packed dictionary value.
pack_column_names | : list of column names to pack |
dict_key_names | : dictionary key name to give to the packed dictionary |
dtype | the result SArray type. Missing values are maintained; they can be filled with the fill_na value if it is specified. |
fill_na | the value to fill when missing value is encountered |
Returns a new SArray that contains the newly packed column
|
override |
Return the query plan as a string representation of a dot graph.
|
override |
Randomly split the sframe into two parts, with ratio = percent, and seed = random_seed.
Returns a list of size 2 of the unity_sframes resulting from the split.
|
override |
Remove the ith column.
|
override |
Sample the rows of sframe uniformly with ratio = percent, and seed = random_seed.
Returns unity_sframe* containing the sampled rows.
|
override |
Save the sframe to url in csv format. To keep the interface stable, the CSV writing configuration is read from a map of string->flexible_type called writing_config.
The default writing configuration is the following:
For details on the meaning of each config see csv_writer
The fields in parsing config are:
|
override |
Saves a copy of the current sframe into a directory. Does not modify the current sframe.
void turi::unity_sframe::save_frame_by_index_file | ( | std::string | index_file | ) |
Saves a copy of the current sframe into a target location defined by an index file. Does not modify the current sframe.
|
override |
Performs an incomplete save of an existing SFrame into a directory. This saved SFrame may reference SFrames in other locations in the same filesystem for certain columns/segments/etc.
Does not modify the current sframe.
|
override |
Returns an SArray with the column that corresponds to 'name'. Throws an exception if the name is not in the current SFrame.
std::shared_ptr<unity_sarray_base> turi::unity_sframe::select_column | ( | size_t | idx | ) |
Returns an SArray with the column that corresponds to index idx. Throws an exception if the name is not in the current SFrame.
|
override |
Returns a lazy sframe with the columns that have the given names. Throws an exception if ANY of the names given are not in the current SFrame.
std::shared_ptr<unity_sframe_base> turi::unity_sframe::select_columns | ( | const std::vector< size_t > & | indices | ) |
Returns a lazy sframe with the columns given by the indices.
|
override |
Set the ith column name.
Throws an exception if index out of bound or name already exists.
void turi::unity_sframe::set_sframe | ( | const std::shared_ptr< sframe > & | sf_ptr | ) |
Sets the private shared pointer to an sframe.
|
override |
Randomly shuffles the sframe.
Returns unity_sframe* containing the shuffled rows.
|
override |
Returns the number of rows in the SFrame. Returns 0 if the SFrame is empty.
|
override |
Convert a dictionary column of the SFrame to two columns with first column as the key for the dictionary and second column as the value for the dictionary. Returns a new SFrame with the two newly created columns, plus all columns other than the stacked column. The values from those columns are duplicated for all rows created from the same original row.
column_name | string The column to stack. The name must come from current SFrame and must be of dict type |
new_column_names | a list of str, optional Must be length of two. The two column names to stack the dict value to. If not given, the name is automatically generated. |
new_column_types | a list of types, optional Must be length of two. The types for the newly created columns. If not given, defaults to [str, int]. |
drop_na | if true, missing values from dictionary will be ignored. If false, for missing dict value, one row will be created with the two new columns' value being missing value |
Returns a new unity_sframe with stacked columns
|
override |
Swap the ith and jth columns.
|
override |
|
override |
Returns a new sarray which is a transform of each row in the sframe using a Python lambda function pickled into a string.
std::shared_ptr<unity_sarray_base> turi::unity_sframe::transform_lambda | ( | std::function< flexible_type(const sframe_rows::row &)> | lambda, |
flex_type_enum | type, | ||
uint64_t | seed | ||
) |
Returns a new sarray which is a transform of each row in the sframe using a Python lambda function pickled into a string.
|
override |
Returns a new sarray which is a transform of each row in the sframe using a Python lambda function pickled into a string.