Turi Create  4.0
turi::unity_sframe Class Reference

#include <core/storage/sframe_interface/unity_sframe.hpp>

Public Member Functions

 unity_sframe ()
 
 ~unity_sframe ()
 
void construct_from_dataframe (const dataframe_t &df) override
 
void construct_from_sframe (const sframe &sf)
 
void construct_from_sframe_index (std::string index_file) override
 
std::map< std::string, std::shared_ptr< unity_sarray_base > > construct_from_csvs (std::string url, std::map< std::string, flexible_type > parsing_config, std::map< std::string, flex_type_enum > column_type_hints) override
 
void save_frame (std::string target_directory) override
 
void save_frame_reference (std::string target_directory) override
 
void save_frame_by_index_file (std::string index_file)
 
void clear () override
 
size_t size () override
 
size_t num_columns () override
 
std::vector< flex_type_enum > dtype () override
 
flex_type_enum dtype (size_t column_index)
 
flex_type_enum dtype (const std::string &column_name)
 
std::vector< std::string > column_names () override
 
std::shared_ptr< unity_sframe_base > head (size_t nrows) override
 
size_t column_index (const std::string &name) override
 
const std::string & column_name (size_t index)
 
bool contains_column (const std::string &name)
 
dataframe_t _head (size_t nrows) override
 
std::shared_ptr< unity_sframe_base > tail (size_t nrows) override
 
dataframe_t _tail (size_t nrows) override
 
std::shared_ptr< unity_sarray_base > select_column (const std::string &name) override
 
std::shared_ptr< unity_sarray_base > select_column (size_t idx)
 
std::shared_ptr< unity_sframe_base > logical_filter (std::shared_ptr< unity_sarray_base > index) override
 
std::shared_ptr< unity_sframe_base > select_columns (const std::vector< std::string > &names) override
 
std::shared_ptr< unity_sframe_base > select_columns (const std::vector< size_t > &indices)
 
std::shared_ptr< unity_sframe_base > copy ()
 
void add_column (std::shared_ptr< unity_sarray_base > data, const std::string &name) override
 
void add_columns (std::list< std::shared_ptr< unity_sarray_base >> data_list, std::vector< std::string > name_vec) override
 
std::shared_ptr< unity_sarray_base > transform (const std::string &lambda, flex_type_enum type, bool skip_undefined, uint64_t seed) override
 
std::shared_ptr< unity_sarray_base > transform_native (const function_closure_info &lambda, flex_type_enum type, bool skip_undefined, uint64_t seed) override
 
std::shared_ptr< unity_sarray_base > transform_lambda (std::function< flexible_type(const sframe_rows::row &)> lambda, flex_type_enum type, uint64_t seed)
 
std::shared_ptr< unity_sframe_base > flat_map (const std::string &lambda, std::vector< std::string > output_column_names, std::vector< flex_type_enum > output_column_types, bool skip_undefined, uint64_t seed) override
 
void set_column_name (size_t i, std::string name) override
 
void remove_column (size_t i) override
 
void swap_columns (size_t i, size_t j) override
 
std::shared_ptr< sframe > get_underlying_sframe ()
 
std::shared_ptr< query_eval::planner_node > get_planner_node ()
 
void set_sframe (const std::shared_ptr< sframe > &sf_ptr)
 
void begin_iterator () override
 
std::vector< std::vector< flexible_type > > iterator_get_next (size_t len) override
 
void save_as_csv (const std::string &url, std::map< std::string, flexible_type > writing_config) override
 
std::list< std::shared_ptr< unity_sframe_base > > random_split (float percent, uint64_t random_seed, bool exact=false) override
 
std::shared_ptr< unity_sframe_base > shuffle () override
 
std::shared_ptr< unity_sframe_base > sample (float percent, uint64_t random_seed, bool exact=false) override
 
void materialize () override
 
bool is_materialized () override
 
std::string query_plan_string () override
 
bool has_size () override
 
std::shared_ptr< unity_sframe_base > groupby_aggregate (const std::vector< std::string > &key_columns, const std::vector< std::vector< std::string >> &group_columns, const std::vector< std::string > &group_output_columns, const std::vector< std::string > &group_operations) override
 
std::shared_ptr< unity_sframe_base > groupby_aggregate (const std::vector< std::string > &key_columns, const std::vector< std::vector< std::string >> &group_columns, const std::vector< std::string > &group_output_columns, const std::vector< std::shared_ptr< group_aggregate_value >> &group_operations)
 
std::shared_ptr< unity_sframe_base > append (std::shared_ptr< unity_sframe_base > other) override
 
std::shared_ptr< unity_sarray_base > pack_columns (const std::vector< std::string > &pack_column_names, const std::vector< std::string > &dict_key_names, flex_type_enum dtype, const flexible_type &fill_na) override
 
std::shared_ptr< unity_sframe_base > stack (const std::string &column_name, const std::vector< std::string > &new_column_names, const std::vector< flex_type_enum > &new_column_types, bool drop_na) override
 
std::shared_ptr< unity_sframe_base > copy_range (size_t start, size_t step, size_t end) override
 
std::list< std::shared_ptr< unity_sframe_base > > drop_missing_values (const std::vector< std::string > &column_names, bool all, bool split, bool recursive) override
 
std::list< std::shared_ptr< unity_sframe_base > > logical_filter_split (std::shared_ptr< unity_sarray_base > logical_filter_array)
 

Detailed Description

This is the SFrame object exposed to Python. It stores internally an sframe object which is a collection of named columns, each of flexible type. The SFrame represents a complete immutable collection of columns. Once created, it cannot be modified. However, shallow copies or sub-selection of columns can be created cheaply.

Internally it is simply a single shared_ptr to an sframe object. The sframe construction is delayed until one of the construct calls is made.

// construct
frame.construct(...)
// frame is now immutable.

The SFrame may require temporary on-disk storage which will be deleted on program termination. Temporary file names are obtained from turi::get_temp_name.

Definition at line 52 of file unity_sframe.hpp.

Constructor & Destructor Documentation

◆ unity_sframe()

turi::unity_sframe::unity_sframe ( )

Default constructor. Does nothing

◆ ~unity_sframe()

turi::unity_sframe::~unity_sframe ( )

Destructor. Calls clear().

Member Function Documentation

◆ _head()

dataframe_t turi::unity_sframe::_head ( size_t  nrows)
override

Same as head, returning dataframe.

◆ _tail()

dataframe_t turi::unity_sframe::_tail ( size_t  nrows)
override

Same as tail, returning dataframe.

◆ add_column()

void turi::unity_sframe::add_column ( std::shared_ptr< unity_sarray_base >  data,
const std::string &  name 
)
override

Mutates the current SFrame by adding the given column.

Throws an exception if:

  • The given column has a different number of rows than the SFrame.

◆ add_columns()

void turi::unity_sframe::add_columns ( std::list< std::shared_ptr< unity_sarray_base >>  data_list,
std::vector< std::string >  name_vec 
)
override

Mutates the current SFrame by adding the given columns.

Throws an exception if ANY given column cannot be added (for one of the reasons that add_column can fail).

Note
Currently leaves the SFrame in an unfinished state if one of the columns fails...the columns before that were added successfully will be there. This needs to be changed.

◆ append()

std::shared_ptr<unity_sframe_base> turi::unity_sframe::append ( std::shared_ptr< unity_sframe_base >  other)
override

Returns a new SFrame which contains all rows combined from the current SFrame and "other". The "other" SFrame has to have the same number of columns with the same column names and same column types as "this" SFrame.

◆ begin_iterator()

void turi::unity_sframe::begin_iterator ( )
override

Begin iteration through the SFrame.

Works together with iterator_get_next(). The usage pattern is as follows:

sframe.begin_iterator();
while(1) {
auto ret = sframe.iterator_get_next(64);
// do stuff
if (ret.size() < 64) {
// we are done
break;
}
}

Note that use of pretty much any of the other data-dependent SArray functions will invalidate the iterator.

◆ clear()

void turi::unity_sframe::clear ( )
override

Clears the contents of the SFrame.

◆ column_index()

size_t turi::unity_sframe::column_index ( const std::string &  name)
override

Returns the index of the column name

◆ column_name()

const std::string& turi::unity_sframe::column_name ( size_t  index)

Returns the name of the column in position index.

◆ column_names()

std::vector<std::string> turi::unity_sframe::column_names ( )
override

Returns an array containing the name of each column. The length of the return array is equal to num_columns(). If the sframe is empty, this returns an empty array.

◆ construct_from_csvs()

std::map<std::string, std::shared_ptr<unity_sarray_base> > turi::unity_sframe::construct_from_csvs ( std::string  url,
std::map< std::string, flexible_type >  parsing_config,
std::map< std::string, flex_type_enum >  column_type_hints 
)
override

Constructs an SFrame from one or more CSV files. To keep the interface stable, the CSV parsing configuration is read from a map of string->flexible_type called parsing_config. The URL can be a single filename or a directory name. When passing in a directory and the pattern is non-empty, we will attempt to treat it as a glob pattern.

The default parsing configuration is the following:

bool use_header = true;
tokenizer.delimiter = ",";
tokenizer.comment_char = '\0';
tokenizer.escape_char = '\\';
tokenizer.double_quote = true;
tokenizer.quote_char = '\"';
tokenizer.skip_initial_space = true;

The fields in parsing config are:

  • use_header : True if not is_zero()
  • delimiter : The entire delimiter string
  • comment_char : First character if flexible_type is a string
  • escape_char : First character if flexible_type is a string
  • double_quote : True if not is_zero()
  • quote_char : First character if flexible_type is a string
  • skip_initial_space : True if not is_zero()

◆ construct_from_dataframe()

void turi::unity_sframe::construct_from_dataframe ( const dataframe_t &  df)
override

Constructs an SFrame using a dataframe as input. Dataframe must not contain NaN values.

◆ construct_from_sframe()

void turi::unity_sframe::construct_from_sframe ( const sframe &  sf)

Constructs an SFrame using an sframe as input.

◆ construct_from_sframe_index()

void turi::unity_sframe::construct_from_sframe_index ( std::string  index_file)
override

Constructs an SFrame from an existing directory on disk saved with save_frame() or an on-disk sarray prefix (saved with save_frame_by_index_file()). This function will automatically detect if the location is a directory or a file. The files will not be deleted on destruction. If the current object is already storing a frame, it is cleared (clear()). May throw an exception on failure. If an exception occurs, the contents of the SFrame are empty.

◆ contains_column()

bool turi::unity_sframe::contains_column ( const std::string &  name)

Returns true if the column is present in the sframe, and false otherwise.

◆ copy()

std::shared_ptr<unity_sframe_base> turi::unity_sframe::copy ( )

Returns a lazy sframe which is a copy of the current one.

◆ copy_range()

std::shared_ptr<unity_sframe_base> turi::unity_sframe::copy_range ( size_t  start,
size_t  step,
size_t  end 
)
override

Extracts a range of rows from an SFrame as a new SFrame. This will extract rows beginning at start (inclusive) and ending at end (exclusive) in steps of "step". step must be at least 1.

◆ drop_missing_values()

std::list<std::shared_ptr<unity_sframe_base> > turi::unity_sframe::drop_missing_values ( const std::vector< std::string > &  column_names,
bool  all,
bool  split,
bool  recursive 
)
override

Returns a new SFrame with missing values dropped.

Missing values are only searched for in the columns specified in the 'column_names'. If this vector is empty, all columns will be considered. If 'all' is true, a row is only dropped if all specified columns contain a missing value. If false, the row is dropped if any of the specified columns contain a missing value.

If 'split' is true, this function returns two SFrames, the first being the SFrame with missing values dropped, and the second consisting of all the rows removed.

If 'recursive' is true, the NaN element check will be performed in a recursive manner to check each unit in a container-like flexible-typed cell in the SFrame.

Throws if the column names are not in this SFrame, or if too many are given.

◆ dtype() [1/3]

std::vector<flex_type_enum> turi::unity_sframe::dtype ( )
override

Returns an array containing the datatype of each column. The length of the return array is equal to num_columns(). If the sframe is empty, this returns an empty array.

◆ dtype() [2/3]

flex_type_enum turi::unity_sframe::dtype ( size_t  column_index)

Returns the dtype of a particular column.

◆ dtype() [3/3]

flex_type_enum turi::unity_sframe::dtype ( const std::string &  column_name)

Returns the dtype of a particular column.

◆ flat_map()

std::shared_ptr<unity_sframe_base> turi::unity_sframe::flat_map ( const std::string &  lambda,
std::vector< std::string >  output_column_names,
std::vector< flex_type_enum >  output_column_types,
bool  skip_undefined,
uint64_t  seed 
)
override

Returns a new sframe which is a transform of each row in the sframe using a Python lambda function pickled into a string.

◆ get_planner_node()

std::shared_ptr<query_eval::planner_node> turi::unity_sframe::get_planner_node ( )

Returns the underlying planner pointer

◆ get_underlying_sframe()

std::shared_ptr<sframe> turi::unity_sframe::get_underlying_sframe ( )

Returns the underlying shared_ptr to the sframe object.

◆ groupby_aggregate() [1/2]

std::shared_ptr<unity_sframe_base> turi::unity_sframe::groupby_aggregate ( const std::vector< std::string > &  key_columns,
const std::vector< std::vector< std::string >> &  group_columns,
const std::vector< std::string > &  group_output_columns,
const std::vector< std::string > &  group_operations 
)
override

Returns unity_sframe* where there is one row for each unique value of the key_column. group_operations is a collection of pairs of {column_name, operation_name} where operation_name is a builtin operator.

◆ groupby_aggregate() [2/2]

std::shared_ptr<unity_sframe_base> turi::unity_sframe::groupby_aggregate ( const std::vector< std::string > &  key_columns,
const std::vector< std::vector< std::string >> &  group_columns,
const std::vector< std::string > &  group_output_columns,
const std::vector< std::shared_ptr< group_aggregate_value >> &  group_operations 
)

This is an overloaded member function, provided for convenience. It differs from the above function only in what argument(s) it accepts.

◆ has_size()

bool turi::unity_sframe::has_size ( )
override

Return true if the sframe size is known.

◆ head()

std::shared_ptr<unity_sframe_base> turi::unity_sframe::head ( size_t  nrows)
override

Returns some number of rows of the SFrame in a dataframe representation. If nrows exceeds the number of rows in the SFrame ( size() ), this returns only size() rows.

◆ is_materialized()

bool turi::unity_sframe::is_materialized ( )
override

Returns whether or not this sframe is materialized

◆ iterator_get_next()

std::vector< std::vector<flexible_type> > turi::unity_sframe::iterator_get_next ( size_t  len)
override

Obtains the next block of elements of size len from the SFrame. Works together with begin_iterator(). See the code example in begin_iterator() for details.

This function will always return a vector of length 'len' unless at the end of the array, or if an error has occurred.

Parameters
len: The number of elements to return
Returns
The next collection of elements in the array. Returns fewer than len elements on end of file or failure.

◆ logical_filter()

std::shared_ptr<unity_sframe_base> turi::unity_sframe::logical_filter ( std::shared_ptr< unity_sarray_base >  index)
override

Returns a new SFrame which is filtered by a given logical column. The index array must be the same length as the current SFrame. An output SFrame is returned containing only the rows of the current SFrame where the corresponding element in the index array evaluates to true.

◆ logical_filter_split()

std::list<std::shared_ptr<unity_sframe_base> > turi::unity_sframe::logical_filter_split ( std::shared_ptr< unity_sarray_base >  logical_filter_array)

Similar to logical filter, but return both positive and negative rows.

Parameters
logical_filter_array: an sarray of the same size, containing only zeros and ones as values.

Returns a list of two sframes, with all positive rows going to the first one and all negative rows going to the second one.

◆ materialize()

void turi::unity_sframe::materialize ( )
override

Materializes the sframe. This is different from save(), as this is a temporary persist of all sarrays underneath the sframe to speed up some computation (for example, lambda). This will NOT create a new unity_sframe.

◆ num_columns()

size_t turi::unity_sframe::num_columns ( )
override

Returns the number of columns in the SFrame. Returns 0 if the sframe is empty.

◆ pack_columns()

std::shared_ptr<unity_sarray_base> turi::unity_sframe::pack_columns ( const std::vector< std::string > &  pack_column_names,
const std::vector< std::string > &  dict_key_names,
flex_type_enum  dtype,
const flexible_type &  fill_na 
)
override

Pack a subset of the columns of the current SFrame into one dictionary column, using the column name as the key in the dictionary and the value of the column as the value in the dictionary. Returns a new SFrame that includes the other non-packed columns plus the newly generated dict column. Missing values in the original column will not show up in the packed dictionary value.

Parameters
pack_column_names: list of column names to pack
dict_key_names: dictionary key names to give to the packed dictionary
dtype: the result SArray type. Missing values are maintained; they can be filled with the fill_na value if specified.
fill_na: the value to fill in when a missing value is encountered

Returns a new SArray that contains the newly packed column

◆ query_plan_string()

std::string turi::unity_sframe::query_plan_string ( )
override

Return the query plan as a string representation of a dot graph.

◆ random_split()

std::list<std::shared_ptr<unity_sframe_base> > turi::unity_sframe::random_split ( float  percent,
uint64_t  random_seed,
bool  exact = false 
)
override

Randomly split the sframe into two parts, with ratio = percent, and seed = random_seed.

Returns a list of size 2 of the unity_sframes resulting from the split.

◆ remove_column()

void turi::unity_sframe::remove_column ( size_t  i)
override

Remove the ith column.

◆ sample()

std::shared_ptr<unity_sframe_base> turi::unity_sframe::sample ( float  percent,
uint64_t  random_seed,
bool  exact = false 
)
override

Sample the rows of sframe uniformly with ratio = percent, and seed = random_seed.

Returns unity_sframe* containing the sampled rows.

◆ save_as_csv()

void turi::unity_sframe::save_as_csv ( const std::string &  url,
std::map< std::string, flexible_type >  writing_config 
)
override

Save the sframe to url in CSV format. To keep the interface stable, the CSV writing configuration is read from a map of string->flexible_type called writing_config.

The default writing configuration is the following:

writer.delimiter = ",";
writer.escape_char = '\\';
writer.double_quote = true;
writer.quote_char = '\"';
writer.use_quote_char = true;

For details on the meaning of each config see csv_writer

The fields in writing config are:

  • delimiter : First character if flexible_type is a string
  • escape_char : First character if flexible_type is a string
  • double_quote : True if not is_zero()
  • quote_char : First character if flexible_type is a string
  • use_quote_char : First character if flexible_type is a string

◆ save_frame()

void turi::unity_sframe::save_frame ( std::string  target_directory)
override

Saves a copy of the current sframe into a directory. Does not modify the current sframe.

◆ save_frame_by_index_file()

void turi::unity_sframe::save_frame_by_index_file ( std::string  index_file)

Saves a copy of the current sframe into a target location defined by an index file. Does not modify the current sframe.

◆ save_frame_reference()

void turi::unity_sframe::save_frame_reference ( std::string  target_directory)
override

Performs an incomplete save of an existing SFrame into a directory. This saved SFrame may reference SFrames in other locations in the same filesystem for certain columns/segments/etc.

Does not modify the current sframe.

◆ select_column() [1/2]

std::shared_ptr<unity_sarray_base> turi::unity_sframe::select_column ( const std::string &  name)
override

Returns an SArray with the column that corresponds to 'name'. Throws an exception if the name is not in the current SFrame.

◆ select_column() [2/2]

std::shared_ptr<unity_sarray_base> turi::unity_sframe::select_column ( size_t  idx)

Returns an SArray with the column that corresponds to index idx. Throws an exception if the name is not in the current SFrame.

◆ select_columns() [1/2]

std::shared_ptr<unity_sframe_base> turi::unity_sframe::select_columns ( const std::vector< std::string > &  names)
override

Returns a lazy sframe with the columns that have the given names. Throws an exception if ANY of the names given are not in the current SFrame.

◆ select_columns() [2/2]

std::shared_ptr<unity_sframe_base> turi::unity_sframe::select_columns ( const std::vector< size_t > &  indices)

Returns a lazy sframe with the columns given by the indices.

◆ set_column_name()

void turi::unity_sframe::set_column_name ( size_t  i,
std::string  name 
)
override

Set the ith column name.

Throws an exception if index out of bound or name already exists.

◆ set_sframe()

void turi::unity_sframe::set_sframe ( const std::shared_ptr< sframe > &  sf_ptr)

Sets the private shared pointer to an sframe.

◆ shuffle()

std::shared_ptr<unity_sframe_base> turi::unity_sframe::shuffle ( )
override

Randomly shuffles the sframe.

Returns a new unity_sframe containing the shuffled rows.

◆ size()

size_t turi::unity_sframe::size ( )
override

Returns the number of rows in the SFrame. Returns 0 if the SFrame is empty.

◆ stack()

std::shared_ptr<unity_sframe_base> turi::unity_sframe::stack ( const std::string &  column_name,
const std::vector< std::string > &  new_column_names,
const std::vector< flex_type_enum > &  new_column_types,
bool  drop_na 
)
override

Convert a dictionary column of the SFrame to two columns with first column as the key for the dictionary and second column as the value for the dictionary. Returns a new SFrame with the two newly created columns, plus all columns other than the stacked column. The values from those columns are duplicated for all rows created from the same original row.

Parameters
column_name: string. The column to stack. The name must come from the current SFrame and must be of dict type.
new_column_names: a list of str, optional. Must be of length two. The two column names to stack the dict value to. If not given, the names are automatically generated.
new_column_types: a list of types, optional. Must be of length two. The types for the newly created columns. If not given, defaults to [str, int].
drop_na: if true, missing values from the dictionary will be ignored. If false, for a missing dict value, one row will be created with the two new columns' values being missing values.

Returns a new unity_sframe with stacked columns

◆ swap_columns()

void turi::unity_sframe::swap_columns ( size_t  i,
size_t  j 
)
override

Swap the ith and jth columns.

◆ tail()

std::shared_ptr<unity_sframe_base> turi::unity_sframe::tail ( size_t  nrows)
override

Returns some number of rows from the end of the SFrame in a dataframe representation. If nrows exceeds the number of rows in the SFrame ( size() ), this returns only size() rows.

◆ transform()

std::shared_ptr<unity_sarray_base> turi::unity_sframe::transform ( const std::string &  lambda,
flex_type_enum  type,
bool  skip_undefined,
uint64_t  seed 
)
override

Returns a new sarray which is a transform of each row in the sframe using a Python lambda function pickled into a string.

◆ transform_lambda()

std::shared_ptr<unity_sarray_base> turi::unity_sframe::transform_lambda ( std::function< flexible_type(const sframe_rows::row &)>  lambda,
flex_type_enum  type,
uint64_t  seed 
)

Returns a new sarray which is a transform of each row in the sframe using the given std::function lambda.

◆ transform_native()

std::shared_ptr<unity_sarray_base> turi::unity_sframe::transform_native ( const function_closure_info &  lambda,
flex_type_enum  type,
bool  skip_undefined,
uint64_t  seed 
)
override

Returns a new sarray which is a transform of each row in the sframe using a Python lambda function pickled into a string.


The documentation for this class was generated from the following file: