Turi Create  4.0
Main SFrame Objects

Namespaces

 turi::sframe_config
 

Classes

class  turi::sarray< T >
 
class  turi::sarray_block_iterator< DataType >
 
class  turi::sarray_iterator< T >
 
class  turi::sarray_reader< T >
 
class  turi::sarray_reader_buffer< T >
 
class  turi::sframe
 
struct  turi::sframe_index_file_information
 
class  turi::parallel_sframe_iterator_initializer
 
class  turi::parallel_sframe_iterator
 
class  turi::sframe_iterator
 
class  turi::sframe_reader
 
class  turi::sframe_reader_buffer
 
class  turi::sframe_rows
 

Functions

template<typename T >
sarray_block_iterator< T > turi::make_sarray_block_iterator (const std::shared_ptr< sarray< T > > &data)
 
sframe_index_file_information turi::read_sframe_index_file (std::string index_file)
 
void turi::write_sframe_index_file (std::string index_file, const sframe_index_file_information &info)
 
size_t turi::sframe_row_to_csv (const std::vector< flexible_type > &row, char *buf, size_t buflen)
 
void turi::sframe_row_to_json (const std::vector< std::string > &column_names, const std::vector< flexible_type > &column_values, JSONNode &node)
 
void turi::sframe_save_naive (const sframe &sf, std::string index_file)
 
void turi::sframe_save_blockwise (const sframe &sf, std::string index_file)
 
void turi::sframe_save (const sframe &sf, std::string index_file)
 
void turi::sframe_save_weak_reference (const sframe &sf, std::string index_file)
 
std::vector< sframe > turi::shuffle (sframe sframe_in, size_t n, std::function< size_t(const std::vector< flexible_type > &)> hash_fn, std::function< void(const std::vector< flexible_type > &, size_t)> emit_call_back=std::function< void(const std::vector< flexible_type > &, size_t)>())
 
size_t turi::sarray_reader< T >::read_rows (size_t row_start, size_t row_end, sframe_rows &out_obj)
 
value_type && turi::sarray_reader_buffer< T >::next ()
 Return the next element in the reader. More...
 

Variables

size_t turi::SFRAME_DEFAULT_NUM_SEGMENTS
 
const size_t turi::DEFAULT_SARRAY_READER_BUFFER_SIZE
 
const size_t turi::SARRAY_FROM_FILE_BATCH_SIZE
 
const size_t turi::MIN_SEGMENT_LENGTH
 
const size_t turi::SFRAME_WRITER_BUFFER_HARD_LIMIT
 
size_t turi::SFRAME_FILE_HANDLE_POOL_SIZE
 
const size_t turi::SFRAME_BLOCK_MANAGER_BLOCK_BUFFER_COUNT
 
const float turi::COMPRESSION_DISABLE_THRESHOLD
 
size_t turi::SFRAME_DEFAULT_BLOCK_SIZE
 
const size_t turi::SARRAY_WRITER_INITAL_ELEMENTS_PER_BLOCK
 
const size_t turi::SARRAY_WRITER_MIN_ELEMENTS_PER_BLOCK
 
size_t turi::SFRAME_WRITER_MAX_BUFFERED_CELLS_PER_BLOCK
 
size_t turi::SFRAME_WRITER_MAX_BUFFERED_CELLS
 
size_t turi::SFRAME_MAX_BLOCKS_IN_CACHE
 
size_t turi::SFRAME_CSV_PARSER_READ_SIZE
 
size_t turi::SFRAME_GROUPBY_BUFFER_NUM_ROWS
 
size_t turi::SFRAME_SHUFFLE_BUCKET_SIZE
 
size_t turi::SFRAME_JOIN_BUFFER_NUM_CELLS
 
size_t turi::SFRAME_IO_READ_LOCK
 
const size_t turi::SFRAME_IO_LOCK_FILE_SIZE_THRESHOLD
 
size_t turi::SFRAME_SORT_PIVOT_ESTIMATION_SAMPLE_SIZE
 
size_t turi::SFRAME_SORT_MAX_SEGMENTS
 
size_t turi::SFRAME_COMPACTION_THRESHOLD
 
size_t turi::FAST_COMPACT_BLOCKS_IN_SMALL_SEGMENT
 

Detailed Description

Function Documentation

◆ make_sarray_block_iterator()

template<typename T >
sarray_block_iterator<T> turi::make_sarray_block_iterator ( const std::shared_ptr< sarray< T > > &  data)

Creates a sarray block iterator; convenience function using automatic template matching.

Definition at line 245 of file sarray_iterators.hpp.

◆ next()

template<typename T >
T && turi::sarray_reader_buffer< T >::next ( )

Return the next element in the reader.

Return the next element in the chunk.

Definition at line 146 of file sarray_reader_buffer.hpp.

◆ read_rows()

template<typename T>
size_t turi::sarray_reader< T >::read_rows ( size_t  row_start,
size_t  row_end,
sframe_rows &  out_obj 
)
inline

Reads a collection of rows, storing the result in out_obj. This function is independent of the open_segment/read_segment/close_segment functions, and can be called anytime. This function is also fully concurrent.

Parameters
row_start: First row to read.
row_end: One past the last row to read (i.e. EXCLUSIVE). row_end can be beyond the end of the array, in which case fewer rows will be read.
out_obj: The output array.
Returns
Actual number of rows read. Return (size_t)(-1) on failure.
Note
This function is not always efficient. Different file format implementations will have different characteristics.

Definition at line 574 of file sarray_reader.hpp.

◆ read_sframe_index_file()

sframe_index_file_information turi::read_sframe_index_file ( std::string  index_file)

Reads an sframe index file from disk. Raises an exception on failure.

This function will also automatically de-relativize the sframe_index_file_information::column_files to get absolute paths

◆ sframe_row_to_csv()

size_t turi::sframe_row_to_csv ( const std::vector< flexible_type > &  row,
char *  buf,
size_t  buflen 
)

Write a csv string of a vector of flexible_types (as a row in the sframe) to buffer. Return the number of bytes written.

◆ sframe_row_to_json()

void turi::sframe_row_to_json ( const std::vector< std::string > &  column_names,
const std::vector< flexible_type > &  column_values,
JSONNode &  node 
)

Write column_names and column_values (as a row in the sframe) to JSONNode.

◆ sframe_save()

void turi::sframe_save ( const sframe &  sf,
std::string  index_file 
)

Automatically determines the optimal strategy to save an sframe

◆ sframe_save_blockwise()

void turi::sframe_save_blockwise ( const sframe &  sf,
std::string  index_file 
)

Saves an SFrame to another index file location using a more efficient method, block by block.

◆ sframe_save_naive()

void turi::sframe_save_naive ( const sframe &  sf,
std::string  index_file 
)

Saves an SFrame to another index file location using the most naive method: decode rows, and write them

◆ sframe_save_weak_reference()

void turi::sframe_save_weak_reference ( const sframe &  sf,
std::string  index_file 
)

Performs an "incomplete save" to a target index file location. All this ensures is that the sframe's contents are located on the same "file-system" (protocol) as the index file. Essentially the reference save is guaranteed to be valid for only as long as no other SFrame files are deleted.

Essentially this can be used to build a "delta" SFrame.

  • You already have an SFrame on disk somewhere. Say... /data/a
  • You open it and add a column
  • You call sframe_save_weak_reference to save it to /data/b
  • The saved SFrame in /data/b will include just the new column, but reference /data/a for the remaining columns.
Parameters
sf: The SFrame to save.
index_file: The output file location.

◆ shuffle()

std::vector<sframe> turi::shuffle ( sframe  sframe_in,
size_t  n,
std::function< size_t(const std::vector< flexible_type > &)>  hash_fn,
std::function< void(const std::vector< flexible_type > &, size_t)>  emit_call_back = std::function< void(const std::vector< flexible_type > &, size_t)>() 
)

Shuffle the rows in one sframe into a collection of n sframes. Each output SFrame contains one segment.

std::vector<sframe> ret(n);
for (auto& sf : ret) {
INIT_WITH_NAMES_COLUMNS_AND_ONE_SEG(sframe_in.column_names(), sframe_in.column_types());
}
for (auto& row : sframe_in) {
size_t idx = hash_fn(row) % n;
add_row_to_sframe(ret[idx], row); // the order of addition is not guaranteed.
}

The result sframes have the same column names and types (including empty sframes). A result sframe can have 0 rows if none of the rows in the input sframe is hashed to it. (If n is greater than the size of the input sframe, there will be at least (n - sframe_in.size()) empty sframes in the return vector.)

Parameters
n: The number of output sframes.
hash_fn: The hash function for each row in the input sframe.
Returns
A vector of n sframes.
Examples:
/build/src/core/storage/sframe_interface/unity_sframe.hpp.

◆ write_sframe_index_file()

void turi::write_sframe_index_file ( std::string  index_file,
const sframe_index_file_information &  info 
)

Writes an sframe index file to disk. Raises an exception on failure.

This function will also automatically relativize the sframe_index_file_information::column_files to get relative paths when writing to disk

Variable Documentation

◆ COMPRESSION_DISABLE_THRESHOLD

const float turi::COMPRESSION_DISABLE_THRESHOLD

If the post-compression size is less than this fraction of the pre-compression size, compression is disabled.

◆ DEFAULT_SARRAY_READER_BUFFER_SIZE

const size_t turi::DEFAULT_SARRAY_READER_BUFFER_SIZE

The default parsed buffer size used in the sarray_reader_buffer. The iterators returned by sarray_reader::begin(), sarray_reader::end(), sframe_reader::begin() and sframe_reader::end() also use this as the default parsed buffer size.

◆ FAST_COMPACT_BLOCKS_IN_SMALL_SEGMENT

size_t turi::FAST_COMPACT_BLOCKS_IN_SMALL_SEGMENT

If a segment contains less than this number of blocks, it is considered a small segment.

◆ MIN_SEGMENT_LENGTH

const size_t turi::MIN_SEGMENT_LENGTH

The minimum number of entries we want inside a segment (only used by join right now).

◆ SARRAY_FROM_FILE_BATCH_SIZE

const size_t turi::SARRAY_FROM_FILE_BATCH_SIZE

The number of rows read from a file in a batch when loading a file into an SArray. (a single column. NOT an sframe).

◆ SARRAY_WRITER_INITAL_ELEMENTS_PER_BLOCK

const size_t turi::SARRAY_WRITER_INITAL_ELEMENTS_PER_BLOCK

The initial number of elements in a block. This is used in sarray_group_format_writer_v2. This is the number of rows the writer will buffer at the start before issuing the first block write. After which, it will use the actual number of bytes written to try to estimate the number of rows to buffer before the next write. (essentially SFRAME_DEFAULT_BLOCK_SIZE / (average bytes per element)).

◆ SARRAY_WRITER_MIN_ELEMENTS_PER_BLOCK

const size_t turi::SARRAY_WRITER_MIN_ELEMENTS_PER_BLOCK

The minimum number of elements per block. Used in sarray_group_format_writer_v2. It will never write less than this number of elements into a block.

◆ SFRAME_BLOCK_MANAGER_BLOCK_BUFFER_COUNT

const size_t turi::SFRAME_BLOCK_MANAGER_BLOCK_BUFFER_COUNT

The default number of block buffers in the v0 block manager pool.

◆ SFRAME_COMPACTION_THRESHOLD

size_t turi::SFRAME_COMPACTION_THRESHOLD

The maximum number of segments an SFrame can have after which compaction will be attempted

◆ SFRAME_CSV_PARSER_READ_SIZE

size_t turi::SFRAME_CSV_PARSER_READ_SIZE

The amount to read from the file each time by the CSV parser. (this block is then parsed in parallel by a collection of threads)

◆ SFRAME_DEFAULT_BLOCK_SIZE

size_t turi::SFRAME_DEFAULT_BLOCK_SIZE

The default size of each block in the file. This is not strict: the sarray_group_format_writer_v2 will try to target blocks to be of this size, but the actual sizes may vary.

◆ SFRAME_DEFAULT_NUM_SEGMENTS

size_t turi::SFRAME_DEFAULT_NUM_SEGMENTS

The default number of segments created when an SFrame/SArray is opened for write (i.e. sarray::open_for_write and sframe::open_for_write). This default is used in numerous places, for instance as the default number of output segments from the sframe_csv_parser and the dataframe to sframe converter.

◆ SFRAME_FILE_HANDLE_POOL_SIZE

size_t turi::SFRAME_FILE_HANDLE_POOL_SIZE

The default number of handles in the v2 block manager pool.

◆ SFRAME_GROUPBY_BUFFER_NUM_ROWS

size_t turi::SFRAME_GROUPBY_BUFFER_NUM_ROWS

The number of elements to accumulate in a groupby batch until it has to flush.

◆ SFRAME_IO_LOCK_FILE_SIZE_THRESHOLD

const size_t turi::SFRAME_IO_LOCK_FILE_SIZE_THRESHOLD

If SFRAME_IO_READ_LOCK is set, then the IO LOCK is only used when the file size is greater than this value.

◆ SFRAME_IO_READ_LOCK

size_t turi::SFRAME_IO_READ_LOCK

Whether locks are used when reading from SFrames on local storage. Good for spinning disks, bad for SSDs.

◆ SFRAME_JOIN_BUFFER_NUM_CELLS

size_t turi::SFRAME_JOIN_BUFFER_NUM_CELLS

The number of bytes that a join algorithm is allowed to use during execution.

◆ SFRAME_MAX_BLOCKS_IN_CACHE

size_t turi::SFRAME_MAX_BLOCKS_IN_CACHE

The maximum number of data blocks that can be maintained in a reader's decoded cache

◆ SFRAME_SHUFFLE_BUCKET_SIZE

size_t turi::SFRAME_SHUFFLE_BUCKET_SIZE

The number of elements per bucket when performing a shuffle operation.

◆ SFRAME_SORT_MAX_SEGMENTS

size_t turi::SFRAME_SORT_MAX_SEGMENTS

The maximum number of segments we will try to partition the input SFrame into for external sort. The number is kept low initially to be sensitive to open file handle limits.

◆ SFRAME_SORT_PIVOT_ESTIMATION_SAMPLE_SIZE

size_t turi::SFRAME_SORT_PIVOT_ESTIMATION_SAMPLE_SIZE

Number of samples used to estimate the pivot positions to partition the data for sorting.

◆ SFRAME_WRITER_BUFFER_HARD_LIMIT

const size_t turi::SFRAME_WRITER_BUFFER_HARD_LIMIT

The number of rows to buffer before forcing to flush the buffer to disk. Used in shuffle operation.

◆ SFRAME_WRITER_MAX_BUFFERED_CELLS

size_t turi::SFRAME_WRITER_MAX_BUFFERED_CELLS

The maximum number of elements cached across all columns of the sarray_group writer. Once this is exceeded, flushes will happen even if the block size is still too small. This is maintained approximately. Essentially, this has the effect of setting SFRAME_WRITER_MAX_BUFFERED_CELLS_PER_BLOCK to SFRAME_WRITER_MAX_BUFFERED_CELLS / (#columns * #segments)

◆ SFRAME_WRITER_MAX_BUFFERED_CELLS_PER_BLOCK

size_t turi::SFRAME_WRITER_MAX_BUFFERED_CELLS_PER_BLOCK

The maximum number of elements per block. Used in sarray_group_format_writer_v2. It will never write more than this number of elements into a block.