Turi Create
4.0
|
Namespaces | |
turi::sframe_config | |
Classes | |
class | turi::sarray< T > |
class | turi::sarray_block_iterator< DataType > |
class | turi::sarray_iterator< T > |
class | turi::sarray_reader< T > |
class | turi::sarray_reader_buffer< T > |
class | turi::sframe |
struct | turi::sframe_index_file_information |
class | turi::parallel_sframe_iterator_initializer |
class | turi::parallel_sframe_iterator |
class | turi::sframe_iterator |
class | turi::sframe_reader |
class | turi::sframe_reader_buffer |
class | turi::sframe_rows |
Functions | |
template<typename T > | |
sarray_block_iterator< T > | turi::make_sarray_block_iterator (const std::shared_ptr< sarray< T > > &data) |
sframe_index_file_information | turi::read_sframe_index_file (std::string index_file) |
void | turi::write_sframe_index_file (std::string index_file, const sframe_index_file_information &info) |
size_t | turi::sframe_row_to_csv (const std::vector< flexible_type > &row, char *buf, size_t buflen) |
void | turi::sframe_row_to_json (const std::vector< std::string > &column_names, const std::vector< flexible_type > &column_values, JSONNode &node) |
void | turi::sframe_save_naive (const sframe &sf, std::string index_file) |
void | turi::sframe_save_blockwise (const sframe &sf, std::string index_file) |
void | turi::sframe_save (const sframe &sf, std::string index_file) |
void | turi::sframe_save_weak_reference (const sframe &sf, std::string index_file) |
std::vector< sframe > | turi::shuffle (sframe sframe_in, size_t n, std::function< size_t(const std::vector< flexible_type > &)> hash_fn, std::function< void(const std::vector< flexible_type > &, size_t)> emit_call_back=std::function< void(const std::vector< flexible_type > &, size_t)>()) |
size_t | turi::sarray_reader< T >::read_rows (size_t row_start, size_t row_end, sframe_rows &out_obj) |
value_type && | turi::sarray_reader_buffer< T >::next () |
Return the next element in the reader. More... | |
sarray_block_iterator<T> turi::make_sarray_block_iterator | ( | const std::shared_ptr< sarray< T > > & | data | ) |
Creates a sarray block iterator; convenience function using automatic template matching.
Definition at line 245 of file sarray_iterators.hpp.
T && turi::sarray_reader_buffer< T >::next | ( | ) |
Return the next element in the reader.
Return the next element in the chunk.
Definition at line 146 of file sarray_reader_buffer.hpp.
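size_t turi::sarray_reader< T >::read_rows | ( | size_t | row_start, |
size_t | row_end, | ||
sframe_rows & | out_obj | ||
) | inline |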
Reads a collection of rows, storing the result in out_obj. This function is independent of the open_segment/read_segment/close_segment functions, and can be called anytime. This function is also fully concurrent.
row_start | First row to read |
row_end | one past the last row to read (i.e. EXCLUSIVE). row_end can be beyond the end of the array, in which case, fewer rows will be read. |
out_obj | The output array |
Definition at line 574 of file sarray_reader.hpp.
sframe_index_file_information turi::read_sframe_index_file | ( | std::string | index_file | ) |
Reads an sframe index file from disk. Raises an exception on failure.
This function will also automatically de-relativize the sframe_index_file_information::column_files to get absolute paths
size_t turi::sframe_row_to_csv | ( | const std::vector< flexible_type > & | row, |
char * | buf, | ||
size_t | buflen | ||
) |
Write a csv string of a vector of flexible_types (as a row in the sframe) to buffer. Return the number of bytes written.
void turi::sframe_row_to_json | ( | const std::vector< std::string > & | column_names, |
const std::vector< flexible_type > & | column_values, | ||
JSONNode & | node | ||
) |
Write column_names and column_values (as a row in the sframe) to JSONNode.
void turi::sframe_save | ( | const sframe & | sf, |
std::string | index_file | ||
) |
Automatically determines the optimal strategy to save an sframe.
void turi::sframe_save_blockwise | ( | const sframe & | sf, |
std::string | index_file | ||
) |
Saves an SFrame to another index file location using a more efficient method, block by block.
void turi::sframe_save_naive | ( | const sframe & | sf, |
std::string | index_file | ||
) |
Saves an SFrame to another index file location using the most naive method: decode rows, and write them.
void turi::sframe_save_weak_reference | ( | const sframe & | sf, |
std::string | index_file | ||
) |
Performs an "incomplete save" to a target index file location. All this ensures is that the sframe's contents are located on the same "file-system" (protocol) as the index file. Essentially the reference save is guaranteed to be valid for only as long as no other SFrame files are deleted.
Essentially this can be used to build a "delta" SFrame.
sf | The SFrame to save |
index_file | The output file location |
std::vector<sframe> turi::shuffle | ( | sframe | sframe_in, |
size_t | n, | ||
std::function< size_t(const std::vector< flexible_type > &)> | hash_fn, | ||
std::function< void(const std::vector< flexible_type > &, size_t)> | emit_call_back = std::function< void(const std::vector< flexible_type > &, size_t)>() |
||
) |
Shuffle the rows in one sframe into a collection of n sframes. Each output SFrame contains one segment.
The result sframes have the same column names and types (including empty sframes). A result sframe can have 0 rows if none of the rows in the input sframe is hashed to it. (If n is greater than the size of the input sframe, there will be at least (n - sframe_in.size()) empty sframes in the return vector.)
n | the number of output sframes. |
hash_fn | the hash function for each row in the input sframe. |
void turi::write_sframe_index_file | ( | std::string | index_file, |
const sframe_index_file_information & | info | ||
) |
Writes an sframe index file to disk. Raises an exception on failure.
This function will also automatically relativize the sframe_index_file_information::column_files to get relative paths when writing to disk
const float turi::COMPRESSION_DISABLE_THRESHOLD |
If the post compression size is less than this fraction of the pre-compression size, compression is disabled.
const size_t turi::DEFAULT_SARRAY_READER_BUFFER_SIZE |
The default parsed buffer size used in the sarray_reader_buffer. The iterators returned by sarray_reader::begin() , sarray_reader::end(), sframe_reader::begin() and sframe_reader::end() also use this as the default parsed buffer size.
size_t turi::FAST_COMPACT_BLOCKS_IN_SMALL_SEGMENT |
If a segment contains less than this number of blocks, it is considered a small segment.
const size_t turi::MIN_SEGMENT_LENGTH |
The minimum number of entries we want inside a segment (only used by join right now).
const size_t turi::SARRAY_FROM_FILE_BATCH_SIZE |
The number of rows read from a file in a batch when loading a file into an SArray. (a single column. NOT an sframe).
const size_t turi::SARRAY_WRITER_INITAL_ELEMENTS_PER_BLOCK |
The initial number of elements in a block. This is used in sarray_group_format_writer_v2. This is the number of rows the writer will buffer at the start before issuing the first block write. After that, it will use the actual number of bytes written to try to estimate the number of rows to buffer before the next write (essentially SFRAME_DEFAULT_BLOCK_SIZE / (average bytes per element)).
const size_t turi::SARRAY_WRITER_MIN_ELEMENTS_PER_BLOCK |
The minimum number of elements per block. Used in sarray_group_format_writer_v2. It will never write less than this number of elements into a block.
const size_t turi::SFRAME_BLOCK_MANAGER_BLOCK_BUFFER_COUNT |
The default number of block buffers in the v0 block manager pool.
size_t turi::SFRAME_COMPACTION_THRESHOLD |
The maximum number of segments an SFrame can have after which compaction will be attempted
size_t turi::SFRAME_CSV_PARSER_READ_SIZE |
The amount to read from the file each time by the CSV parser. (this block is then parsed in parallel by a collection of threads)
size_t turi::SFRAME_DEFAULT_BLOCK_SIZE |
The default size of each block in the file. This is not strict: the sarray_group_format_writer_v2 will try to target blocks to be of this size, but the actual sizes may vary.
size_t turi::SFRAME_DEFAULT_NUM_SEGMENTS |
The default number of segments created when an SFrame/SArray is opened for write (i.e. sarray::open_for_write and sframe::open_for_write). This default is used in numerous places, for instance as the default number of output segments from the sframe_csv_parser and the dataframe to sframe converter.
size_t turi::SFRAME_FILE_HANDLE_POOL_SIZE |
The default number of handles in the v2 block manager pool.
size_t turi::SFRAME_GROUPBY_BUFFER_NUM_ROWS |
The number of elements to accumulate in a groupby batch until it has to flush.
const size_t turi::SFRAME_IO_LOCK_FILE_SIZE_THRESHOLD |
If SFRAME_IO_READ_LOCK is set, then the IO LOCK is only used when the file size is greater than this value.
size_t turi::SFRAME_IO_READ_LOCK |
Whether locks are used when reading from SFrames on local storage. Good for spinning disks, bad for SSDs.
size_t turi::SFRAME_JOIN_BUFFER_NUM_CELLS |
The number of bytes that a join algorithm is allowed to use during execution.
size_t turi::SFRAME_MAX_BLOCKS_IN_CACHE |
The maximum number of data blocks that can be maintained in a reader's decoded cache
size_t turi::SFRAME_SHUFFLE_BUCKET_SIZE |
The number of elements per bucket when performing a shuffle operation.
size_t turi::SFRAME_SORT_MAX_SEGMENTS |
The maximum number of segments we will try to partition the input SFrame into for external sort. Number kept low initially to be sensitive to open file handle limits.
size_t turi::SFRAME_SORT_PIVOT_ESTIMATION_SAMPLE_SIZE |
Number of samples used to estimate the pivot positions to partition the data for sorting.
const size_t turi::SFRAME_WRITER_BUFFER_HARD_LIMIT |
The number of rows to buffer before forcing to flush the buffer to disk. Used in shuffle operation.
size_t turi::SFRAME_WRITER_MAX_BUFFERED_CELLS |
The maximum number of elements cached across all columns of the sarray_group writer. Once this is exceeded, flushes will happen even if the block size is still too small. This is maintained approximately. Essentially, this has the effect of setting SFRAME_WRITER_MAX_BUFFERED_CELLS_PER_BLOCK to SFRAME_WRITER_MAX_BUFFERED_CELLS / (#columns * #segments)
size_t turi::SFRAME_WRITER_MAX_BUFFERED_CELLS_PER_BLOCK |
The maximum number of elements per block. Used in sarray_group_format_writer_v2. It will never write more than this number of elements into a block.