Turi Create  4.0
sframe_constants.hpp
1 /* Copyright © 2017 Apple Inc. All rights reserved.
2  *
3  * Use of this source code is governed by a BSD-3-clause license that can
4  * be found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause
5  */
6 #ifndef TURI_SFRAME_CONSTANTS_HPP
7 #define TURI_SFRAME_CONSTANTS_HPP
8 #include <cstddef>
9 #include <string>
10 namespace turi {
11 
12 
13 /**
14  * \ingroup sframe_physical
15  * \addtogroup sframe_main Main SFrame Objects
16  * \{
17  */
18 
19 /**
20  * The default number of segments created when an SFrame/SArray is
21  * opened for write
22  * (i.e. \ref sarray::open_for_write and \ref sframe::open_for_write).
23  * This default is used in numerous places, for instance as the default
24  * number of output segments from the sframe_csv_parser and the dataframe to
25  * sframe converter.
26  */
27 extern size_t SFRAME_DEFAULT_NUM_SEGMENTS;
28 
29 /**
30  * The default parsed buffer size used in the \ref sarray_reader_buffer.
31  * The iterators returned by \ref sarray_reader::begin(), \ref sarray_reader::end(),
32  * \ref sframe_reader::begin() and \ref sframe_reader::end() also use this
33  * as the default parsed buffer size.
34  */
35 extern const size_t DEFAULT_SARRAY_READER_BUFFER_SIZE;
36 
37 /**
38  * The number of rows read from a file in a batch when loading a file into
39  * an SArray (a single column, NOT an SFrame).
40  */
41 extern const size_t SARRAY_FROM_FILE_BATCH_SIZE;
42 
43 /**
44  * The minimum number of entries we want inside a segment
45  * (currently only used by join).
46  */
47 extern const size_t MIN_SEGMENT_LENGTH;
48 
49 /**
50  * The number of rows to buffer before trying to flush the buffer to disk.
51  * Used in the shuffle operation.
52  */
53 extern const size_t SFRAME_WRITER_BUFFER_SOFT_LIMIT;
54 
55 /**
56  * The number of rows to buffer before forcing a flush of the buffer to disk.
57  * Used in the shuffle operation.
58  */
59 extern const size_t SFRAME_WRITER_BUFFER_HARD_LIMIT;
60 
61 /**
62  * The default number of handles in the v2 block manager pool.
63  */
64 extern size_t SFRAME_FILE_HANDLE_POOL_SIZE;
65 
66 
67 /**
68  * The default number of block buffers in the v0 block manager pool.
69  */
70 extern const size_t SFRAME_BLOCK_MANAGER_BLOCK_BUFFER_COUNT;
71 
72 /**
73  * If the post-compression size is less than this fraction of the
74  * pre-compression size, compression is disabled.
75  */
76 extern const float COMPRESSION_DISABLE_THRESHOLD; // NOTE(review): confirm the "less than" direction stated in the doc comment
77 
78 
79 /**
80  * The default size of each block in the file. This is not strict; the
81  * sarray_group_format_writer_v2 will try to target blocks to be of this size,
82  * but the actual sizes may vary.
83  */
84 extern size_t SFRAME_DEFAULT_BLOCK_SIZE;
85 
86 /**
87  * The initial number of elements in a block.
88  * This is used in sarray_group_format_writer_v2. This is the number of rows
89  * the writer will buffer at the start before issuing the first block write.
90  * After that, it will use the actual number of bytes written to try to
91  * estimate the number of rows to buffer before the next write (essentially
92  * SFRAME_DEFAULT_BLOCK_SIZE / (average bytes per element)).
93  */
94 extern const size_t SARRAY_WRITER_INITAL_ELEMENTS_PER_BLOCK; // NOTE: the identifier is spelled INITAL (sic)
95 /**
96  * The minimum number of elements per block. Used in
97  * sarray_group_format_writer_v2. It will never write fewer than this
98  * number of elements into a block.
99  */
100 extern const size_t SARRAY_WRITER_MIN_ELEMENTS_PER_BLOCK;
101 
102 /**
103  * The maximum number of elements per block. Used in
104  * sarray_group_format_writer_v2. It will never write more than this
105  * number of elements into a block.
106  */
107 extern size_t SFRAME_WRITER_MAX_BUFFERED_CELLS_PER_BLOCK;
108 
109 /**
110  * The maximum number of elements cached across all columns of the
111  * sarray_group writer. Once this is exceeded, flushes will happen even
112  * if the block size is still too small. This is maintained approximately.
113  * Essentially, this has the effect of setting
114  * SFRAME_WRITER_MAX_BUFFERED_CELLS_PER_BLOCK to
115  * SFRAME_WRITER_MAX_BUFFERED_CELLS / (#columns * #segments).
116  */
117 extern size_t SFRAME_WRITER_MAX_BUFFERED_CELLS;
118 
119 /**
120  * The maximum number of data blocks that can be maintained in a reader's
121  * decoded cache.
122  */
123 extern size_t SFRAME_MAX_BLOCKS_IN_CACHE;
124 
125 /**
126  * The amount to read from the file each time by the CSV parser (this block
127  * is then parsed in parallel by a collection of threads).
128  */
129 extern size_t SFRAME_CSV_PARSER_READ_SIZE;
130 
131 
132 
133 /**
134  * The number of elements to accumulate in a groupby batch before it has to flush.
135  */
136 extern size_t SFRAME_GROUPBY_BUFFER_NUM_ROWS;
137 
138 /**
139  * The number of elements per bucket when performing a shuffle operation.
140  */
141 extern size_t SFRAME_SHUFFLE_BUCKET_SIZE;
142 
143 /**
144  * The number of bytes that a join algorithm is allowed to use during execution.
145  */
146 extern size_t SFRAME_JOIN_BUFFER_NUM_CELLS; // NOTE(review): doc says bytes, name says cells -- confirm the unit
147 
148 /**
149  * Whether locks are used when reading from SFrames on local storage. Good
150  * for spinning disks, bad for SSDs.
151  */
152 extern size_t SFRAME_IO_READ_LOCK; // NOTE(review): declared size_t; presumably used as a boolean flag -- confirm
153 
154 
155 /**
156  * If SFRAME_IO_READ_LOCK is set, then the IO lock is only used when the
157  * file size is greater than this value.
158  */
159 extern const size_t SFRAME_IO_LOCK_FILE_SIZE_THRESHOLD;
160 
161 /**
162  * Number of samples used to estimate the pivot positions to partition the
163  * data for sorting.
164  */
165 extern size_t SFRAME_SORT_PIVOT_ESTIMATION_SAMPLE_SIZE;
166 
167 /**
168  * The maximum number of segments we will try to partition the input SFrame
169  * into for external sort. Number kept low initially to be sensitive to open
170  * file handle limits.
171  */
172 extern size_t SFRAME_SORT_MAX_SEGMENTS;
173 
174 /**
175  * The maximum number of segments an SFrame can have, after which compaction
176  * will be attempted.
177  */
178 extern size_t SFRAME_COMPACTION_THRESHOLD;
179 
180 /**
181  * If a segment contains less than this number of blocks, it is
182  * considered a small segment.
183  */
184 extern size_t FAST_COMPACT_BLOCKS_IN_SMALL_SEGMENT;
185 /// \}
186 } // namespace turi
187 #endif
size_t SFRAME_SORT_PIVOT_ESTIMATION_SAMPLE_SIZE
size_t SFRAME_SORT_MAX_SEGMENTS
size_t SFRAME_DEFAULT_BLOCK_SIZE
size_t SFRAME_JOIN_BUFFER_NUM_CELLS
const size_t SFRAME_BLOCK_MANAGER_BLOCK_BUFFER_COUNT
const size_t SARRAY_FROM_FILE_BATCH_SIZE
size_t SFRAME_FILE_HANDLE_POOL_SIZE
size_t SFRAME_MAX_BLOCKS_IN_CACHE
size_t SFRAME_WRITER_MAX_BUFFERED_CELLS
size_t SFRAME_GROUPBY_BUFFER_NUM_ROWS
size_t SFRAME_CSV_PARSER_READ_SIZE
const float COMPRESSION_DISABLE_THRESHOLD
const size_t SFRAME_IO_LOCK_FILE_SIZE_THRESHOLD
const size_t SARRAY_WRITER_MIN_ELEMENTS_PER_BLOCK
size_t SFRAME_DEFAULT_NUM_SEGMENTS
const size_t DEFAULT_SARRAY_READER_BUFFER_SIZE
const size_t MIN_SEGMENT_LENGTH
size_t SFRAME_WRITER_MAX_BUFFERED_CELLS_PER_BLOCK
const size_t SFRAME_WRITER_BUFFER_HARD_LIMIT
const size_t SARRAY_WRITER_INITAL_ELEMENTS_PER_BLOCK
size_t FAST_COMPACT_BLOCKS_IN_SMALL_SEGMENT
size_t SFRAME_SHUFFLE_BUCKET_SIZE
size_t SFRAME_IO_READ_LOCK
size_t SFRAME_COMPACTION_THRESHOLD