Turi Create  4.0
metadata_impl.hpp
1 /* Copyright © 2017 Apple Inc. All rights reserved.
2  *
3  * Use of this source code is governed by a BSD-3-clause license that can
4  * be found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause
5  */
6 #ifndef TURI_DML_METADATA_IMPL_H_
7 #define TURI_DML_METADATA_IMPL_H_
8 
9 namespace turi {
10 
11 ////////////////////////////////////////////////////////////////////////////////
12 // Implementations of the above
13 
14  /** Returns a pointer to the internal column metadata of column
15  * column_index. Useful for dealing with the column_index
16  */
17 inline ml_data_internal::column_metadata_ptr
18  ml_metadata::get_column_metadata(size_t column_index) const {
19 
20  DASSERT_LT(column_index, num_columns());
21  return columns[column_index];
22 }
23 
24 /** Returns the index of the column matching column_name, or throws
25  * an error if it does not exist.
26  */
27 inline size_t ml_metadata::column_index(const std::string& _column_name, bool max_on_error) const {
28  auto it = _column_name_to_index_map.find(_column_name);
29 
30  bool is_present = (it != _column_name_to_index_map.end());
31 
32  if(max_on_error) {
33  return LIKELY(is_present) ? it->second : size_t(-1);
34  } else {
35  if(UNLIKELY(!is_present)) {
36  log_and_throw((std::string("Column ") + _column_name + " not found in model metadata.").c_str());
37  }
38  DASSERT_TRUE(column_name(it->second) == _column_name);
39  return it->second;
40  }
41 }
42 
43 /**
44  * Returns true if the metadata contains the given column.
45  *
46  * \param column_name The name of the column.
47  */
48 inline bool ml_metadata::contains_column(const std::string& column_name) const {
49  return (_column_name_to_index_map.find(column_name) != _column_name_to_index_map.end());
50 }
51 
52 
53 inline bool ml_metadata::has_target() const {
54  return (target != nullptr);
55 }
56 
57 ////////////////////////////////////////////////////////////////////////////////
58 // The indexers
59 
60 inline const std::shared_ptr<ml_data_internal::column_indexer>&
62  return get_column_metadata(column_index)->indexer;
63 }
64 
65 inline const std::shared_ptr<ml_data_internal::column_indexer>&
66 ml_metadata::indexer(const std::string& column_name) const {
67  return indexer(column_index(column_name));
68 }
69 
70 
71 ////////////////////////////////////////////////////////////////////////////////
72 // Statistics
73 
74 inline const std::shared_ptr<ml_data_internal::column_statistics>&
76  return get_column_metadata(column_index)->statistics;
77 }
78 
79 inline const std::shared_ptr<ml_data_internal::column_statistics>&
80 ml_metadata::statistics(const std::string& column_name) const {
81  return statistics(column_index(column_name));
82 }
83 
84 
85 inline size_t ml_metadata::num_columns(bool include_untranslated_columns) const {
86  size_t nc = columns.size();
87 
88  if(!include_untranslated_columns)
90 
91  return nc;
92 }
93 
95  DASSERT_NE(_num_untranslated_columns, size_t(-1));
96  return _num_untranslated_columns;
97 }
98 
99 /** Returns true if there are translated columns present, and false
100  * otherwise.
101  */
103  return num_untranslated_columns() != columns.size();
104 }
105 
106 /** Returns true if there are untranslated columns present, and false
107  * otherwise.
108  */
110  return num_untranslated_columns() != 0;
111 }
112 
113 /** Returns the number of distinct dimensions, including all
114  * categorical features.
115  */
116 inline size_t ml_metadata::num_dimensions() const {
117  DASSERT_NE(_num_dimensions, size_t(-1));
118  return _num_dimensions;
119 }
120 
121 /** Returns the size of the columns in the metadata that were
122  * present at train time.
123  *
124  */
125 inline const std::string& ml_metadata::column_name(size_t column_index) const {
126  return get_column_metadata(column_index)->name;
127 }
128 
129 /** Returns the size of the columns in the metadata that were
130  * present at train time.
131  *
132  */
133 inline const std::string& ml_metadata::target_column_name() const {
135  return target->name;
136 }
137 
138 inline const std::shared_ptr<ml_data_internal::column_indexer>&
139 ml_metadata::target_indexer() const {
141  return target->indexer;
142 }
143 
144 inline const std::shared_ptr<ml_data_internal::column_statistics>&
145 ml_metadata::target_statistics() const {
147  return target->statistics;
148 }
149 
150 /** Returns the current index size of the columns in the metadata.
151  */
152 inline size_t ml_metadata::column_size(size_t column_index) const {
153  return get_column_metadata(column_index)->column_size();
154 }
155 
156 /** Returns the current nd column shape of the columns
157  */
158 inline const flex_nd_vec::index_range_type& ml_metadata::nd_column_shape(size_t column_index) const {
159  return get_column_metadata(column_index)->nd_column_shape();
160 }
161 
162 /** Returns the current nd column shape of the columns
163  */
164 inline const flex_nd_vec::index_range_type& ml_metadata::nd_column_shape(const std::string& column_name) const {
165  return nd_column_shape(column_index(column_name));
166 }
167 
168 
169 /** Returns the current index size of the columns in the metadata.
170  */
171 inline size_t ml_metadata::target_column_size() const {
173  return target->column_size();
174 }
175 
176 /** Returns the index size of the columns in the metadata that were
177  * present at train time. Index size differs from column size in
178  * that column_size may grow on test, but index_size is constant.
179  */
180 inline size_t ml_metadata::index_size(size_t column_index) const {
181  return get_column_metadata(column_index)->index_size();
182 }
183 
184 /** Returns the index size of the column in the metadata that were
185  * present at train time. Index size differs from column size in
186  * that column_size may grow on test, but index_size is constant.
187  *
188  * \overload
189  *
190  * \param column_name The name of the column.
191  */
192 inline size_t ml_metadata::index_size(const std::string& column_name) const {
193  return index_size(column_index(column_name));
194 }
195 
196 /** Returns the index size of the columns in the metadata that were
197  * present at train time. Index size differs from column size in
198  * that column_size may grow on test, but index_size is constant.
199  */
200 inline size_t ml_metadata::target_index_size() const {
201  return target->index_size();
202 }
203 
204 
205 /** Returns the global index offset of the columns in the metadata
206  * that were present at train time. This is fixed at setup time;
207  * global indices for the column c_idx are in the interval
208  * [global_index_offset(c_idx), global_index_offset(c_idx) + index_size(c_idx) - 1]
209  */
210 inline size_t ml_metadata::global_index_offset(size_t column_index) const {
211  return get_column_metadata(column_index)->global_index_offset();
212 }
213 
214 /** Returns the global index offset of the columns in the metadata
215  * that were present at train time. This is fixed at setup time;
216  * global indices for the column c_idx are in the interval
217  * [global_index_offset(c_idx), global_index_offset(c_idx) + index_size(c_idx) - 1]
218  *
219  * \overload
220  *
221  * \param column_name The name of the column.
222  */
223 inline size_t ml_metadata::global_index_offset(const std::string& column_name) const {
224  return global_index_offset(column_index(column_name));
225 }
226 
227 /** Returns the mode of the column. See ml_data_column_modes.hpp
228  * for details on the column modes.
229  *
230  * \param column_index The index of the column.
231  */
233  return get_column_metadata(column_index)->mode;
234 }
235 
236 /** Returns the mode of the column. See ml_data_column_modes.hpp
237  * for details on the column modes.
238  *
239  * \overload
240  *
241  * \param column_name The name of the column.
242  */
243 inline ml_column_mode ml_metadata::column_mode(const std::string& column_name) const {
244  return column_mode(column_index(column_name));
245 }
246 
247 /** Returns the mode of the target column. See
248  * ml_data_column_modes.hpp for details on the column modes.
249  */
252  return target->mode;
253 }
254 
255 /** Returns the size of the columns in the metadata that were
256  * present at train time.
257  *
258  * \param column_index The index of the column.
259  */
261  return get_column_metadata(column_index)->original_column_type;
262 }
263 
264 /** Returns the size of the columns in the metadata that were
265  * present at train time.
266  *
267  * \overload
268  *
269  * \param column_name The name of the column.
270  */
271 inline flex_type_enum ml_metadata::column_type(const std::string& column_name) const {
272  return column_type(column_index(column_name));
273 }
274 
275 
276 /** Returns the type of the target column.
277  */
280  return target->original_column_type;
281 }
282 
283 /** Returns true if the underlying type is treated as a categorical
284  * variable, and false otherwise.
285  *
286  * \param column_index The index of the column.
287  */
288 inline bool ml_metadata::is_categorical(size_t column_index) const {
289  ml_column_mode mode = get_column_metadata(column_index)->mode;
290  return mode_is_categorical(mode);
291 }
292 
293 /** Returns true if the underlying type is treated as a categorical
294  * variable, and false otherwise.
295  *
296  * \overload
297  *
298  * \param column_name The name of the column.
299  */
300 inline bool ml_metadata::is_categorical(const std::string& column_name) const {
301  return is_categorical(column_index(column_name));
302 }
303 
304 /** Returns true if the underlying column type is indexed, and false
305  * otherwise. This differs form the is_categorical in that
306  * dictionaries are not treated as pure categorical variables, as
307  * they have values associated with them, but they are indexed.
308  *
309  * \param column_index The index of the column.
310  */
311 inline bool ml_metadata::is_indexed(size_t column_index) const {
312  ml_column_mode mode = get_column_metadata(column_index)->mode;
313  return mode_is_indexed(mode);
314 }
315 
316 /** Returns true if the underlying column type is indexed, and false
317  * otherwise. This differs form the is_categorical in that
318  * dictionaries are not treated as pure categorical variables, as
319  * they have values associated with them, but they are indexed.
320  *
321  * \overload
322  *
323  * \param column_name The name of the column.
324  */
325 inline bool ml_metadata::is_indexed(const std::string& column_name) const {
326  return is_indexed(column_index(column_name));
327 }
328 
329 /** Returns true if the underlying column type is untranslated.
330  * This means it will only be available as flexible_type later on.
331  *
332  * \param column_index The index of the column.
333  */
335  return get_column_metadata(column_index)->is_untranslated_column();
336 }
337 
338 /** Returns true if the underlying column type is untranslated.
339  * This means it will only be available as flexible_type later on.
340  *
341  * \overload
342  *
343  * \param column_name The name of the column.
344  */
345 inline bool ml_metadata::is_untranslated_column(const std::string& column_name) const {
346  return is_untranslated_column(column_index(column_name));
347 }
348 
349 
350 /** Returns true if the underlying type is treated as a categorical
351  * variable, and false otherwise.
352  */
355  return mode_is_categorical(target->mode);
356 }
357 
358 /** Returns true if the underlying type is indexed, and false
359  * otherwise. This differs form the is_categorical in that
360  * dictionaries are not treated as pure categorical variables, as
361  * they have values associated with them, but they are indexed.
362  */
363 inline bool ml_metadata::target_is_indexed() const {
365  return mode_is_indexed(target->mode);
366 }
367 
368 }
369 
370 #endif
const flex_nd_vec::index_range_type & nd_column_shape(size_t column_index) const
size_t target_column_size() const
static GL_HOT_INLINE_FLATTEN bool mode_is_categorical(ml_column_mode mode)
bool has_translated_columns() const
bool is_categorical(size_t column_index) const
size_t column_size(size_t column_index) const
bool has_untranslated_columns() const
bool is_indexed(size_t column_index) const
const std::string & column_name(size_t column_index) const
const std::string & target_column_name() const
bool target_is_indexed() const
size_t num_untranslated_columns() const
const std::shared_ptr< ml_data_internal::column_indexer > & indexer(size_t column_index) const
ml_column_mode target_column_mode() const
const std::shared_ptr< ml_data_internal::column_statistics > & statistics(size_t column_index) const
size_t index_size(size_t column_index) const
ml_column_mode column_mode(size_t column_index) const
flex_type_enum target_column_type() const
size_t target_index_size() const
size_t column_index(const std::string &column_name, bool max_on_error=false) const
size_t num_dimensions() const
size_t num_columns(bool include_untranslated_columns=true) const
ml_data_internal::column_metadata_ptr get_column_metadata(size_t column_index) const
bool target_is_categorical() const
bool contains_column(const std::string &column_name) const
static GL_HOT_INLINE_FLATTEN bool mode_is_indexed(ml_column_mode mode)
bool has_target() const
bool is_untranslated_column(size_t column_index) const
#define DASSERT_TRUE(cond)
Definition: assertions.hpp:364
size_t global_index_offset(size_t column_index) const
flex_type_enum column_type(size_t column_index) const