Turi Create  4.0
metadata_impl.hpp
1 /* Copyright © 2017 Apple Inc. All rights reserved.
2  *
3  * Use of this source code is governed by a BSD-3-clause license that can
4  * be found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause
5  */
6 #ifndef TURI_ML2_METADATA_IMPL_H_
7 #define TURI_ML2_METADATA_IMPL_H_
8 
9 #include <toolkits/ml_data_2/side_features.hpp>
10 
11 namespace turi { namespace v2 {
12 
13 ////////////////////////////////////////////////////////////////////////////////
14 // Implementations of the above
15 
16  /** Returns a pointer to the internal column metadata of column
17  * column_index. Useful for dealing with the column_index
18  */
19 inline ml_data_internal::column_metadata_ptr
20  ml_metadata::get_column_metadata(size_t column_index) const {
21 
22  DASSERT_LT(column_index, num_columns());
23 
24  return (side_features != nullptr
25  ? side_features->get_full_column_metadata()[column_index]
26  : columns[column_index]);
27 }
28 
29 /** Returns the index of the column matching column_name, or throws
30  * an error if it does not exist.
31  */
32 inline size_t ml_metadata::column_index(const std::string& _column_name) const {
33  auto it = _column_name_to_index_map.find(_column_name);
34 
35  if(UNLIKELY(it == _column_name_to_index_map.end()))
36  log_and_throw((std::string("Column ") + _column_name + " not found in model metadata.").c_str());
37 
38  DASSERT_TRUE(column_name(it->second) == _column_name);
39 
40  return it->second;
41 }
42 
43 /**
44  * Returns true if the metadata contains the given column.
45  *
46  * \param column_name The name of the column.
47  */
48 inline bool ml_metadata::contains_column(const std::string& column_name) const {
49  return (_column_name_to_index_map.find(column_name) != _column_name_to_index_map.end());
50 }
51 
52 
53 inline bool ml_metadata::has_target() const {
54  return (target != nullptr);
55 }
56 
57 ////////////////////////////////////////////////////////////////////////////////
58 // The indexers
59 
60 /** Returns the metadata for a particular column, even if that
61  * column is a side feature.
62  *
63  * \param column_index The index of the column.
64  */
65 inline const std::shared_ptr<ml_data_internal::column_indexer>&
66 ml_metadata::indexer(size_t column_index) const {
67  return get_column_metadata(column_index)->indexer;
68 }
69 
70 /** Returns the metadata for a particular column, even if that
71  * column is a side feature.
72  *
73  * \overload
74  *
75  * \param column_name The name of the column.
76  */
77 inline const std::shared_ptr<ml_data_internal::column_indexer>&
78 ml_metadata::indexer(const std::string& column_name) const {
79  return indexer(column_index(column_name));
80 }
81 
82 
83 ////////////////////////////////////////////////////////////////////////////////
84 // Statistics
85 
86 /** Returns the metadata for a particular column, even if that
87  * column is a side feature.
88  *
89  * \param column_index The index of the column.
90  */
91 inline const std::shared_ptr<ml_data_internal::column_statistics>&
92 ml_metadata::statistics(size_t column_index) const {
93  return get_column_metadata(column_index)->statistics;
94 }
95 
96 /** Returns the metadata for a particular column, even if that
97  * column is a side feature.
98  *
99  * \overload
100  *
101  * \param column_name The name of the column.
102  */
103 inline const std::shared_ptr<ml_data_internal::column_statistics>&
104 ml_metadata::statistics(const std::string& column_name) const {
105  return statistics(column_index(column_name));
106 }
107 
108 
109 /** Returns the number of columns present, including any possible
110  * side columns.
111  */
112 inline size_t ml_metadata::num_columns(bool include_side_columns_if_present,
113  bool include_untranslated_columns) const {
114  size_t nc = (include_side_columns_if_present && (side_features != nullptr)
115  ? side_features->get_full_column_metadata().size()
116  : columns.size());
117 
118  if(!include_untranslated_columns)
119  nc -= num_untranslated_columns();
120 
121  return nc;
122 }
123 
124 /** Returns the number of columns present, including any possible
125  * side columns.
126  */
127 inline size_t ml_metadata::num_untranslated_columns() const {
128  DASSERT_NE(_num_untranslated_columns, size_t(-1));
129  return _num_untranslated_columns;
130 }
131 
132 /** Returns true if there are translated columns present, and false
133  * otherwise.
134  */
135 inline bool ml_metadata::has_translated_columns() const {
136  return num_untranslated_columns() != columns.size();
137 }
138 
139 /** Returns true if there are untranslated columns present, and false
140  * otherwise.
141  */
142 inline bool ml_metadata::has_untranslated_columns() const {
143  return num_untranslated_columns() != 0;
144 }
145 
146 /** Returns the number of distinct dimensions, including all
147  * categorical features, side features, etc.
148  */
149 inline size_t ml_metadata::num_dimensions() const {
150  DASSERT_NE(_num_dimensions, size_t(-1));
151  return _num_dimensions;
152 }
153 
154 /** Returns the size of the columns in the metadata that were
155  * present at train time.
156  *
157  */
158 inline const std::string& ml_metadata::column_name(size_t column_index) const {
159  return get_column_metadata(column_index)->name;
160 }
161 
162 /** Returns the size of the columns in the metadata that were
163  * present at train time.
164  *
165  */
166 inline const std::string& ml_metadata::target_column_name() const {
167  DASSERT_TRUE(has_target());
168  return target->name;
169 }
170 
171 /** Returns the metadata for a particular column, even if that
172  * column is a side feature.
173  */
174 inline const std::shared_ptr<ml_data_internal::column_indexer>&
175 ml_metadata::target_indexer() const {
176  DASSERT_TRUE(has_target());
177  return target->indexer;
178 }
179 
180 /** Returns the metadata for a particular column, even if that
181  * column is a side feature.
182  */
183 inline const std::shared_ptr<ml_data_internal::column_statistics>&
184 ml_metadata::target_statistics() const {
185  DASSERT_TRUE(has_target());
186  return target->statistics;
187 }
188 
189 /** Returns the current index size of the columns in the metadata.
190  */
191 inline size_t ml_metadata::column_size(size_t column_index) const {
192  return get_column_metadata(column_index)->column_size();
193 }
194 
195 /** Returns the current index size of the columns in the metadata.
196  */
197 inline size_t ml_metadata::target_column_size() const {
198  DASSERT_TRUE(has_target());
199  return target->column_size();
200 }
201 
202 /** Returns the index size of the columns in the metadata that were
203  * present at train time. Index size differs from column size in
204  * that column_size may grow on test, but index_size is constant.
205  */
206 inline size_t ml_metadata::index_size(size_t column_index) const {
207  return get_column_metadata(column_index)->index_size();
208 }
209 
210 /** Returns the index size of the column in the metadata that were
211  * present at train time. Index size differs from column size in
212  * that column_size may grow on test, but index_size is constant.
213  *
214  * \overload
215  *
216  * \param column_name The name of the column.
217  */
218 inline size_t ml_metadata::index_size(const std::string& column_name) const {
219  return index_size(column_index(column_name));
220 }
221 
222 /** Returns the index size of the columns in the metadata that were
223  * present at train time. Index size differs from column size in
224  * that column_size may grow on test, but index_size is constant.
225  */
226 inline size_t ml_metadata::target_index_size() const {
227  return target->index_size();
228 }
229 
230 
231 /** Returns the global index offset of the columns in the metadata
232  * that were present at train time. This is fixed at setup time;
233  * global indices for the column c_idx are in the interval
234  * [global_index_offset(c_idx), global_index_offset(c_idx) + index_size(c_idx) - 1]
235  */
236 inline size_t ml_metadata::global_index_offset(size_t column_index) const {
237  return get_column_metadata(column_index)->global_index_offset();
238 }
239 
240 /** Returns the global index offset of the columns in the metadata
241  * that were present at train time. This is fixed at setup time;
242  * global indices for the column c_idx are in the interval
243  * [global_index_offset(c_idx), global_index_offset(c_idx) + index_size(c_idx) - 1]
244  *
245  * \overload
246  *
247  * \param column_name The name of the column.
248  */
249 inline size_t ml_metadata::global_index_offset(const std::string& column_name) const {
250  return global_index_offset(column_index(column_name));
251 }
252 
253 /** Returns the mode of the column. See ml_data_column_modes.hpp
254  * for details on the column modes.
255  *
256  * \param column_index The index of the column.
257  */
258 inline ml_column_mode ml_metadata::column_mode(size_t column_index) const {
259  return get_column_metadata(column_index)->mode;
260 }
261 
262 /** Returns the mode of the column. See ml_data_column_modes.hpp
263  * for details on the column modes.
264  *
265  * \overload
266  *
267  * \param column_name The name of the column.
268  */
269 inline ml_column_mode ml_metadata::column_mode(const std::string& column_name) const {
270  return column_mode(column_index(column_name));
271 }
272 
273 /** Returns the mode of the target column. See
274  * ml_data_column_modes.hpp for details on the column modes.
275  */
277  DASSERT_TRUE(has_target());
278  return target->mode;
279 }
280 
281 /** Returns the size of the columns in the metadata that were
282  * present at train time.
283  *
284  * \param column_index The index of the column.
285  */
286 inline flex_type_enum ml_metadata::column_type(size_t column_index) const {
287  return get_column_metadata(column_index)->original_column_type;
288 }
289 
290 /** Returns the size of the columns in the metadata that were
291  * present at train time.
292  *
293  * \overload
294  *
295  * \param column_name The name of the column.
296  */
297 inline flex_type_enum ml_metadata::column_type(const std::string& column_name) const {
298  return column_type(column_index(column_name));
299 }
300 
301 
302 /** Returns the mode of the target column. See
303  * ml_data_column_modes.hpp for details on the column modes.
304  */
306  DASSERT_TRUE(has_target());
307  return target->original_column_type;
308 }
309 
310 /** Returns true if the underlying type is treated as a categorical
311  * variable, and false otherwise.
312  *
313  * \param column_index The index of the column.
314  */
315 inline bool ml_metadata::is_categorical(size_t column_index) const {
316  ml_column_mode mode = get_column_metadata(column_index)->mode;
317  return mode_is_categorical(mode);
318 }
319 
320 /** Returns true if the underlying type is treated as a categorical
321  * variable, and false otherwise.
322  *
323  * \overload
324  *
325  * \param column_name The name of the column.
326  */
327 inline bool ml_metadata::is_categorical(const std::string& column_name) const {
328  return is_categorical(column_index(column_name));
329 }
330 
331 /** Returns true if the underlying column type is indexed, and false
332  * otherwise. This differs form the is_categorical in that
333  * dictionaries are not treated as pure categorical variables, as
334  * they have values associated with them, but they are indexed.
335  *
336  * \param column_index The index of the column.
337  */
338 inline bool ml_metadata::is_indexed(size_t column_index) const {
339  ml_column_mode mode = get_column_metadata(column_index)->mode;
340  return mode_is_indexed(mode);
341 }
342 
343 /** Returns true if the underlying column type is indexed, and false
344  * otherwise. This differs form the is_categorical in that
345  * dictionaries are not treated as pure categorical variables, as
346  * they have values associated with them, but they are indexed.
347  *
348  * \overload
349  *
350  * \param column_name The name of the column.
351  */
352 inline bool ml_metadata::is_indexed(const std::string& column_name) const {
353  return is_indexed(column_index(column_name));
354 }
355 
356 /** Returns true if the underlying column type is untranslated.
357  * This means it will only be available as flexible_type later on.
358  *
359  * \param column_index The index of the column.
360  */
361 inline bool ml_metadata::is_untranslated_column(size_t column_index) const {
362  return get_column_metadata(column_index)->is_untranslated_column();
363 }
364 
365 /** Returns true if the underlying column type is untranslated.
366  * This means it will only be available as flexible_type later on.
367  *
368  * \overload
369  *
370  * \param column_name The name of the column.
371  */
372 inline bool ml_metadata::is_untranslated_column(const std::string& column_name) const {
373  return is_untranslated_column(column_index(column_name));
374 }
375 
376 /** Returns true if the underlying column is a side column handled
377  * by an implicit join, and false otherwise.
378  *
379  * \param column_index The index of the column.
380  */
381 inline bool ml_metadata::is_side_column(size_t column_index) const {
382  DASSERT_LT(column_index, num_columns());
383  return column_index >= columns.size();
384 }
385 
386 
387 /** Returns true if the underlying column is a side column handled
388  * by an implicit join, and false otherwise.
389  *
390  * \overload
391  *
392  * \param column_name The name of the column.
393  */
394 inline bool ml_metadata::is_side_column(const std::string& column_name) const {
395  return is_side_column(column_index(column_name));
396 }
397 
398 
399 /** Returns true if the underlying type is treated as a categorical
400  * variable, and false otherwise.
401  */
402 inline bool ml_metadata::target_is_categorical() const {
403  DASSERT_TRUE(has_target());
404  return mode_is_categorical(target->mode);
405 }
406 
407 /** Returns true if the underlying type is indexed, and false
408  * otherwise. This differs form the is_categorical in that
409  * dictionaries are not treated as pure categorical variables, as
410  * they have values associated with them, but they are indexed.
411  */
412 inline bool ml_metadata::target_is_indexed() const {
413  DASSERT_TRUE(has_target());
414  return mode_is_indexed(target->mode);
415 }
416 
417 /** Returns true if there is side data and false otherwise.
418  */
419 inline bool ml_metadata::has_side_features() const {
420  return side_features != nullptr;
421 }
422 
423 /** Returns the side feature storage class.
424  */
425 std::shared_ptr<ml_data_side_features> ml_metadata::get_side_features() const {
426  DASSERT_TRUE(has_side_features());
427  return side_features;
428 }
429 
430 /** Returns the current options.
431  */
432 const std::map<std::string, flexible_type>& ml_metadata::get_current_options() const {
433  return options;
434 }
435 
436 }}
437 
438 #endif
size_t target_column_size() const
static GL_HOT_INLINE_FLATTEN bool mode_is_categorical(ml_column_mode mode)
bool has_translated_columns() const
bool is_categorical(size_t column_index) const
size_t column_size(size_t column_index) const
bool has_untranslated_columns() const
bool is_indexed(size_t column_index) const
const std::string & column_name(size_t column_index) const
const std::string & target_column_name() const
bool target_is_indexed() const
size_t num_untranslated_columns() const
const std::shared_ptr< ml_data_internal::column_indexer > & indexer(size_t column_index) const
ml_column_mode target_column_mode() const
const std::shared_ptr< ml_data_internal::column_statistics > & statistics(size_t column_index) const
size_t index_size(size_t column_index) const
ml_column_mode column_mode(size_t column_index) const
flex_type_enum target_column_type() const
size_t target_index_size() const
size_t column_index(const std::string &column_name, bool max_on_error=false) const
size_t num_dimensions() const
size_t num_columns(bool include_untranslated_columns=true) const
ml_data_internal::column_metadata_ptr get_column_metadata(size_t column_index) const
bool target_is_categorical() const
bool contains_column(const std::string &column_name) const
static GL_HOT_INLINE_FLATTEN bool mode_is_indexed(ml_column_mode mode)
bool has_target() const
bool is_untranslated_column(size_t column_index) const
#define DASSERT_TRUE(cond)
Definition: assertions.hpp:364
size_t global_index_offset(size_t column_index) const
flex_type_enum column_type(size_t column_index) const