Turi Create  4.0
sframe_index_mapping.hpp
1 /* Copyright © 2017 Apple Inc. All rights reserved.
2  *
3  * Use of this source code is governed by a BSD-3-clause license that can
4  * be found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause
5  */
6 #ifndef TURI_ML2DATA_SFRAME_TRANSLATION_H_
7 #define TURI_ML2DATA_SFRAME_TRANSLATION_H_
8 
9 #include <vector>
10 #include <memory>
11 #include <core/storage/sframe_data/sframe.hpp>
12 #include <toolkits/ml_data_2/metadata.hpp>
13 #include <toolkits/ml_data_2/indexing/column_indexer.hpp>
14 
15 ////////////////////////////////////////////////////////////////////////////////
16 //
17 // Translation helper functions
18 //
19 ////////////////////////////////////////////////////////////////////////////////
20 
21 namespace turi { namespace v2 {
22 
23  /**
24  * Translates an external SArray (a single column) into the
25  * corresponding indexed SArray representation, as dictated by the
26  * indexing in the given column indexer. This is the single-column
27  * analogue of map_to_indexed_sframe.
28  *
29  * If allow_new_categorical_values is false, then the existing
30  * indexing is not changed. New categorical values are mapped to
31  * size_t(-1) with a warning.
32  *
33  * Categorical: If the column is categorical, each unique value is mapped
34  * to a unique index in the range 0, ..., n-1, where n is the number of
35  * unique values.
36  *
37  * Numeric: The column type is checked to be INT/FLOAT, then
38  * returned as-is.
39  *
40  * Numeric Vector: If the dictated column type is VECTOR, it is
41  * checked to make sure it is numeric and of homogeneous size.
42  *
43  * Categorical Vector: Documented identically to Numeric Vector (numeric,
44  * homogeneous size). NOTE(review): looks copy-pasted -- confirm.
45  *
46  * Dictionary : If the dictated column type is DICT, it is checked to make
47  * sure the values are numeric. The keys are then translated to 0..n-1
48  * where n is the number of unique keys.
49  *
50  * \param[in,out] indexer The column indexer used for the mapping.
51  * \param[in] src The SArray to map to indices.
52  * \param[in] allow_new_categorical_values Whether to allow new categories.
53  *
54  * \returns Indexed SArray.
55  */
56 std::shared_ptr<sarray<flexible_type> > map_to_indexed_sarray(
57  const std::shared_ptr<ml_data_internal::column_indexer>& indexer,
58  const std::shared_ptr<sarray<flexible_type> >& src,
59  bool allow_new_categorical_values = true);
60 
61  /**
62  * Translates an external SFrame into the corresponding indexed
63  * SFrame representation, as dictated by the indexing in the given
64  * column indexers. Only the columns covered by these indexers are
65  * used, and all of these must be present.
66  *
67  * If allow_new_categorical_values is false, then the indexers are
68  * not changed. New categorical values are mapped to size_t(-1)
69  * with a warning.
70  *
71  * Categorical: If a column is categorical, each unique value is mapped to
72  * a unique index in the range 0, ..., n-1, where n is the number of unique
73  * values.
74  *
75  * Numeric: The column type is checked to be INT/FLOAT, then
76  * returned as-is.
77  *
78  * Numeric Vector: If the dictated column type is VECTOR, it is
79  * checked to make sure it is numeric and of homogeneous size.
80  *
81  * Categorical Vector: Documented identically to Numeric Vector (numeric,
82  * homogeneous size). NOTE(review): looks copy-pasted -- confirm.
83  *
84  * Dictionary : If the dictated column type is DICT, it is checked to make
85  * sure the values are numeric. The keys are then translated to 0..n-1
86  * where n is the number of unique keys.
87  *
88  * \param[in,out] indexer The column indexers used for the mapping.
89  * \param[in] unindexed_x The SFrame to map to indices.
90  * \param[in] allow_new_categorical_values Whether to allow new categories.
91  *
92  * \returns Indexed SFrame.
93  */
94 sframe map_to_indexed_sframe(
95  const std::vector<std::shared_ptr<ml_data_internal::column_indexer> >& indexer,
96  sframe unindexed_x,
97  bool allow_new_categorical_values = true);
98 
99  /**
100  * Translates an external SFrame into the corresponding indexed
101  * SFrame representation, as dictated by the indexing in
102  * metadata. Only the columns specified in metadata are
103  * used, and all of these must be present.
104  *
105  * If allow_new_categorical_values is false, then the metadata is
106  * not changed. New categorical values are mapped to size_t(-1)
107  * with a warning.
108  *
109  * Categorical: If a column is categorical, each unique value is mapped to
110  * a unique index in the range 0, ..., n-1, where n is the number of unique
111  * values.
112  *
113  * Numeric: The column type is checked to be INT/FLOAT, then
114  * returned as-is.
115  *
116  * Numeric Vector: If the dictated column type is VECTOR, it is
117  * checked to make sure it is numeric and of homogeneous size.
118  *
119  * Categorical Vector: Documented identically to Numeric Vector (numeric,
120  * homogeneous size). NOTE(review): looks copy-pasted -- confirm.
121  *
122  * Dictionary : If the dictated column type is DICT, it is checked to make
123  * sure the values are numeric. The keys are then translated to 0..n-1
124  * where n is the number of unique keys.
125  *
126  * \overload
127  *
128  * \param[in,out] metadata The metadata used for the mapping.
129  * \param[in] unindexed_x The SFrame to map to indices.
130  * \param[in] allow_new_categorical_values Whether to allow new categories.
131  *
132  * \returns Indexed SFrame.
133  */
134 sframe map_to_indexed_sframe(
135  const std::shared_ptr<ml_metadata>& metadata,
136  sframe unindexed_x,
137  bool allow_new_categorical_values = true);
138 
139  /**
140  * Translates an indexed SArray into the original non-indexed
141  * representation, as dictated by the indexing in the column indexer.
142  *
143  * \param[in,out] indexer The column indexer used for the mapping.
144  * \param[in] indexed_x The indexed SArray to map to external values.
145  *
146  * \returns SArray in the original, non-indexed format.
147  */
148 std::shared_ptr<sarray<flexible_type> > map_from_indexed_sarray(
149  const std::shared_ptr<ml_data_internal::column_indexer>& indexer,
150  const std::shared_ptr<sarray<flexible_type> >& indexed_x);
151 
152 
153  /**
154  * Translates an indexed SFrame into the original non-indexed
155  * representation, as dictated by the indexing in the column indexers.
156  *
157  * \param[in,out] indexer The column indexers used for the mapping.
158  * \param[in] indexed_x The indexed SFrame to map to external values.
159  *
160  * \returns SFrame in the original, non-indexed format.
161  */
162 sframe map_from_indexed_sframe(
163  const std::vector<std::shared_ptr<ml_data_internal::column_indexer> >& indexer,
164  sframe indexed_x);
165 
166  /**
167  * Translates an indexed SFrame into the original non-indexed
168  * representation, as dictated by the indexing in metadata.
169  *
170  * \param[in,out] metadata The metadata used for the mapping.
171  * \param[in] indexed_x The indexed SFrame to map to external values.
172  *
173  * \returns SFrame in the original, non-indexed format.
174  */
175 sframe map_from_indexed_sframe(
176  const std::shared_ptr<ml_metadata>& metadata, sframe indexed_x);
177 
178  /** Translates an indexed SFrame into the original non-indexed
179  * representation, as dictated by the column indexers. In this case the
180  * indexers are supplied as a map from column name to column indexer.
181  *
182  * NOTE(review): this header does not include <map> or <string> directly.
183  * \param indexer Map from column name to the indexer for that column.
184  * \param indexed_x The indexed SFrame to map to external values.
185  * \returns SFrame in the original, non-indexed format.
186  */
187 sframe map_from_custom_indexed_sframe(
188  const std::map<std::string, std::shared_ptr<ml_data_internal::column_indexer> >& indexer,
189  sframe indexed_x);
190 
191 
192 }}
193 
194 #endif /* TURI_ML2DATA_SFRAME_TRANSLATION_H_ */