7 #ifndef TURI_DML_DATA_COLUMN_METADATA_H_ 8 #define TURI_DML_DATA_COLUMN_METADATA_H_ 10 #include <ml/ml_data/column_indexer.hpp> 11 #include <ml/ml_data/column_statistics.hpp> 12 #include <ml/ml_data/ml_data_column_modes.hpp> 13 #include <core/storage/sframe_data/sarray.hpp> 20 namespace ml_data_internal {
// Name of this column/feature; it is the name quoted in the dataset-mismatch
// error messages. Defaults to the empty string.
37 std::string name =
"";
// Shared handle to this column's column_indexer; null by default.
40 std::shared_ptr<ml_data_internal::column_indexer> indexer =
nullptr;
// Shared handle to this column's column_statistics; null by default.
41 std::shared_ptr<ml_data_internal::column_statistics> statistics =
nullptr;
// Declaration only; the definition is not part of this view.
// Takes a flag saying whether this is the target column, the column name, and
// a map from string keys to ml_column_mode overrides (presumably keyed by
// column name -- TODO confirm against the definition).
// NOTE(review): the parameter list shown here may be incomplete in this
// listing; verify it against the definition before relying on it.
49 void setup(
bool is_target_column,
const std::string& name,
51 const std::map<std::string, ml_column_mode>& mode_overrides);
// Declaration only; the definition is not part of this view.
// NOTE(review): presumably records `previous_total` as this column's
// global_index_offset_at_train_time -- confirm in the implementation file.
56 void set_training_index_offset(
size_t previous_total);
66 bool has_fixed_size = (column_data_size_if_fixed != size_t(-1));
70 return has_fixed_size;
76 return (mode == ml_column_mode::UNTRANSLATED);
83 return index_size_at_train_time;
91 DASSERT_TRUE(global_index_offset_at_train_time !=
size_t(-1));
92 return global_index_offset_at_train_time;
// Grants turi::metadata_load access to this class's non-public members.
104 friend struct turi::metadata_load;
// Index size captured at training time; starts at the placeholder size_t(-1).
109 size_t index_size_at_train_time = size_t(-1);
// Fixed data size of the column. size_t(-1) is the sentinel for "no fixed
// size": the fixed-size query compares this field against size_t(-1).
110 size_t column_data_size_if_fixed = size_t(-1);
// Expected shape for n-dimensional array values; incoming shapes are compared
// against it dimension by dimension in the consistency check.
114 flex_nd_vec::index_range_type nd_array_size;
// Global index offset captured at training time. Starts at size_t(-1); the
// accessor debug-asserts that it has been changed from that placeholder.
116 size_t global_index_offset_at_train_time = size_t(-1);
// Returns the stored column_data_size_if_fixed value as-is, with no checks
// (so it may be the size_t(-1) placeholder).
// NOTE(review): the top-level `const` on a by-value return type has no effect
// for callers and could be dropped (clang-tidy: readability-const-return-type).
119 const size_t fixed_column_size()
const {
121 return column_data_size_if_fixed;
135 return indexer->indexed_column_size();
138 return column_data_size_if_fixed;
147 return nd_array_size;
// Convenience alias: shared pointer to a column_metadata instance.
160 typedef std::shared_ptr<column_metadata> column_metadata_ptr;
174 const std::vector<std::shared_ptr<column_metadata> >& _metadata_vect,
// Declaration only; the definition is not part of this view.
// NOTE(review): presumably refreshes the size-related fields of this object
// from the given ml_metadata -- confirm in the implementation file.
179 void set_index_sizes(
const std::shared_ptr<ml_metadata>& m);
// Whether a target column is present (presumably -- TODO confirm); defaults to false.
181 bool has_target =
false;
// Whether the target column is an indexed one (presumably -- TODO confirm);
// defaults to false.
182 bool target_is_indexed =
false;
// Whether every row has the same data size (presumably paired with
// constant_data_size -- TODO confirm); defaults to false.
186 bool data_size_is_constant =
false;
// Presumably the shared per-row data size when data_size_is_constant is true;
// defaults to 0.
191 size_t constant_data_size = 0;
// Presumably the number of non-target ("x") columns -- TODO confirm; defaults to 0.
199 size_t num_x_columns = 0;
// Presumably the total number of columns including any target -- TODO confirm;
// defaults to 0.
203 size_t total_num_columns = 0;
// Error helper for one-dimensional size mismatches. Captures the enclosing
// scope by reference and calls log_and_throw with a message that names the
// column, the offending size `nv`, and the expected size
// column_data_size_if_fixed.
// GL_GCC_ONLY(GL_COLD_NOINLINE) attaches the cold/no-inline annotation only
// where that macro expands to something (presumably GCC builds -- TODO confirm).
230 auto throw_error_1d = [&](
size_t nv) GL_GCC_ONLY(GL_COLD_NOINLINE) {
231 log_and_throw(std::string(
"Dataset mismatch. Numeric feature '") + name +
232 "' must contain lists of consistent size. (Found " 233 "lists/arrays of sizes " +
234 std::to_string(nv) +
" and " +
235 std::to_string(column_data_size_if_fixed) +
").");
// Error helper for shape mismatches on n-dimensional values. Takes the
// offending `shape` and reports it alongside the expected shape.
238 auto throw_error_nd =
239 [&](
const flex_nd_vec::index_range_type& shape)
240 GL_GCC_ONLY(GL_COLD_NOINLINE) {
// When the observed shape is 1-D and the expected shape has at most one
// dimension, defer to the simpler 1-D message.
242 if (shape.size() == 1 && nd_array_size.size() <= 1) {
243 throw_error_1d(shape[0]);
// Otherwise the message is assembled piecewise in a string stream.
245 std::ostringstream error;
247 error <<
"Dataset mismatch. Numeric feature '" << name
248 <<
"' must contain lists of consistent size. (Found " 249 "lists/arrays of sizes ";
// No expected n-d shape recorded: print the expected flat size as a
// one-element tuple, e.g. "(5,)".
251 if (nd_array_size.empty()) {
252 error <<
"(" << column_data_size_if_fixed <<
",)";
// Walks the dimensions of the expected shape (loop body not visible in this view).
255 for (
const auto& d : nd_array_size) {
// Walks the dimensions of the observed shape (loop body not visible in this view).
262 for (
const auto& d : shape) {
// Validation for NUMERIC_VECTOR columns: values are checked against
// column_data_size_if_fixed. (`v` is declared on lines not visible in this view.)
270 if (mode == ml_column_mode::NUMERIC_VECTOR) {
// Debug-only sanity check: the recorded shape has at most one dimension here.
272 DASSERT_LE(nd_array_size.size(), 1);
// Flat value: compare its length with the expected fixed size.
276 size_t nv = v.size();
280 if (UNLIKELY(nv != column_data_size_if_fixed)) {
// Value carrying a shape: it must be exactly one-dimensional with the
// expected length, otherwise the n-d error helper is invoked.
285 const auto& shape = v.
shape();
287 if(UNLIKELY(shape.size() != 1 || shape[0] != column_data_size_if_fixed)) {
288 throw_error_nd(shape);
294 }
else if (mode == ml_column_mode::NUMERIC_ND_VECTOR) {
// Validation for NUMERIC_ND_VECTOR columns: values are checked against the
// recorded nd_array_size shape.
// A flat value is reported as a one-element shape {v.size()} when the recorded
// shape is not one-dimensional.
300 if (nd_array_size.size() != 1) {
301 throw_error_nd({v.size()});
// Flat value against a 1-D recorded shape: compare the lengths.
303 size_t nv = v.size();
307 if (UNLIKELY(nv != column_data_size_if_fixed)) {
// Value carrying a shape: first the number of dimensions must agree...
313 const auto& shape = v.
shape();
315 if(UNLIKELY(shape.size() != nd_array_size.size())) {
316 throw_error_nd(shape);
// ...then every individual dimension must match the recorded one.
319 for(
size_t i = 0; i < shape.size(); ++i) {
320 if (UNLIKELY(shape[i] != nd_array_size[i])) {
321 throw_error_nd(shape);
337 arc, std::shared_ptr<turi::ml_data_internal::column_metadata>, m) {
345 END_OUT_OF_PLACE_SAVE()
348 arc, std::shared_ptr<turi::ml_data_internal::column_metadata>, m) {
350 arc >> is_not_nullptr;
351 if (is_not_nullptr) {
355 m = std::shared_ptr<turi::ml_data_internal::column_metadata>(
nullptr);
358 END_OUT_OF_PLACE_LOAD()
static GL_HOT_INLINE_FLATTEN bool mode_has_fixed_size(ml_column_mode mode)
#define BEGIN_OUT_OF_PLACE_LOAD(arc, tname, tval)
Macro to make it easy to define out-of-place loads.
std::vector< double > flex_vec
The serialization input archive object which, provided with a reference to an istream, will read from the istream, providing deserialization capabilities.
const index_range_type & shape() const
flex_type_enum get_type() const
#define DASSERT_FALSE(cond)
#define GL_HOT_INLINE_FLATTEN
#define ASSERT_TRUE(cond)
The serialization output archive object which, provided with a reference to an ostream, will write to the ostream, providing serialization capabilities.
static GL_HOT_INLINE_FLATTEN bool mode_is_indexed(ml_column_mode mode)
#define DASSERT_TRUE(cond)
#define BEGIN_OUT_OF_PLACE_SAVE(arc, tname, tval)
Macro to make it easy to define out-of-place saves.