Turi Create  4.0
factors_to_sframe.hpp
1 /* Copyright © 2017 Apple Inc. All rights reserved.
2  *
3  * Use of this source code is governed by a BSD-3-clause license that can
4  * be found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause
5  */
6 #ifndef TURI_FACTORIZATION_FACTORS_TO_SFRAME_H_
7 #define TURI_FACTORIZATION_FACTORS_TO_SFRAME_H_
8 
9 #include <toolkits/ml_data_2/ml_data.hpp>
10 #include <core/storage/sframe_interface/unity_sframe.hpp>
11 #include <core/storage/sframe_data/sframe.hpp>
12 #include <string>
13 
14 namespace turi { namespace factorization {
15 
16 /** Fills a unity_sframe object with data from the features in the
17  * model.
18  */
19 template <typename VectorType, typename EigenMatrixType>
20 sframe fill_linear_model_sframe_from_eigen_data(
21  const std::shared_ptr<v2::ml_metadata>& metadata,
22 
23  size_t c_idx,
24 
25  size_t n,
26 
27  bool include_w_terms,
28  size_t w_idx_offset,
29  const std::string& w_name,
30  const VectorType& w,
31 
32  bool include_V_terms,
33  size_t V_idx_offset,
34  const std::string& V_name,
35  const EigenMatrixType& V) {
36 
37  bool is_categorical;
38  size_t n_rows;
39 
40  switch(metadata->column_mode(c_idx)) {
41  case turi::v2::ml_column_mode::CATEGORICAL:
42  case turi::v2::ml_column_mode::CATEGORICAL_VECTOR:
43  case turi::v2::ml_column_mode::DICTIONARY:
44  is_categorical = true;
45  n_rows = n;
46  break;
47 
48  case turi::v2::ml_column_mode::NUMERIC:
49  is_categorical = false;
50  n_rows = 1;
51  break;
52 
53  case turi::v2::ml_column_mode::NUMERIC_VECTOR:
54  is_categorical = false;
55  n_rows = metadata->column_size(c_idx);
56  break;
57  default:
58  ASSERT_FALSE(true);
59  };
60 
61  std::vector<std::string> names;
62  std::vector<flex_type_enum> types;
63 
64  names.push_back(metadata->column_name(c_idx));
65 
66  // Decide on the type present; promote to string if there is an issue.
67  {
68  std::set<flex_type_enum> value_types_present = metadata->indexer(c_idx)->extract_key_types();
69 
70  // If undefined is in there, it is typically present with
71  // other values.
72  if(value_types_present.find(flex_type_enum::UNDEFINED) != value_types_present.end())
73  value_types_present.erase(flex_type_enum::UNDEFINED);
74 
75  // If no data is present, then use undefined.
76  if(value_types_present.size() == 0)
77  value_types_present.insert(flex_type_enum::UNDEFINED);
78 
79  flex_type_enum out_type;
80 
81  if(value_types_present.size() == 1) {
82  out_type = *value_types_present.begin();
83  } else {
84  logprogress_stream << "WARNING: Differing categorical key types present in list or "
85  << "dictionary on column " << metadata->column_name(c_idx)
86  << "; promoting all to string type." << std::endl;
87  out_type = flex_type_enum::STRING;
88  }
89 
90  types.push_back(out_type);
91  }
92 
93  size_t w_col_idx = 0, V_col_idx = 0;
94 
95  if(include_w_terms) {
96  DASSERT_LE(w_idx_offset + n, w.size());
97 
98  w_col_idx = names.size();
99  names.push_back(w_name);
100  types.push_back(flex_type_enum::FLOAT);
101  }
102 
103  if(include_V_terms) {
104  DASSERT_LE(V_idx_offset + n, V.rows());
105 
106  V_col_idx = names.size();
107  names.push_back(V_name);
108  types.push_back(flex_type_enum::VECTOR);
109  }
110 
111  size_t num_columns = names.size();
112 
113  size_t num_segments = thread::cpu_count();
114 
115  sframe out;
116 
117  out.open_for_write(names, types, "", num_segments);
118 
119  size_t num_factors = V.cols();
120 
121  in_parallel([&](size_t thread_idx, size_t num_threads) {
122 
123  size_t start_idx = (thread_idx * n_rows) / num_threads;
124  size_t end_idx = ( (thread_idx + 1) * n_rows) / num_threads;
125 
126  auto it_out = out.get_output_iterator(thread_idx);
127 
128  std::vector<flexible_type> x(num_columns);
129 
130  flex_vec factors(num_factors);
131 
132  for(size_t i = start_idx; i < end_idx; ++i, ++it_out) {
133  if(is_categorical) {
134  x[0] = metadata->indexer(c_idx)->map_index_to_value(i);
135  } else {
136  x[0] = i;
137  }
138 
139  if(include_w_terms)
140  x[w_col_idx] = w[i + w_idx_offset];
141 
142  if(include_V_terms) {
143 
144  for(size_t j = 0; j < num_factors; ++j)
145  factors[j] = V(i + V_idx_offset, j);
146 
147  x[V_col_idx] = factors;
148  }
149 
150  *it_out = x;
151  }
152  });
153 
154  out.close();
155 
156  return out;
157 }
158 
159 }}
160 
161 
162 #endif /* TURI_FACTORIZATION_FACTORS_TO_SFRAME_H_ */
#define ASSERT_FALSE(cond)
Definition: assertions.hpp:310
std::vector< double > flex_vec
static size_t cpu_count()
#define logprogress_stream
Definition: logger.hpp:325
void in_parallel(const std::function< void(size_t thread_id, size_t num_threads)> &fn)
Definition: lambda_omp.hpp:35