Turi Create  4.0
fp_growth.hpp
1 /* Copyright © 2017 Apple Inc. All rights reserved.
2  *
3  * Use of this source code is governed by a BSD-3-clause license that can
4  * be found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause
5  */
6 #ifndef TURI_FP_GROWTH_H
7 #define TURI_FP_GROWTH_H
8 
9 #include <vector>
10 #include <map>
11 #include <stack>
12 #include <string>
13 #include <algorithm>
14 #include <core/export.hpp>
15 #include <core/util/dense_bitset.hpp>
16 
17 #include <core/data/sframe/gl_sframe.hpp>
18 #include <core/data/sframe/gl_sarray.hpp>
19 #include <model_server/lib/toolkit_class_macros.hpp>
20 #include <model_server/lib/extensions/ml_model.hpp>
21 
22 #include <toolkits/pattern_mining/fp_tree.hpp>
23 #include <toolkits/pattern_mining/fp_results_tree.hpp>
24 #include <toolkits/feature_engineering/topk_indexer.hpp>
25 
26 namespace turi {
27 namespace pattern_mining {
28 
29 
30 /**
31  * fp_growth model interface.
32  * ---------------------------------------
33  * TODO: Add some comments hare about what the base class is supposed to be.
34  *
35  */
36 class EXPORT fp_growth : public ml_model_base {
37 
38  static constexpr size_t FP_GROWTH_VERSION = 0;
39 
40  gl_sframe closed_itemset; /* Closed itemsets as SFrame. */
41  fp_top_k_results_tree closed_itemset_tree; /* Closed itemsets as trees. */
42  std::vector<dense_bitset> closed_bitsets; /* Closed itemsets as bitset. */
43 
44  std::vector<std::string> features; /* Features that make a session id. */
45  std::string item; /* Set the item column name. */
46  std::shared_ptr<topk_indexer> indexer; /* Indexer for the event map. */
47 
48  public:
49 
50  virtual inline ~fp_growth() {}
51 
52  /**
53  * Validate the input data.
54  * \param[in] data gl_sframe of data in stacked format.
55  * \param[in] item Name of the item column.
56  * \param[in] features Names of the feature columns.
57  */
58  void validate(const gl_sframe& data,
59  const std::string& item,
60  const std::vector<std::string>& features) const;
61  /**
62  * Set the feature column names.
63  *
64  * \param[in] _features Names of the "features" column.
65  */
66  void set_features(const std::vector<std::string>& _features) {
67  this->features.resize(_features.size());
68  for (size_t i = 0; i < _features.size(); ++i) {
69  this->features[i] = _features[i];
70  }
71  add_or_update_state({{"features", to_variant(this->features)}});
72  add_or_update_state({{"num_features", to_variant(_features.size())}});
73  }
74 
75  /**
76  * Set the item column name.
77  *
78  * \param[in] item Name of the item column.
79  */
80  void set_item(const std::string& _item) {
81  this->item.assign(_item);
82  add_or_update_state({{"item", to_variant(this->item)}});
83  }
84 
85  /**
86  * Train FP-Growth on the full dataset.
87  *
88  * \param[in] data gl_sframe of data in stacked format.
89  */
90  void train(const gl_sframe& data);
91 
92  /* Preprocess data.
93  *
94  * \param[in] data gl_sframe of data in stacked format.
95  * \returns Indexed data in a stacked format.
96  */
97  gl_sframe preprocess(const gl_sframe& data) const;
98 
99  /**
100  * Extract features from the training data.
101  *
102  * \param[in] Data in the same format as the input data.
103  */
104  gl_sframe extract_features(const gl_sframe& data) const;
105 
106  /**
107  * Predict top-k rules
108  *
109  * \param[in] Data in the same format as the input data.
110  * \param[in] output_type The scoring function used to score the results.
111  * \param[in] k The topk rules to predict.
112  */
113  gl_sframe predict_topk(const gl_sframe& data,
114  const std::string& score_function = "confidence",
115  const size_t& k = 5) const;
116 
117  /**
118  * Returns the current model version
119  */
120  size_t get_version() const override;
121 
122  /**
123  * Serializes the model. Must save the model to the file format version
124  * matching that of get_version()
125  */
126  void save_impl(oarchive& oarc) const override;
127 
128  /**
129  * Loads a model previously saved at a particular version number.
130  * Should raise an exception on failure.
131  */
132  void load_version(iarchive& iarc, size_t version) override;
133 
134  /**
135  * Set one of the options in the algorithm. Use the option manager to set
136  * these options. If the option does not satisfy the conditions that the
137  * option manager has imposed on it. Errors will be thrown.
138  *
139  * \param[in] options Options to set
140  */
141  void init_options(const std::map<std::string,flexible_type>& _options) override;
142 
143  /**
144  * Get frequent itemsets using the FP-Growth algorithm.
145  */
146  gl_sframe get_frequent_patterns() const;
147 
148  // Register function for Python
149  // --------------------------------------------------------------------------
157  "data", "score_function", "k");
158  REGISTER_NAMED_CLASS_MEMBER_FUNCTION("_get_default_options",
162  "key");
164 };
165 
166 /**
167  * Create a pattern mining model.
168  *
169  * \param[in] data Data as an SFrame in stacked format.
170  * \param[in] item The name of the item/event column.
171  * \param[in] features The names of the columns that uniquely identify the session.
172  * \param[in] min_support The min support of each pattern.
173  * \param[in] max_patterns The maximum number of patterns to mine.
174  * \param[in] min_length The min length of each pattern.
175  *
176  */
177 EXPORT std::shared_ptr<fp_growth> _pattern_mining_create(
178  gl_sframe data,
179  std::string item,
180  std::vector<std::string> features,
181  size_t min_support,
182  size_t max_patterns,
183  size_t min_length);
184 
185 
186 // ITEMSET MINING ALGORITHMS /////////////////////////////////////////////////
187 /**
188  * Mine the closed itemset from the transaction database with at least
189  * min_support
190  *
191  * Args:
192  * database (gl_sarray) - SArray of transactions (flex_list of int)
193  * min_support (size_t) - minimum number of transactions required to be
194  * 'frequent'.
195  *
196  * Returns:
197  * closed_itemset_tree (fp_results_tree) - closed itemsets in compressed format
198  *
199  */
200 fp_results_tree closet_algorithm(const gl_sarray& database, const size_t& min_support);
201 
202 void closet_growth(fp_tree& my_tree, fp_results_tree& closed_itemset_tree,\
203  const size_t& min_support);
204 
205 void closet_prune(fp_tree& my_tree, fp_results_tree& closed_itemset_tree,\
206  const size_t& min_support);
207 
208 
209 /**
210  * Mine the top_k closed itemsets from the transaction database that have
211  * at least min_length.
212  *
213  * Args:
214  * database (gl_sarray) - SArray of transactions (flex_list of int)
215  * min_support (size_t) - minimum number of transaction required to be
216  * 'frequent'.
217  * top_k (size_t) - the maximum number of transactions to mine
218  * min_length (size_t) - the required minimum length of a transaction
219  * increasing min_length mines longer (but less frequent) transactions
220  * Returns:
221  * closed_itemset_tree (fp_top_k_results_tree) closed itemset tree in
222  * compressed format.
223  *
224  *
225  */
226 fp_top_k_results_tree top_k_algorithm(const gl_sarray& database, \
227  size_t& min_support, const size_t& top_k = TOP_K_MAX, \
228  const size_t& min_length = 1);
229 
230 /**
231  * Helper function. Mines the global fp_tree top down (most frequent first)
232  */
233 void global_top_down_growth(fp_top_k_tree& my_tree,\
234  fp_top_k_results_tree& closed_itemset_tree, size_t& min_support);
235 
236 /**
237  * Helper function. Mines local fp_trees bottom up (least frequent first)
238  */
239 void local_bottom_up_growth(fp_top_k_tree& my_tree,\
240  fp_top_k_results_tree& closed_itemset_tree, size_t& min_support);
241 
242 
243 } // pattern_mining
244 } // turicreate
245 #endif
#define BEGIN_CLASS_MEMBER_REGISTRATION(python_facing_classname)
#define REGISTER_CLASS_MEMBER_FUNCTION(function,...)
The serialization input archive object which, provided with a reference to an istream, will read from the istream, providing deserialization capabilities.
Definition: iarchive.hpp:60
void init_options(const std::map< std::string, flexible_type > &_options) override
gl_sframe predict_topk(const gl_sframe &data, const std::string &score_function="confidence", const size_t &k=5) const
const std::map< std::string, flexible_type > & get_current_options() const
#define END_CLASS_MEMBER_REGISTRATION
const variant_type & get_value_from_state(std::string key)
std::vector< std::string > list_fields()
#define REGISTER_NAMED_CLASS_MEMBER_FUNCTION(name, function,...)
variant_type to_variant(const T &f)
Definition: variant.hpp:308
gl_sframe get_frequent_patterns() const
std::map< std::string, flexible_type > get_default_options() const
gl_sframe extract_features(const gl_sframe &data) const
void set_item(const std::string &_item)
Definition: fp_growth.hpp:80
The serialization output archive object which, provided with a reference to an ostream, will write to the ostream, providing serialization capabilities.
Definition: oarchive.hpp:80
void set_features(const std::vector< std::string > &_features)
Definition: fp_growth.hpp:66