Turi Create  4.0
rule_mining.hpp
1 /* Copyright © 2017 Apple Inc. All rights reserved.
2  *
3  * Use of this source code is governed by a BSD-3-clause license that can
4  * be found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause
5  */
6 #ifndef TURI_FP_RULE_MINING_H
7 #define TURI_FP_RULE_MINING_H
8 
9 #include <vector>
10 #include <stack>
11 #include <map>
12 #include <string>
13 #include <memory>
14 #include <algorithm>
15 
16 #include <core/data/sframe/gl_sframe.hpp>
17 #include <core/data/sframe/gl_sarray.hpp>
18 
19 #include <toolkits/feature_engineering/topk_indexer.hpp>
20 #include <toolkits/feature_engineering/statistics_tracker.hpp>
21 
22 #include <toolkits/pattern_mining/fp_node.hpp>
23 #include <toolkits/pattern_mining/fp_results_tree.hpp>
24 
25 namespace turi{
26 namespace pattern_mining {
27 
28 const size_t CONF_SCORE = 0;
29 const size_t LIFT_SCORE = 1;
30 const size_t ALL_CONF_SCORE = 2;
31 const size_t MAX_CONF_SCORE = 3;
32 const size_t KULC_SCORE = 4;
33 const size_t COSINE_SCORE = 5;
34 const size_t CONVICTION_SCORE = 6;
35 
36 // Helper Data Structures
37 /**
38  * rule structure
39  * Collection of information used for association rule mining and prediction.
40  *
41  * A rule is (LHS -> RHS). Both LHS and RHS are frequent itemsets.
42  *
43  * Vars:
44  * LHS (vector of size_t) - item ids in the current itemset
45  * RHS (vector of size_t) - item ids to add to the current itemset
46  * LHS_support (size_t) - support of LHS
47  * RHS_support (size_t) - support of RHS
48  * total_support (size_t) - support of LHS + RHS
49  * num_transactions (size_t) - normalizing constant for support values
50  */
51 struct rule {
52  std::vector<size_t> LHS;
53  std::vector<size_t> RHS;
54  size_t LHS_support;
55  size_t RHS_support;
56  size_t total_support;
57  size_t num_transactions;
58 };
59 
60 /**
61  * rule_list class
62  * A collection of rule structs
63  */
64 class rule_list {
65  public:
66  std::vector<rule> rules;
67  size_t num_transactions;
68 
69  void add_rule(const rule& new_rule) {rules.push_back(new_rule);}
70  std::vector<size_t> get_LHS_supports() const;
71  std::vector<size_t> get_RHS_supports() const;
72  std::vector<size_t> get_total_supports() const;
73  inline size_t size() const { return rules.size(); };
74 
75 
76  void append_rule_list(const rule_list& other_list);
77 
78  // Convert to gl_sframe
79  gl_sframe to_gl_sframe(
80  const std::shared_ptr<topk_indexer>& indexer = nullptr) const;
81 
82  // Convert to flex_list
83  flex_list to_flex_list(std::vector<double> scores, \
84  const std::shared_ptr<topk_indexer>& indexer = nullptr) const;
85 
86  /**
87  * Extract top_k rules
88  *
89  * Args:
90  * top_k (size_t) - maximum number of rules to return
91  * score_type (size_t) - see score_rules() for explanation of score types
92  * indexer (topk_indexer) - convert itemset ids to items
93  * Returns:
94  * top_rules (vector of flex_list) - the top k rules as
95  * flex_list{ [LHS] , [RHS], score}
96  */
97  flex_list get_top_k_rules(const size_t& top_k = TOP_K_MAX, \
98  const size_t& score_type = CONF_SCORE,\
99  const std::shared_ptr<topk_indexer>& indexer = nullptr) const;
100 
101  std::vector<double> score_rules(const size_t& score_type) const;
102 
103 };
104 /**
105  * Provides printing of the rule_list
106  */
107 std::ostream& operator<<(std::ostream& out, const rule_list& my_rules);
108 
109 
110 /**
111  * Extract all rules
112  *
113  * Args:
114  * closed_itemset_tree (fp_results_tree) - compressed closed itemsets
115  *
116  * Returns:
117  * all_rules (rule_list) where the LHS are subsets of my_itemset
118  *
119  */
120 //rule_list extract_all_rules(const fp_results_tree& closed_itemset_tree);
121 
122 
123 /**
124  * Extract the rules for a given itemset
125  *
126  * Args:
127  * my_itemset (vector of size_t) - itemset to mine
128  * closed_itemset_tree (fp_results_tree) - compressed closed itemsets
129  *
130  * Returns:
131  * my_rules (rule_list) where the LHS are subsets of my_itemset
132  */
133 rule_list extract_relevant_rules(const std::vector<size_t>& my_itemset, \
134  const fp_results_tree& closed_itemset_tree);
135 
136 /**
137  * Extract the top k rules for a given itemset
138  *
139  * Args:
140  * my_itemset (vector of size_t) - itemset to mine
141  * closed_itemset_tree (fp_results_tree) - compressed closed itemsets
142  * top_k (size_t) - maximum number of rules to mine
143  * score_type (size_t) - score type (see score_type())
144  *
145  * Returns:
146  * top_rules (vector of flex_list) - the top k rules as
147  * flex_list{ [LHS] , [RHS], score}
148  */
149 flex_list extract_top_k_rules(const std::vector<size_t>& my_itemset, \
150  const fp_results_tree& closed_itemset_tree, \
151  const size_t& top_k = TOP_K_MAX, \
152  const size_t& score_type = CONF_SCORE, \
153  const std::shared_ptr<topk_indexer>& indexer = nullptr);
154 
155 
156 
157 // Helper class for extracting rules // TODO REFACTOR
158 class rule_miner {
159  private:
160  std::vector<size_t> RHS;
161  std::vector<std::vector<size_t>> LHS_list;
162  std::vector<size_t> LHS_support_list;
163  std::vector<std::vector<size_t>> itemset_list;
164  rule_list my_rules;
165  public:
166  fp_results_tree closed_itemset_tree;
167 
168  rule_miner(const std::vector<size_t>& sorted_itemset,\
169  const fp_results_tree& my_results);
170 
171  void extract_relevant_rules_helper(std::shared_ptr<fp_node>& node);
172 
173  rule_list get_rule_list() {return my_rules;}
174  std::vector<size_t> get_itemset() {return itemset_list.front();}
175 };
176 
177 
178 // Helper functions for extract_top_k_rules
179 /**
180  * Compare rule_score_pairs using the score
181  */
183  inline bool operator()(const std::pair<rule, double> & left, \
184  const std::pair<rule, double> &right){
185  return left.second > right.second;
186  };
187 };
188 /**
189  * Rule-Score-Pair min_heap implementation
190  */
192  public:
193  size_t top_k;
194  std::priority_queue<std::pair<rule, double>, \
195  std::vector<std::pair<rule, double>>, \
196  rule_score_compare> \
197  min_heap;
198 
199  inline rule_score_min_heap(const size_t& k = TOP_K_MAX){ top_k = k;};
200 
201  void add_rule_score_pair(const std::pair<rule, double>& rule_score_pair);
202  std::vector<std::pair<rule, double>> convert_to_sorted_vector();
203 };
204 flex_list rules_to_flex_list(std::vector<std::pair<rule, double>> rule_score_pairs,
205  const std::shared_ptr<topk_indexer>& indexer);
206 
207 
208 // Score functions for Association Rules ////////////////////////////////////
209 // See "Data Mining - Concepts and Techniques 3rd Edition Chapter 6" by
210 // Jiawei Han, Micheline Kamber, and Jian Pei
211 /**
212  * Function for scoring an sframe of rules
213  * Args:
214  * sf_rules (gl_sframe) - sframe of rules
215  * score_type (size_t) - use one of the score macros below
216  * Returns:
217  * sa_scores (gl_sarray) - sarray of scores for each rule
218  *
219  * CONF_SCORE
220  * confidence = Support(LHS + RHS) / Support(LHS) = "Pr(RHS | LHS)"
221  *
222  * LIFT_SCORE
223  * lift = Support(LHS + RHS) / (Support(LHS) * Support(RHS))
224  * = "Pr(RHS | LHS) / Pr(RHS)"
225  *
226  * ALL_CONF_SCORE
227  * all_confidence = Support(LHS + RHS) / max{ Support(LHS), Support(RHS) }
228  * = min { "Pr(RHS | LHS)" , "Pr(LHS | RHS)" }
229  *
230  * MAX_CONF_SCORE
231  * max_confidence = Support(LHS + RHS) / min{ Support(LHS), Support(RHS) }
232  * = max { "Pr(RHS | LHS)" , "Pr(LHS | RHS)" }
233  *
234  * KULC_SCORE
235  * Kulczynski = 0.5 * ( "Pr(RHS | LHS)" + "Pr(LHS | RHS)" )
236  *
237  * COSINE_SCORE
238  * cosine = Support(LHS + RHS) / sqrt( Support(LHS) * Support(RHS) )
239  * = sqrt( "Pr(RHS | LHS)" * "Pr(LHS | RHS)" )
240  *
241  * CONVICTION_SCORE
242  * conviction = (1 - Support(RHS))/(1 - Conf(LHS => RHS))
243  */
244 
245 /**
246  * Return the type (in the enum) for the score function from its name.
247  */
248 inline size_t get_score_function_type_from_name(const std::string& score_function_name){
249  if (score_function_name == "confidence") {
250  return CONF_SCORE;
251  } else if (score_function_name == "lift") {
252  return LIFT_SCORE;
253  } else if (score_function_name == "all_confidence") {
254  return ALL_CONF_SCORE;
255  } else if (score_function_name == "max_confidence") {
256  return MAX_CONF_SCORE;
257  } else if (score_function_name == "kulczynski") {
258  return KULC_SCORE;
259  } else if (score_function_name == "cosine") {
260  return COSINE_SCORE;
261  } else if (score_function_name == "conviction"){
262  return CONVICTION_SCORE;
263  } else {
264  log_and_throw("Internal error. No such scoring function exists.");
265  }
266 }
267 
268 
269 // Helper function for score_rules
270 std::function<double (const rule&)> get_score_function(const size_t& score_type, \
271  const size_t& num_transactions);
272 double confidence_score(const double& LHS_support, const double& RHS_support, const double& total_support);
273 double lift_score(const double& LHS_support, const double& RHS_support, const double& total_support);
274 double all_confidence_score(const double& LHS_support, const double& RHS_support, const double& total_support);
275 double max_confidence_score(const double& LHS_support, const double& RHS_support, const double& total_support);
276 double kulc_score(const double& LHS_support, const double& RHS_support, const double& total_support);
277 double cosine_score(const double& LHS_support, const double& RHS_support, const double& total_support);
278 double conviction_score(const double& LHS_support, const double& RHS_support, const double& total_support);
279 
280 } // pattern_mining
281 } // turicreate
282 
283 #endif
std::vector< flexible_type > flex_list