Turi Create  4.0
fp_results_tree.hpp
1 /* Copyright © 2017 Apple Inc. All rights reserved.
2  *
3  * Use of this source code is governed by a BSD-3-clause license that can
4  * be found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause
5  */
6 #ifndef TURI_FP_RESULTS_TREE_H
7 #define TURI_FP_RESULTS_TREE_H
8 
9 #include <vector>
10 #include <stack>
11 #include <map>
12 #include <string>
13 #include <memory>
14 #include <algorithm>
15 
16 #include <core/data/sframe/gl_sframe.hpp>
17 #include <core/data/sframe/gl_sarray.hpp>
18 
19 #include <toolkits/feature_engineering/topk_indexer.hpp>
20 #include <toolkits/feature_engineering/statistics_tracker.hpp>
21 
22 #include <toolkits/pattern_mining/fp_node.hpp>
23 #include <toolkits/pattern_mining/fp_tree.hpp>
24 #include <core/util/dense_bitset.hpp>
25 
26 namespace turi{
27 namespace pattern_mining {
28 
29 /**
30  * Tree data structure for keeping track of the frequent 'closed' itemsets
31  * This is a compressed, memory efficient data structure used to store and
32  * during the mining of 'closed' itemsets.
33  * Similar to the fp_tree structure, but stores itemsets, not transactions.
34  *
35  * Members:
36  * id_order_map (map of size_t to ranking) - global ordering of the ids
37  * (heuristic is to use decreasing order of support)
38  * hash_id_map (map to fp_nodes) - map to the head of linked lists for each id
39  * root_node
40  *
41  *
42  * See: http://web.engr.illinois.edu/~hanj/pdf/icdm02_topk.pdf
43  */
45  public:
46  std::map<size_t, size_t> id_order_map;
47  std::map<size_t, fp_node*> hash_id_map;
48  std::shared_ptr<fp_node> root_node = nullptr;
49 
50  // Constructors
52  fp_results_tree(const std::vector<size_t>& id_order);
53  fp_results_tree(const fp_results_tree& other_tree);
54 
55  /**
56  * Save the fp_results_tree into a oarc.
57  */
58  inline void save(oarchive& oarc) const {
59  // Save the id_order_map.
60  size_t num_transactions = get_num_transactions();
61  oarc << id_order_map
62  << num_transactions;
63 
64  // Save the closed_itemsets as SFrame.
65  gl_sframe closed_itemsets = get_closed_itemsets();
66  const std::string & prefix = oarc.get_prefix();
67  closed_itemsets.save(prefix);
68  }
69 
70  /**
71  * Load the fp_results_tree from an iarc.
72  */
73  inline void load(iarchive& iarc) {
74 
75  // Save the id_order_map.
76  size_t num_transactions = 0;
77  iarc >> id_order_map
78  >> num_transactions;
79 
80  // Load the root-node
81  root_node = std::make_shared<fp_node>(ROOT_ID, 0);
82  root_node->item_count = num_transactions;
83 
84  // Load the closed_itemset SFrame.
85  const std::string & prefix = iarc.get_prefix();
86  gl_sframe closed_itemsets = gl_sframe(prefix);
87  build_tree(closed_itemsets);
88  }
89 
90  /**
91  * Check if frequent itemset cannot be closed
92  *
93  * Checks if the itemset is a subset of an existing itemset in the tree
94  * with equal support. Returns true if a subset with equal support exists.
95  *
96  * Args:
97  * potential_itemset (vector of size_t) - itemset to check
98  * Returns:
99  * is_redundant (bool) - whether the itemset passes the test
100  */
101  bool is_itemset_redundant(const std::vector<size_t>& potential_itemset, \
102  const size_t& support) const;
103 
104  /**
105  * Add a potential closed itemset to the tree
106  *
107  * Args:
108  * potential_itemset (vector of size_ts) - itemset to add
109  * support (size_t) - support of the itemset
110  */
111  virtual void add_itemset(const std::vector<size_t>& potential_itemset, \
112  const size_t& support);
113 
114  /**
115  * Build results tree from a collection of closed itemsets
116  *
117  * Args:
118  * closed_itemsets (gl_sframe) - sframe of closed itemset (ids), support
119  *
120  */
121  void build_tree(const gl_sframe& closed_itemsets);
122 
123  /**
124  * Return the current collection of closed itemsets
125  *
126  * Returns:
127  * closed_itemsets (gl_sframe) - sframe of closed itemset (ids), support
128  */
130  const std::shared_ptr<topk_indexer>& indexer = nullptr) const;
131 
132  /**
133  * Return the current collection of closed itemsets as bitsets.
134  *
135  * Args:
136  * size: Size of each bitset returned.
137  * top_k (size_t) - maximum number of closed itemsets to returns
138  * min_length (size_t) - minimum required length for a closed itemset
139  *
140  * Returns:
141  * closed_bitsets - Vector of bitsets representing each set.
142  */
143  std::vector<dense_bitset> get_top_k_closed_bitsets(
144  const size_t& size,
145  const size_t& top_k = TOP_K_MAX,
146  const size_t& min_length = 1) const;
147 
148  /**
149  * Return the top_k closed itemsets in descending order.
150  *
151  * Takes longer than get_closed_itemsets() due to sorting, but is faster
152  * than trying to sort a gl_sframe.
153  *
154  * Args:
155  * top_k (size_t) - maximum number of closed itemsets to returns
156  * min_length (size_t) - minimum required length for a closed itemset
157  *
158  * Returns:
159  * top_k_closed_itemsets (gl_sframe) - sframe of top_k itemsets, support
160  */
161  gl_sframe get_top_k_closed_itemsets(const size_t& top_k = TOP_K_MAX, \
162  const size_t& min_length = 1,
163  const std::shared_ptr<topk_indexer>& indexer = nullptr) const;
164 
165  /**
166  * Sort itemset by id_order_map (last element first)
167  * Args:
168  * itemset (vector of size_ts)
169  * Returns:
170  * sorted_itemset (vector of size_ts)
171  */
172  std::vector<size_t> sort_itemset(const std::vector<size_t>& itemset) const;
173 
174  /**
175  * Get the support for a frequent itemset
176  * Note: the support for the empty set is the total number of transactions
177  *
178  * Args:
179  * sorted_itemset (vector of size_t) - a sorted frequent itemset (not
180  * necessarily closed) (see sort_itemset).
181  * lower_bound_on_support (size_t) - optional, for efficiency
182  * Returns:
183  * support (size_t)
184  */
185  size_t get_support(const std::vector<size_t>& sorted_itemset, \
186  const size_t& lower_bound_on_support = 0) const;
187 
188  /**
189  * Get the number of transaction
190  */
191  inline size_t get_num_transactions() const {return root_node->item_count;};
192  /**
193  * Prune Tree
194  * Pass over the tree, removing all nodes (corresponding to itemsets)
195  * with support strictly less than the given (final) min support
196  * Rebuilds tree -> Should be rarely called (ideally only at the end)
197  *
198  * Args:
199  * min_support (size_t)
200  */
201  void prune_tree(const size_t& min_support);
202 
203 };
204 
205 // Helper Functions
206 
207 /**
208  * Check if itemset is a subset on path to root (from node)
209  * Args:
210  * sorted_itemset (vector of size_ts) - sorted in id_order
211  * node (pointer to fp_node) - starting point in tree
212  * Returns:
213  * is_subset (bool) - whether the sorted_itemset is a subset
214  */
215 bool is_subset_on_path(const std::vector<size_t>& sorted_itemset, \
216  fp_node* node);
217 
218 /**
219  * Provides printing of the fp_results_tree
220  */
221 std::ostream& operator<<(std::ostream& out, const fp_results_tree& tree);
222 
223 /**
224  * Converts an itemset (a vector of item_ids) into a flex_list for an sframe.
225  * Uses an indexer to convert from ids to item type
226  */
227 flex_list itemset_to_flex_list(const std::vector<size_t>& itemset, \
228  const std::shared_ptr<topk_indexer>& indexer = nullptr);
229 
230 /**
231  * Tree data structure for keeping track of the top_k frequent 'closed' itemsets
232  * of length at least min_length
233  * This is a compressed, memory efficient data structure used to store and
234  * during the mining of 'closed' itemsets.
235  *
236  * Extends the fp_results_tree class
237  * Additional Vars:
238  * top_k (size_t) - maximum number of closed itemsets to mine
239  * top_k should be less than TOP_K_MAX
240  * min_length (size_t) - minimum length of a closed itemset
241  * min_support_heap (min_heap) - min heap of top_k supports
242  *
243  */
245  public:
246  size_t top_k;
247  size_t min_length;
248  std::priority_queue<size_t, std::vector<size_t>, std::greater<size_t>>
249  min_support_heap;
250 
252  fp_top_k_results_tree(const std::vector<size_t>& id_order, \
253  const size_t& k = TOP_K_MAX, const size_t& length = 1);
254  fp_top_k_results_tree(const fp_top_k_results_tree & other_tree);
255 
256  /**
257  * Save the fp_results_tree into a oarc.
258  */
259  inline void save(oarchive& oarc) const {
260  // Save the id_order_map.
261  size_t num_transactions = root_node->item_count;
262  oarc << id_order_map
263  << top_k
264  << min_length
265  << num_transactions;
266 
267  // Save the closed_itemsets as SFrame.
268  gl_sframe closed_itemsets = get_top_k_closed_itemsets(TOP_K_MAX, 1);
269  const std::string & prefix = oarc.get_prefix();
270  closed_itemsets.save(prefix);
271  }
272 
273  /**
274  * Load the fp_results_tree from an iarc.
275  */
276  inline void load(iarchive& iarc) {
277  // Save the id_order_map.
278  size_t num_transactions = 0;
279  iarc >> id_order_map
280  >> top_k
281  >> min_length
282  >> num_transactions;
283 
284  // Load the root-node
285  root_node = std::make_shared<fp_node>(ROOT_ID, 0);
286  root_node->item_count = num_transactions;
287 
288  // Load the closed_itemset SFrame.
289  const std::string & prefix = iarc.get_prefix();
290  gl_sframe closed_itemsets = gl_sframe(prefix);
291  build_tree(closed_itemsets);
292  }
293 
294  /**
295  * Add a potential closed itemset to the tree
296  *
297  * Args:
298  * potential_itemset (vector of size_ts) - itemset to add
299  * support (size_t) - support of the itemset
300  */
301  void add_itemset(const std::vector<size_t>& potential_itemset, \
302  const size_t& support);
303 
304  /**
305  * Get a bound for the min_support.
306  *
307  * Returns:
308  * A current estimate of min support.
309  */
310  size_t get_min_support_bound();
311 
312  /**
313  * Insert support into the min_support heap.
314  *
315  * Args:
316  * support (size_t) - support of the itemset
317  */
318  void insert_support(const size_t& support);
319 
320  /**
321  * Overrides fp_result_tree::get_closed_itemsets
322  * Return the current collection of closed itemsets
323  * Args:
324  * indexer: An indexer to convert from id to the value.
325  *
326  * Returns:
327  * closed_itemsets (gl_sframe) - sframe of closed itemset (ids), support
328  */
330  const std::shared_ptr<topk_indexer>& indexer = nullptr) const;
331 
332 };
333 
334 
335 
336 
337 } // pattern_mining
338 } // turicreate
339 
340 #endif
std::vector< size_t > sort_itemset(const std::vector< size_t > &itemset) const
bool is_itemset_redundant(const std::vector< size_t > &potential_itemset, const size_t &support) const
gl_sframe get_closed_itemsets(const std::shared_ptr< topk_indexer > &indexer=nullptr) const
The serialization input archive object which, provided with a reference to an istream, will read from the istream, providing deserialization capabilities.
Definition: iarchive.hpp:60
virtual void add_itemset(const std::vector< size_t > &potential_itemset, const size_t &support)
gl_sframe get_top_k_closed_itemsets(const size_t &top_k=TOP_K_MAX, const size_t &min_length=1, const std::shared_ptr< topk_indexer > &indexer=nullptr) const
void save(const std::string &path, const std::string &format="") const
void prune_tree(const size_t &min_support)
std::vector< dense_bitset > get_top_k_closed_bitsets(const size_t &size, const size_t &top_k=TOP_K_MAX, const size_t &min_length=1) const
size_t get_support(const std::vector< size_t > &sorted_itemset, const size_t &lower_bound_on_support=0) const
The serialization output archive object which, provided with a reference to an ostream, will write to the ostream, providing serialization capabilities.
Definition: oarchive.hpp:80
std::vector< flexible_type > flex_list
void build_tree(const gl_sframe &closed_itemsets)