Turi Create  4.0
fp_tree.hpp
1 /* Copyright © 2017 Apple Inc. All rights reserved.
2  *
3  * Use of this source code is governed by a BSD-3-clause license that can
4  * be found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause
5  */
6 #ifndef TURI_FP_TREE_H
7 #define TURI_FP_TREE_H
8 
9 #include <vector>
10 #include <map>
11 #include <stack>
12 #include <string>
13 #include <memory>
14 #include <algorithm>
15 #include <iostream>
16 
17 #include <core/data/sframe/gl_sframe.hpp>
18 #include <core/data/sframe/gl_sarray.hpp>
19 
20 #include <toolkits/feature_engineering/topk_indexer.hpp>
21 #include <toolkits/feature_engineering/statistics_tracker.hpp>
22 
23 #include <toolkits/pattern_mining/fp_node.hpp>
24 #include <toolkits/pattern_mining/fp_tree_header.hpp>
25 
26 namespace turi {
27 namespace pattern_mining {
28 
29 const size_t TOP_K_MAX = std::numeric_limits<size_t>::max();
30 // FP-TREE /////////////////////////////////////////
31 /**
32  * FP-Tree data structure
33  * Vars:
34  * root_node (pointer fp_node) - the root node of the tree
35  * root_prefix (vector of ids) - the prefix of the fp-tree
36  * header (fp_tree_header) - the header for the fp-tree.
37  * my_closed_node_count - data structure to raise min_support
38  *
39  * See http://hanj.cs.illinois.edu/pdf/sigmod00.pdf
40  * http://web.engr.illinois.edu/~hanj/pdf/icdm02_topk.pdf and
41  * http://hanj.cs.illinois.edu/pdf/tkde05_tfp.pdf
42  */
43 class fp_tree {
44  public:
45  std::shared_ptr<fp_node> root_node = nullptr;
46  std::vector<size_t> root_prefix;
47  fp_tree_header header;
48 
49  // Constructors
50  fp_tree();
51  fp_tree(const fp_tree_header& header, \
52  const std::vector<size_t>& prefix = std::vector<size_t>());
53  fp_tree(const fp_tree & other_tree);
54  // Destructor
55  ~fp_tree();
56 
57  // Member Functions:
58  /**
59  * Prune tree of items less than min_support
60  * Args:
61  * min_support (size_t) - new min_support to prune tree
62  */
63  void prune_tree(const size_t& min_support);
64 
65  /**
66  * Get the support of all items at or below min_depth
67  * Args:
68  * min_depth (size_t)
69  * Returns:
70  * supports_at_depth (vector of size_t)
71  */
72  std::vector<size_t> get_supports_at_depth(const size_t& depth) const;
73 
74  /**
75  * Get the support of the item in transactions below a depth
76  *
77  * Args:
78  * heading (fp_tree_heading) - the item heading
79  * min_depth (size_t) - minimum depth in fp-tree (default is 1)
80  * Returns:
81  * support (size_t) the support (total count) of item id in the fp_tree
82  * (at min_depth)
83  */
84  size_t get_support(const fp_tree_heading& heading, \
85  const size_t& min_depth = 1) const;
86 
87  /**
88  * Get total number of transactions in the current fp-tree.
89  */
90  size_t get_num_transactions() const;
91 
92  /**
93  * Get the supports of descendant node of an anchor node
94  * Args:
95  * anchor_node (pointer to fp_node)
96  * Returns:
97  * supports (vector of size_t) the supports of the children nodes
98  * This vector is used to raise min_support.
99  */
100  std::vector<size_t> get_descendant_supports(fp_node* anchor_node);
101 
102  /**
103  * Add a transaction to the tree
104  * Args:
105  * new_transaction (vector of size_ts) - new itemset
106  * count (size_t) - repetitions of the transaction (should be positive)
107  */
108  virtual void add_transaction(const std::vector<size_t>& new_transaction,\
109  const size_t& count);
110 
111 
112  /**
113  * Get the header for a conditional fp-tree
114  *
115  * The conditional fp-tree of item 'alpha' is the fp-tree constructed
116  * from transcations in the current fp-tree that contain more frequent
117  * items than 'alpha'. Note that only items with support greater than
118  * min_support (in the new subtree) are returned.
119  *
120  * Args:
121  * heading (fp_tree_heading) - the item heading of the conditional fp-tree
122  * min_support (size_t) - the minimum number of transactions required
123  * to be 'frequent'
124  * Returns:
125  * cond_header (fp_tree-header) - header of conditional fp-tree
126  */
128  const size_t& min_support) const;
129 
130  /**
131  * Get the item frequencies for a conditional fp-tree
132  *
133  * Args:
134  * heading (fp_tree_heading) - the item heading of the conditional fp-tree
135  * Returns:
136  * item_counts (vector of size_t, size_t) - Vector of item id and counts
137  * of the conditional fp-tree
138  */
139  std::vector<std::pair<size_t, size_t>> get_cond_item_counts(\
140  const fp_tree_heading& heading) const;
141 
142  /**
143  * Construct the conditional fp-tree (fp-tree) for item id at the heading
144  *
145  * The conditional fp-tree of item 'id' is the fp-tree constructed
146  * from transcations in the current fp-tree that contain more frequent
147  * items than 'id'.
148  *
149  * Args:
150  * heading (fp_tree_heading) - the item heading of the conditional fp-tree
151  * min_support (size_t) - the minimum number of transactions required
152  * to be 'frequent'.
153  * Returns:
154  * cond_tree (fp_tree) - sorted vector of item ids.
155  */
156  fp_tree build_cond_tree(const fp_tree_heading& heading, \
157  const size_t& min_support) const;
158 
159 }; // End of fp-tree class
160 
161 // Other Functions
162 
163 /**
164  * Build the original fp-tree from a database of transactions and increase the
165  * minimum support using closed_node_count.
166  *
167  * Args:
168  * database (gl_sarray) - SArray of transactions (list of item ids)
169  * min_support (size_t) - minimum number of transactions required to be
170  * 'frequent'. Increased if possible.
171  * top_k (size_t) - maximum number of closed itemsets to mine
172  * min_length (size_t) - minimum length of closed itemsets to mine
173  * Returns:
174  * global_fp_tree (fp_tree) - the data structure for FP-growth
175  */
176 fp_tree build_tree(const gl_sarray& database, const size_t& min_support);
177 
178 // Helper functions
179 
180 /**
181  * Sorted and filter the header of an fp-tree
182  *
183  * Args:
184  * item_counts (vector of size_t, size_t) - vector of item id and count
185  * min_support (size_t) - minimum number of transactions required to be
186  * 'frequent'.
187  * Returns:
188  * header (fp_tree_header) - header for new fp_tree
189  */
190 fp_tree_header build_header( \
191  const std::vector<std::pair<size_t,size_t>>& item_counts,\
192  const size_t& min_support);
193 
194 /**
195  * Get the item frequencies for the original fp-tree
196  *
197  * Args:
198  * database (gl_sarray) - SArray of transactions (list of item ids)
199  * Returns:
200  * item_counts (vector of size_t, size_t) - vector of item id and counts
201  */
202 std::vector<std::pair<size_t, size_t>> get_item_counts(const gl_sarray& database);
203 
204 /**
205  * Convert row of gl_sarray to vector of item ids
206  * Removes Duplicates/Repeats in the row
207  * FLEX_UNDEFINED are ignored (this allows for empty vectors)
208  *
209  * Args:
210  * transaction_array (flexible_type) - flex_list of flex_int/double
211  * Returns:
212  * id_vector (vector of size_ts) - vector to add to fp_tree
213  */
214 std::vector<size_t> flex_to_id_vector(const flexible_type& transaction_array);
215 
216 /**
217  * Provides printing of the fp_tree.
218  */
219 std::ostream& operator<<(std::ostream& out, const fp_tree& tree);
220 
221 
222 
223 // FP_TOP_K_TREE /////////////////////////////////////////
224 /**
225  * TOP-K FP-Tree data structure
226  *
227  * Differs from a regular fp_tree in keeping track of a closed_node_count
228  * data structure to raise min_support
229  *
230  * Vars:
231  * top_k - number of closed itemsets to mine (MINE_ALL returns all itemsets)
232  * min_length - minimum length of closed itemset
233  * support_count_map - map of support to number of closed nodes with support
234  *
235  * http://web.engr.illinois.edu/~hanj/pdf/icdm02_topk.pdf
236  * http://hanj.cs.illinois.edu/pdf/tkde05_tfp.pdf
237  */
238 
239 class fp_top_k_tree : public fp_tree {
240  public:
241  size_t top_k;
242  size_t min_length;
243  std::map<size_t, size_t> closed_node_count;
244 
245  // Constructors are derived from fp_tree
246  fp_top_k_tree();
247  fp_top_k_tree(const fp_tree_header& header, \
248  const size_t& k = TOP_K_MAX, const size_t& length = 1, \
249  const std::vector<size_t>& prefix = std::vector<size_t>());
250  fp_top_k_tree(const fp_top_k_tree& other_tree);
251 
252  /**
253  * Get the min_support bound from closed_node_count
254  */
255  size_t get_min_support_bound();
256 
257  /*
258  * Anchor node + Descendent Support-raising-method
259  */
260  fp_node* get_anchor_node();
261  size_t get_anchor_min_support_bound();
262 
263  /**
264  * Returns min_depth for a node to be in transaction of length min_length
265  */
266  size_t get_min_depth();
267 
268  /**
269  * Modified methods to keep track of closed_node_count
270  */
272  size_t& min_support) const;
273  void add_transaction(const std::vector<size_t>& new_transaction,\
274  const size_t& count);
275  void update_if_closed_node(fp_node* node, const size_t& count);
276 
277 };
278 
279 /**
280  * Build the global top-k fp-tree from a database of transactions
281  *
282  * Args:
283  * database (gl_sarray) - SArray of transactions (list of item ids)
284  * min_support (size_t) - minimum number of transactions required to be
285  * 'frequent'. Increased if possible.
286  * top_k (size_t) - maximum number of closed itemsets to mine
287  * min_length (size_t) - minimum length of closed itemsets to mine
288  * Returns:
289  * global_top_k_tree (fp_top_k_tree) - the data structure for Top-K FP Growth
290  */
291 fp_top_k_tree build_top_k_tree(const gl_sarray& database, size_t& min_support,\
292  const size_t& top_k = TOP_K_MAX, const size_t& min_length = 1);
293 
294 /**
295  * Get k-th largest element of vector using min-heap.
296  */
297 size_t get_largest(std::vector<size_t> vec, size_t k);
298 
299 } // pattern_mining
300 } // turicreate
301 #endif
std::vector< size_t > get_supports_at_depth(const size_t &depth) const
void prune_tree(const size_t &min_support)
std::vector< std::pair< size_t, size_t > > get_cond_item_counts(const fp_tree_heading &heading) const
fp_tree build_cond_tree(const fp_tree_heading &heading, const size_t &min_support) const
std::vector< size_t > get_descendant_supports(fp_node *anchor_node)
fp_tree_header get_cond_header(const fp_tree_heading &heading, const size_t &min_support) const
size_t get_num_transactions() const
virtual void add_transaction(const std::vector< size_t > &new_transaction, const size_t &count)
size_t get_support(const fp_tree_heading &heading, const size_t &min_depth=1) const