Turi Create  4.0
countmin.hpp
1 /* Copyright © 2017 Apple Inc. All rights reserved.
2  *
3  * Use of this source code is governed by a BSD-3-clause license that can
4  * be found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause
5  */
6 #ifndef TURI_SKETCHES_COUNTMIN_HPP
7 #define TURI_SKETCHES_COUNTMIN_HPP
8 #include <cmath>
9 #include <cstdint>
10 #include <functional>
11 #include <core/random/random.hpp>
12 #include <core/util/cityhash_tc.hpp>
13 #include <core/logging/assertions.hpp>
14 #include <core/storage/serialization/serialization_includes.hpp>
15 namespace turi {
16 namespace sketches {
17 
18 /**
19  * \ingroup sketching
20  * An implementation of the countmin sketch for estimating the frequency
21  * of each item in a stream.
22  *
23  * For more information on the details of the sketch:
24  * http://dimacs.rutgers.edu/~graham/pubs/papers/cmsoft.pdf
25  * The implementation generally follows the pseudocode in Figure 2.
26  * The resulting probabilistic data structure
27  *
28  * Usage is simple.
29  * \code
30  * countmin<T> cm; // for approximate counting of a stream with type T
31  * // repeatedly call
32  * cm.add(x) // x can be anything that is hashable.
33  * cm.estimate(x) // will return an estimate of the frequency for
34  * // a given element
35  * \endcode
36  * One can obtain guarantees on the error in answering a query is within a factor of
37  * ε with probability δ if one sets the
38  * width=ceiling(e / ε)
39  * depth=ceiling(log(1/δ)
40  * where e is Euler's constant.
41  *
42  */
43 template <typename T>
44 class countmin {
45  private:
46 
47  size_t num_hash = 0; /// Number of hash functions to use
48  size_t num_bits = 0; /// 2^b is the number of hash bins
49  size_t num_bins = 0; /// equal to 2^b: The number of buckets
50 
51  std::vector<size_t> seeds;
52  std::vector<std::vector<size_t> > counts; /// Buckets
53 
54  public:
55  /**
56  * Constructs a countmin sketch having "width" 2^bits and "depth".
57  * The size of the matrix of counts will be depth x 2^bits.
58  *
59  * \param bits The number of bins will be 2^bits.
60  *
61  * \param depth The "depth" of the sketch is the number of hash functions
62  * that will be used on each item.
63  */
64  explicit inline countmin(size_t bits = 16, size_t depth = 4, size_t seed=1000):
65  num_hash(depth), num_bits(bits), num_bins(1 << num_bits) {
66  ASSERT_GE(num_bins, 16);
67 
69  gen.seed(seed);
70  // Initialize hash functions and count matrix
71  for (size_t j = 0; j < num_hash; ++j) {
72  seeds.push_back(gen.fast_uniform<size_t>(0, std::numeric_limits<size_t>::max()));
73  counts.push_back(std::vector<size_t>(num_bins));
74  }
75  }
76 
77  /**
78  * Adds an arbitrary object to be counted. Any object type can be used,
79  * and there are no restrictions as long as std::hash<T> can be used to
80  * obtain a hash value.
81  */
82  void add(const T& t, size_t count = 1) {
83  // we use std::hash first, to bring it to a 64-bit number
84  size_t i = hash64(std::hash<T>()(t));
85  for (size_t j = 0; j < num_hash; ++j) {
86  size_t bin = hash64(seeds[j] ^ i) % num_bins; // TODO: bit mask
87  counts[j][bin] += count;
88  }
89  }
90 
91  /**
92  * Adds an arbitrary object to be counted. Any object type can be used,
93  * and there are no restrictions as long as std::hash<T> can be used to
94  * obtain a hash value.
95  */
96  void atomic_add(const T& t, size_t count = 1) {
97  // we use std::hash first, to bring it to a 64-bit number
98  size_t i = hash64(std::hash<T>()(t));
99  for (size_t j = 0; j < num_hash; ++j) {
100  size_t bin = hash64(seeds[j] ^ i) % num_bins; // TODO: bit mask
101  __sync_add_and_fetch(&(counts[j][bin]), count);
102  }
103  }
104  /**
105  * Returns the estimate of the frequency for a given object.
106  */
107  inline size_t estimate(const T& t) const {
108 
109  size_t E = std::numeric_limits<size_t>::max();
110  size_t i = hash64(std::hash<T>()(t));
111 
112  // Compute the minimum value across hashes.
113  for (size_t j = 0; j < num_hash; ++j) {
114  size_t bin = hash64(seeds[j] ^ i) % num_bins;
115  if (counts[j][bin] < E)
116  E = counts[j][bin];
117  }
118  return E;
119  }
120 
121  /**
122  * Merge two countmin datastructures.
123  * The two countmin objects must have the same width and depth.
124  */
125  void combine(const countmin& other) {
126  ASSERT_EQ(num_bins, other.num_bins);
127  ASSERT_EQ(num_hash, other.num_hash);
128 
129  for (size_t j = 0; j < num_hash; ++j) {
130  for (size_t b = 0; b < num_bins; ++b) {
131  counts[j][b] += other.counts[j][b];
132  }
133  }
134  }
135 
136  ///// HELPER FUNCTIONS /////////////////
137 
138  /**
139  * Prints the internal matrix containing the current counts.
140  */
141  inline void print() const {
142  for (size_t j = 0; j < num_hash; ++j) {
143  std::cout << ">>> ";
144  for (size_t b = 0; b < num_bins; ++b) {
145  std::cout << counts[j][b] << " ";
146  }
147  std::cout << std::endl;
148  }
149  }
150 
151  /**
152  * Computes the density of the internal counts matrix.
153  */
154  inline double density() const {
155  size_t count = 0;
156  for (size_t j = 0; j < num_hash; ++j) {
157  for (size_t b = 0; b < num_bins; ++b) {
158  if (counts[j][b] != 0) count += 1;
159  }
160  }
161  return (double) count / (double) (num_hash * num_bins);
162  }
163 
164  void save(oarchive& oarc) const {
165  oarc << num_hash
166  << num_bits
167  << num_bins
168  << seeds
169  << counts;
170  }
171  void load(iarchive& iarc) {
172  iarc >> num_hash
173  >> num_bits
174  >> num_bins
175  >> seeds
176  >> counts;
177  }
178 }; // countmin
179 } // namespace sketches
180 } // namespace turi
181 #endif
NumType fast_uniform(const NumType min, const NumType max)
Definition: random.hpp:159
void combine(const countmin &other)
Definition: countmin.hpp:125
void add(const T &t, size_t count=1)
Definition: countmin.hpp:82
The serialization input archive object which, provided with a reference to an istream, will read from the istream, providing deserialization capabilities.
Definition: iarchive.hpp:60
void seed()
Seed the generator using the default seed.
Definition: random.hpp:101
static uint64_t hash64(const char *s, size_t len)
size_t estimate(const T &t) const
Definition: countmin.hpp:107
double density() const
Definition: countmin.hpp:154
countmin(size_t bits=16, size_t depth=4, size_t seed=1000)
Buckets.
Definition: countmin.hpp:64
The serialization output archive object which, provided with a reference to an ostream, will write to the ostream, providing serialization capabilities.
Definition: oarchive.hpp:80
void atomic_add(const T &t, size_t count=1)
Definition: countmin.hpp:96