Turi Create  4.0
countsketch.hpp
1 /* Copyright © 2017 Apple Inc. All rights reserved.
2  *
3  * Use of this source code is governed by a BSD-3-clause license that can
4  * be found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause
5  */
6 #ifndef TURI_SKETCHES_COUNTSKETCH_HPP
7 #define TURI_SKETCHES_COUNTSKETCH_HPP
8 #include <cmath>
9 #include <cstdint>
10 #include <functional>
11 #include <core/random/random.hpp>
12 #include <core/util/cityhash_tc.hpp>
13 #include <core/logging/assertions.hpp>
14 namespace turi {
15 namespace sketches {
16 
17 typedef int64_t counter_int;
18 
19 /**
20  * \ingroup sketching
21  * An implementation of the CountSketch for estimating the frequency
22  * of each item in a stream.
23  *
24  * Usage is simple.
25  * \code
26  * countsketch<T> c; // for approximate counting of a stream with type T
27  * // repeatedly call
28  * c.add(x) // x can be anything that is hashable.
29  * c.estimate(x) // will return an estimate of the frequency for
30  * // a given element
31  * \endcode
32  *
33  */
34 template <typename T>
35 class countsketch {
36  private:
37 
38  size_t num_hash = 0; /// Number of hash functions to use
39  size_t num_bits = 0; /// 2^b is the number of hash bins
40  size_t num_bins = 0; /// equal to 2^b: The number of buckets
41 
42  std::vector<size_t> seeds;
43  std::vector<size_t> seeds_binary;
44 
45  std::vector<std::vector<counter_int> > counts; /// Buckets
46 
47  public:
48  /**
49  * Constructs a CountSketch having "width" 2^bits and "depth".
50  * The size of the matrix of counts will be depth x 2^bits.
51  *
52  * \param bits The number of bins will be 2^bits.
53  *
54  * \param depth The "depth" of the sketch is the number of hash functions
55  * that will be used on each item.
56  */
57  explicit inline countsketch(size_t bits = 16, size_t depth = 5, size_t seed = 1000):
58  num_hash(depth), num_bits(bits), num_bins(1 << num_bits) {
59  ASSERT_GE(num_bins, 16);
60 
62  gen.seed(seed);
63  // Initialize hash functions and count matrix
64  for (size_t j = 0; j < num_hash; ++j) {
65  seeds.push_back(gen.fast_uniform<size_t>(0, std::numeric_limits<size_t>::max()));
66  seeds_binary.push_back(gen.fast_uniform<size_t>(0, std::numeric_limits<size_t>::max()));
67 
68  counts.push_back(std::vector<counter_int>(num_bins));
69  }
70  }
71 
72  /**
73  * Adds an arbitrary object to be counted. Any object type can be used,
74  * and there are no restrictions as long as std::hash<T> can be used to
75  * obtain a hash value.
76  *
77  * Note:
78  * Theoretical properties only apply to the situation where count is 1.
79  */
80  void add(const T& t, size_t count = 1) {
81 
82  // Create a 64-bit number from the object
83  size_t i = hash64(std::hash<T>()(t));
84 
85  for (size_t j = 0; j < num_hash; ++j) {
86  // convert trailing bit to 1 or -1
87  counter_int s = (counter_int)( hash64(seeds_binary[j] ^ i) & 1);
88  s = 2*s - 1;
89  // compute which bin to increment
90  size_t bin = hash64(seeds[j] ^ i) % num_bins; // TODO: bit mask
91  counts[j][bin] += s * (counter_int) count;
92  }
93  }
94 
95  /**
96  * Returns the estimate of the frequency for a given object.
97  */
98  inline counter_int estimate(const T& t) {
99 
100  // Create a 64-bit number from the object
101  size_t i = hash64(std::hash<T>()(t));
102 
103  // Compute the minimum value across hashes.
104  std::vector<counter_int> estimates;
105  for (size_t j = 0; j < num_hash; ++j) {
106  // convert trailing bit to 1 or -1
107  counter_int s = (counter_int) (hash64(seeds_binary[j] ^ i) & 1); // convert trailing bit to 1 or -1
108  s = 2*s - 1;
109  // compute which bin to increment
110  size_t bin = hash64(seeds[j] ^ i) % num_bins; // TODO: bit mask
111  counter_int estimate = s * counts[j][bin];
112  estimates.push_back(estimate);
113  }
114 
115  // Return the median
116  std::nth_element(estimates.begin(),
117  estimates.begin() + estimates.size()/2,
118  estimates.end());
119 
120  return estimates[estimates.size()/2];
121  }
122 
123  /**
124  * Merge two CountSketch datastructures.
125  * The two countsketch objects must have the same width and depth.
126  */
127  void combine(const countsketch& other) {
128  ASSERT_EQ(num_bins, other.num_bins);
129  ASSERT_EQ(num_hash, other.num_hash);
130 
131  for (size_t j = 0; j < num_hash; ++j) {
132  for (size_t b = 0; b < num_bins; ++b) {
133  counts[j][b] += other.counts[j][b];
134  }
135  }
136  }
137 
138  ///// HELPER FUNCTIONS /////////////////
139 
140  /**
141  * Prints the internal matrix containing the current counts.
142  */
143  inline void print() {
144  for (size_t j = 0; j < num_hash; ++j) {
145  std::cout << ">>> ";
146  for (size_t b = 0; b < num_bins; ++b) {
147  std::cout << counts[j][b] << " ";
148  }
149  std::cout << std::endl;
150  }
151  }
152 
153  /**
154  * Computes the density of the internal counts matrix.
155  */
156  inline double density() {
157  size_t count = 0;
158  for (size_t j = 0; j < num_hash; ++j) {
159  for (size_t b = 0; b < num_bins; ++b) {
160  if (counts[j][b] != 0) count += 1;
161  }
162  }
163  return (double) count / (double) (num_hash * num_bins);
164  }
165 
166 }; // countsketch
167 } // namespace sketches
168 } // namespace turi
169 #endif
void add(const T &t, size_t count=1)
Definition: countsketch.hpp:80
void combine(const countsketch &other)
NumType fast_uniform(const NumType min, const NumType max)
Definition: random.hpp:159
void seed()
Seed the generator using the default seed.
Definition: random.hpp:101
static uint64_t hash64(const char *s, size_t len)
countsketch(size_t bits=16, size_t depth=5, size_t seed=1000)
Buckets.
Definition: countsketch.hpp:57
counter_int estimate(const T &t)
Definition: countsketch.hpp:98