Turi Create  4.0
hyperloglog.hpp
1 /* Copyright © 2017 Apple Inc. All rights reserved.
2  *
3  * Use of this source code is governed by a BSD-3-clause license that can
4  * be found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause
5  */
6 #ifndef TURI_SKETCH_HYPERLOGLOG_HPP
7 #define TURI_SKETCH_HYPERLOGLOG_HPP
8 #include <cmath>
9 #include <cstdint>
10 #include <functional>
11 #include <core/util/bitops.hpp>
12 #include <core/util/cityhash_tc.hpp>
13 #include <core/logging/assertions.hpp>
14 namespace turi {
15 namespace sketches {
16 /**
17  * \ingroup sketching
18  * An implementation of the hyperloglog sketch for estimating the number of
19  * unique elements in a datastream.
20  *
21  * Implements the hyperloglog sketch algorithm as described in:
22  * Philippe Flajolet, Eric Fusy, Olivier Gandouet and Frederic Meunier.
23  * HyperLogLog: the analysis of a near-optimal cardinality
24  * estimation algorithm. Conference on Analysis of Algorithms (AofA) 2007.
25  *
26  * with further reference from:
27  * Stefan Heule, Marc Nunkesser and Alexander Hall.
28  * HyperLogLog in Practice: Algorithmic Engineering of a State of The
29  * Art Cardinality Estimation Algorithm.
30  * Proceedings of the EDBT 2013 Conference.
31  *
32  *
33  * Usage is simple.
34  * \code
35  * hyperloglog hll;
36  * // repeatedly call
37  * hll.add(stuff) // this is a templatized function.
38  * // will accept anything it can hash using std::hash.
39  * hll.estimate() // will return an estimate of the number of unique element
40  * hll.error_bound() // will return the standard deviation on the estimate
41  * \endcode
42  */
43 class hyperloglog {
44  private:
45  size_t m_b = 0; /// 2^b is the number of hash bins
46  size_t m_m = 0; /// equal to 2^b: The number of buckets
47  double m_alpha = 0;
48  std::vector<unsigned char> m_buckets; /// Buckets
49 
50  public:
51  /**
52  * Constructs a hyperloglog sketch using 2^b buckets.
53  * The resultant hyperloglog datastructure will require
54  * 2^b bytes of memory. b must be at least 4. (i.e. 2^4 buckets).
55  */
56  explicit inline hyperloglog(size_t b = 16):
57  m_b(b), m_m(1 << b), m_buckets(m_m, 0) {
58  ASSERT_GE(m_m, 16);
59  // constants from SFlajolet et al. Fig 3
60  switch(m_m) {
61  case 16:
62  m_alpha = 0.673;
63  break;
64  case 32:
65  m_alpha = 0.697;
66  break;
67  case 64:
68  m_alpha = 0.709;
69  break;
70  default:
71  m_alpha = 0.7213 / (1 + 1.079 / (double)m_m);
72  }
73  };
74 
75  /**
76  * Adds an arbitrary object to be counted. Any object type can be used,
77  * and there are no restrictions as long as std::hash<T> can be used to
78  * obtain a hash value.
79  */
80  template <typename T>
81  void add(const T& t) {
82  // Then cityhash's hash64 twice to distribute the hash.
83  // empirically, one hash64 does not produce enough scattering to
84  // get a good estimate
85  uint64_t h = hash64(hash64(t));
86  size_t index = h >> (64 - m_b);
87  DASSERT_LT(index, m_buckets.size());
88  unsigned char pos = h != 0 ? 1 + __builtin_clz(static_cast<unsigned int>(h)) : sizeof(size_t);
89  m_buckets[index] = std::max(m_buckets[index], pos);
90  }
91 
92  /**
93  * Merge two hyperloglog datastructures.
94  * The two data structures must be constructed with the same number of
95  * buckets. Combining of two sketches constructed on two disjoint data
96  * streams produces identical results to generating one sketch on the both
97  * data streams.
98  */
99  void combine(const hyperloglog& other) {
100  ASSERT_EQ(m_buckets.size(), other.m_buckets.size());
101  for (size_t i = 0;i < m_buckets.size(); ++i) {
102  m_buckets[i] = std::max(m_buckets[i], other.m_buckets[i]);
103  }
104  }
105 
106 
107  /**
108  * Returns the standard error of the estimate.
109  */
110  inline double error_bound() {
111  return estimate() * 1.04 / std::sqrt(m_m);
112  }
113 
114  /**
115  * Returns the estimate of the number of unique items.
116  */
117  inline double estimate() {
118  double E = 0;
119  for (size_t i = 0;i < m_buckets.size(); ++i) {
120  E += std::pow(2.0, -(double)m_buckets[i]);
121  }
122  E = m_alpha * m_m * m_m / E;
123  // perform bias correction for small values
124  if (E <= 2.5 * m_m) {
125  size_t zero_count = 0;
126  for(auto i: m_buckets) zero_count += (i == 0);
127  if (zero_count != 0) E = m_m * std::log((double)m_m / zero_count);
128  }
129  // we do not need correction for large values 64-bit hash, assume
130  // collisions are unlikely
131  return E;
132  }
133 }; // hyperloglog
134 } // namespace sketch
135 } // namespace turi
136 #endif
hyperloglog(size_t b=16)
Buckets.
Definition: hyperloglog.hpp:56
static uint64_t hash64(const char *s, size_t len)
void combine(const hyperloglog &other)
Definition: hyperloglog.hpp:99