Turi Create  4.0
hopscotch_table.hpp
1 /* Copyright © 2017 Apple Inc. All rights reserved.
2  *
3  * Use of this source code is governed by a BSD-3-clause license that can
4  * be found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause
5  */
6 #ifndef TURI_UTIL_HOPSCOTCH_TABLE_HPP
7 #define TURI_UTIL_HOPSCOTCH_TABLE_HPP
8 
9 #include <vector>
10 #include <utility>
11 #include <algorithm>
12 #include <functional>
13 #include <iterator>
14 #include <core/logging/assertions.hpp>
15 
16 #define _HOPSCOTCH_TABLE_DEFAULT_HASH std::hash<T>
17 
18 
19 namespace turi {
20 
21 
22 /**
23  * This defines a hash table where each entry stores a
24  * fixed data type T. The data type T should be <b>small</b>
25  * and should preferably fit in a couple of words.
26  * This hash table is not resizeable. Use the hopscotch_map
27  * For a more general purpose table.
28  *
29  * \tparam T The data type stored in the hash table
30  * \tparam Hash The hash functor type. Defaults to std::hash<T> if C++11 is
31  * available. Otherwise defaults to boost::hash<T>
32  * \tparam KeyEqual The functor used to identify object equality. Defaults to
33  * std::equal_to<T>
34  */
35 template <typename T,
36  typename Hash = _HOPSCOTCH_TABLE_DEFAULT_HASH,
37  typename KeyEqual = std::equal_to<T> >
39  public:
40  /// The data type stored in the table
41  typedef T value_type;
42  typedef size_t size_type;
43  /// The type of the hasher object
44  typedef Hash hasher;
45  /// The type of the equality tester
46  typedef KeyEqual equality_function;
47  /// A pointer to the data type stored in the table
48  typedef value_type* pointer;
49  /// A reference to the data type stored in the table
50  typedef value_type& reference;
51  /// A constant pointer to the data type stored in the table
52  typedef const value_type* const_pointer;
53  /// A constant reference to the data type stored in the table
54  typedef const value_type& const_reference;
55 
56  private:
57  /// The actual contents of the hash table
58  struct element {
59  bool hasdata: 1; /// Whether this entry has data.
60  uint32_t field: 31; /// The hopscotch bitfield. Only 31 bits are useable
61  T elem; /// User data
62  element():hasdata(false), field(0) { }
63  };
64 
65  std::vector<element> data;
66 
67  hasher hashfun;
68  equality_function equalfun;
69  size_t numel;
70  size_t mask;
71 
72  /// Returns the next power of 2 of a value
73  static uint64_t next_powerof2(uint64_t val) {
74  --val;
75  val = val | (val >> 1);
76  val = val | (val >> 2);
77  val = val | (val >> 4);
78  val = val | (val >> 8);
79  val = val | (val >> 16);
80  val = val | (val >> 32);
81  return val + 1;
82  }
83 
84  /** Computes the hash of the data. And perturbs it
85  * using either CRC32 or Jenkin's 32-bit mix
86  */
87  size_t compute_hash(const value_type& d) const {
88  size_t state = hashfun(d);
89 #ifdef HAS_BUILTIN_CRC32
90  return __builtin_ia32_crc32di(0, state);
91 #else
92  /*
93  * Bob Jenkin's 32 bit integer mix function from
94  * http://home.comcast.net/~bretm/hash/3.html
95  */
96  state += (state << 12);
97  state ^= (state >> 22);
98  state += (state << 4);
99  state ^= (state >> 9);
100  state += (state << 10);
101  state ^= (state >> 2);
102  state += (state << 7);
103  state ^= (state >> 12);
104  return state;
105 #endif
106  }
107 
108 
109 
110  public:
111  /**
112  * Constructs a hopscotch table of a given length.
113  *
114  * \param len This rounded up to the next power of 2 will be used as
115  * the length of the table. This table is not resizeable.
116  * \param hashfun The hasher functor. Defaults to Hash()
117  * \param equalfun A functor used to test for equality. Defaults to KeyEqual()
118  */
119  hopscotch_table(size_t len,
120  Hash hashfun = Hash(),
121  KeyEqual equalfun = KeyEqual()):
122  data(next_powerof2(len) + 32),
123  hashfun(hashfun),
124  equalfun(equalfun),
125  numel(0),
126  mask(data.size() - 32 - 1) {
127  }
128 
129  /// Returns the hash function used by the hash table
130  hasher hash_function() const {
131  return hashfun;
132  }
133 
134  /// Returns the equality function used by the hash table
135  equality_function key_eq() const {
136  return equalfun;
137  }
138 
139  /**
140  * A const iterator which allows iteration over the hash table
141  * entries. Insertions may disrupt the iterator order. Deletions
142  * invalidate the iterator.
143  */
144  struct const_iterator {
145  typedef std::forward_iterator_tag iterator_category;
146  typedef const typename hopscotch_table::value_type value_type;
147  typedef size_t difference_type;
148  typedef value_type* pointer;
149  typedef value_type& reference;
150 
151  friend class hopscotch_table;
152 
153  const hopscotch_table* ptr;
154  typename std::vector<element>::const_iterator iter;
155 
156  const_iterator():ptr(NULL) {}
157 
158  const_iterator operator++() {
159  ++iter;
160  while(iter != ptr->data.end() && !iter->hasdata) {
161  ++iter;
162  }
163  return *this;
164  }
165 
166  const_iterator operator++(int) {
167  iterator cur = *this;
168  ++(*this);
169  return cur;
170  }
171 
172 
173  reference operator*() {
174  return iter->elem;
175  }
176 
177  pointer operator->() {
178  return &(iter->elem);
179  }
180 
181  bool operator==(const const_iterator it) const {
182  return ptr == it.ptr && iter == it.iter;
183  }
184 
185  bool operator!=(const const_iterator iter) const {
186  return !((*this) == iter);
187  }
188 
189 
190  private:
191  const_iterator(const hopscotch_table* table,
192  typename std::vector<element>::const_iterator iter):
193  ptr(table), iter(iter) { }
194  };
195 
196 
197 
198  /**
199  * A const iterator which allows iteration over the hash table
200  * entries. Insertions may disrupt the iterator order. Deletions
201  * invalidate the iterator.
202  */
203  struct iterator {
204  typedef std::forward_iterator_tag iterator_category;
205  typedef typename hopscotch_table::value_type value_type;
206  typedef size_t difference_type;
207  typedef value_type* pointer;
208  typedef value_type& reference;
209 
210  friend class hopscotch_table;
211 
212  hopscotch_table* ptr;
213  typename std::vector<element>::iterator iter;
214 
215  iterator():ptr(NULL) {}
216 
217 
218  operator const_iterator() const {
219  const_iterator it(ptr, iter);
220  return it;
221  }
222 
223  iterator operator++() {
224  ++iter;
225  while(iter != ptr->data.end() && !iter->hasdata) {
226  ++iter;
227  }
228  return *this;
229  }
230 
231  iterator operator++(int) {
232  iterator cur = *this;
233  ++(*this);
234  return cur;
235  }
236 
237 
238  reference operator*() {
239  return iter->elem;
240  }
241 
242  pointer operator->() {
243  return &(iter->elem);
244  }
245 
246  bool operator==(const iterator it) const {
247  return ptr == it.ptr && iter == it.iter;
248  }
249 
250  bool operator!=(const iterator iter) const {
251  return !((*this) == iter);
252  }
253 
254 
255  private:
256  iterator(hopscotch_table* table,
257  typename std::vector<element>::iterator iter):
258  ptr(table), iter(iter) { }
259  };
260 
261  /** Standard insert iterator. Writing into this iterator
262  * will cause insertions to occur. It is however, recommended
263  * that the insert() operation be used instead of the insert_iterator
264  * since the insert_iterator silently fails on insert failure.
265  */
267  hopscotch_table* cmap;
268  typedef std::forward_iterator_tag iterator_category;
269  typedef typename hopscotch_table::value_type value_type;
270 
271  insert_iterator(hopscotch_table* c):cmap(c) {}
272 
273  insert_iterator operator++() {
274  return (*this);
275  }
276  insert_iterator operator++(int) {
277  return (*this);
278  }
279 
280  insert_iterator& operator*() {
281  return *this;
282  }
283  insert_iterator& operator=(const insert_iterator& i) {
284  cmap = i.cmap;
285  return *this;
286  }
287 
288  insert_iterator& operator=(const value_type& v) {
289  cmap->insert(v);
290  return *this;
291  }
292  };
293 
294  private:
295  /**
296  * Searches for a target entry and overwrites if it exists
297  */
298  iterator try_find_and_overwrite(const value_type& newdata,
299  size_t target,
300  bool overwrite) {
301  // find the next empty entry
302  iterator iter = find_impl(newdata, target);
303  if (iter != end() && overwrite) {
304  iter.iter->elem = newdata;
305  }
306  return iter;
307  }
308 
309  /**
310  * If overwrite is set, it will additionally check for existance
311  * of the entry and overwrite if it exists.
312  * Iterator is not going to be necessarily valid under parallel access.
313  */
314  iterator insert_impl(const value_type& newdata, bool overwrite = true) {
315  // find the next empty entry
316  size_t target = compute_hash(newdata) & mask;
317 
318  iterator ret = try_find_and_overwrite(newdata,
319  target,
320  overwrite);
321  if (ret != end()) return ret;
322 
323  // search for a place to stick it into
324  bool found = false;
325  size_t shift_target = target;
326  // let max range is 31 * 20
327  size_t limit = std::min(data.size(), target + 31 * 20);
328  for (;shift_target < limit; shift_target++) {
329  if (data[shift_target].hasdata == false) {
330  // double check
331  if (data[shift_target].hasdata == false) {
332  // yup still true.
333  // we got an empty value.
334  // quit the search
335  found = true;
336  break;
337  }
338  }
339  }
340 
341  if (!found) {
342  // failed to find a place to put this value.
343  return iterator(this, data.end());
344  }
345 
346  // while the shift target is out of range
347  while(shift_target - target >= 31) {
348  // search backwards
349  // we would like to jump as far as possible
350  // find an hash entry whose field placed something
351  // between here and the shift target
352  // and move it to the shift target.
353  // for i = 31 to 1
354  found = false;
355 
356  for (size_t i = 30; i >= 1; --i) {
357  size_t r;
358  if (data[shift_target - i].field) {
359  r = __builtin_ctz(data[shift_target - i].field);
360  if (r <= i) {
361  // shift
362  size_t new_shift_target = shift_target - i + r;
363  DASSERT_TRUE(data[new_shift_target].hasdata);
364  data[shift_target].elem = data[new_shift_target].elem;
365  data[shift_target].hasdata = true;
366  data[new_shift_target].hasdata = false;
367  data[new_shift_target].elem = T();
368 
369  // unset the bit for r and set the bit for i
370  data[shift_target - i].field =
371  (data[shift_target - i].field & ~((uint32_t)1 << r))
372  | ((uint32_t)1 << i);
373  shift_target = new_shift_target;
374  found = true;
375  break;
376  }
377  }
378  }
379 
380  if (!found) {
381  return iterator(this, data.end());
382  }
383  }
384  // insert and return
385  // we need to lock ID - 1 so as to ensure intersection with the hash target
386  data[shift_target].elem = newdata;
387  data[target].field |= (1 << (shift_target - target));
388  data[shift_target].hasdata = true;
389  ++numel;
390  return iterator(this, data.begin() + shift_target);
391  }
392 
393 
394  /**
395  * Searches for an entry and returns an iterator to the entry.
396  * The hash entry of the key is provided.
397  * KeyEqual will be used to identify if an entry matches the request.
398  * return end() on failure.
399  */
400  const_iterator find_impl(const value_type& key, size_t target) const {
401  uint32_t field = data[target].field;
402  while (field > 0) {
403  int r = __builtin_ctz(field);
404  if (data[target + r].hasdata &&
405  key_eq()(data[target + r].elem, key)) {
406  return const_iterator(this, data.begin() + target + r);
407  }
408  else {
409  // mask out the current bit and try again.
410  field &= ~(((uint32_t)1 << r));
411  }
412  }
413  return const_iterator(this, data.end());
414  }
415 
416 
417  iterator find_impl(const value_type& key, size_t target) {
418  const_iterator iter = ((const hopscotch_table*)(this))->find_impl(key, target);
419  return iterator(this, data.begin() + (iter.iter - data.begin()));
420  }
421 
422  public:
423  /**
424  * Inserts an entry into the array.
425  * Returns an iterator to the just inserted data on success.
426  * If the entry already exists, it will be overwritten.
427  * Returns end() on failure.
428  */
429  iterator insert(const value_type& newdata) {
430  return insert_impl(newdata);
431  }
432 
433  /**
434  * Inserts an entry into the array.
435  * Returns an iterator to the just inserted data on success.
436  * This function check if the entry already exists, if it does,
437  * do nothing
438  * Returns end() on failure.
439  */
440  iterator insert_do_not_overwrite(const value_type& newdata) {
441  return insert_impl(newdata, false);
442  }
443 
444 
445 
446  /**
447  * Searches for an entry and returns an iterator to the entry.
448  * KeyEqual will be used to identify if an entry matches the request.
449  * return end() on failure.
450  */
451  const_iterator find(const value_type& key) const {
452  size_t target = compute_hash(key) & mask;
453  return find_impl(key, target);
454  }
455 
456  /**
457  * Searches for an entry and returns an iterator to the entry.
458  * KeyEqual will be used to identify if an entry matches the request.
459  * return end() on failure.
460  */
461  iterator find(const value_type& key) {
462  const_iterator iter = ((const hopscotch_table*)(this))->find(key);
463  return iterator(this, data.begin() + (iter.iter - data.begin()));
464  }
465 
466 
467  void clear() {
468  for (size_t i = 0;i < data.size(); ++i) {
469  data[i].hasdata = false;
470  data[i].field = 0;
471  data[i].elem = value_type();
472  }
473  numel = 0;
474  }
475 
476  /**
477  * Erases an entry pointed to by an iterator.
478  */
479  bool erase(iterator iter) {
480  if (iter.iter == data.end()) return false;
481  DASSERT_TRUE(iter.iter->hasdata);
482  size_t target = compute_hash(iter.iter->elem) & mask;
483  size_t offset = iter.iter - (data.begin() + target);
484  DASSERT_TRUE(offset < 31);
485  --numel;
486  iter.iter->hasdata = false;
487  iter.iter->elem = value_type();
488  data[target].field &= ~((uint32_t)1 << offset);
489  return true;
490  }
491 
492  /// Erases a entry matching a given value.
493  bool erase(const value_type& key) {
494  return erase(find(key));
495  }
496 
497  /// Returns an iterator to the start of the table
499  // find the first which is not empty
500  typename std::vector<element>::iterator iter = data.begin();
501  while (iter != data.end() && !iter->hasdata) {
502  ++iter;
503  }
504  return iterator(this, iter);
505  }
506 
507  /// Returns an iterator to the start of the table
509  // find the first which is not empty
510  typename std::vector<element>::iterator iter = data.begin();
511  while (iter != data.end() && !iter->hasdata) {
512  ++iter;
513  }
514  return const_iterator(this, iter);
515  }
516 
517  /// Returns an iterator to the end of the table
519  return iterator(this, data.end());
520  }
521 
522  /// Returns an iterator to the end of the table
523  const_iterator end() const {
524  return const_iterator(this, data.end());
525  }
526 
527  /// Returns 1 if the table contains a given element. 0 otherwise.
528  size_t count(const value_type& v) const {
529  return find(v) != end();
530  }
531 
532  /// Returns true if the table contains a given element. false otherwise.
533  bool contains(const value_type& v) const {
534  return find(v) != end();
535  }
536 
537  /// Returns the number of elements in the table
538  size_t size() const {
539  return numel;
540  }
541 
542  /// Returns the capacity of the table
543  size_t capacity() const {
544  return data.size();
545  }
546 
547  float load_factor() const {
548  return float(size()) / capacity();
549  }
550 
551  // now for the safe accessors
552 
553  hopscotch_table& operator=(const hopscotch_table& other) {
554  data = other.data;
555  hashfun = other.hashfun;
556  equalfun = other.equalfun;
557  numel = other.numel;
558  mask = other.mask;
559  return *this;
560  }
561 
562 
563  /** Inserts an element into the hash table. Safe under parallel access.
564  * if t already exists, it will be overwritten
565  */
566  bool put(const T& t) {
567  // since data is not resizeable,
568  // data.end() is always valid.
569  return insert_impl(t).iter != data.end();
570  }
571 
572 
573  /** Inserts an element into the hash table. Safe under parallel access.
574  * if t already exists, nothing will happen
575  */
576  bool put_do_not_overwrite(const T& t) {
577  // since data is not resizeable,
578  // data.end() is always valid.
579  return insert_impl(t, false).iter != data.end();
580  }
581 
582 
583  /** If the argument is found in the hash table,
584  * return {true, V} where V is the hash table content matching the argument.
585  * Otherwise {false, T()} is returned.
586  * KeyEqual() is used to compare entries.
587  * Safe under parallel access.
588  */
589  std::pair<bool, T> get(const T& t) const {
590  // fast path. Try to get it without locking
591  const_iterator iter = find(t);
592  if (iter != end()) {
593  // take a snapshot of the data
594  element e = *(iter.iter);
595  if (e.hasdata && key_eq()(e.elem, t)) {
596  return std::make_pair(true, e.elem);
597  }
598  }
599 
600  return std::make_pair(false, T());
601  }
602 };
603 
604 } // turicreate
605 #endif
size_t capacity() const
Returns the capacity of the table.
KeyEqual equality_function
The type of the equality tester.
Hash hasher
The type of the hasher object.
hasher hash_function() const
Returns the hash function used by the hash table.
value_type * pointer
A pointer to the data type stored in the table.
iterator end()
Returns an iterator to the end of the table.
iterator insert_do_not_overwrite(const value_type &newdata)
size_t count(const value_type &v) const
Returns 1 if the table contains a given element. 0 otherwise.
const_iterator find(const value_type &key) const
iterator begin()
Returns an iterator to the start of the table.
bool contains(const value_type &v) const
Returns true if the table contains a given element. false otherwise.
bool erase(const value_type &key)
Erases a entry matching a given value.
bool erase(iterator iter)
bool put_do_not_overwrite(const T &t)
equality_function key_eq() const
Returns the equality function used by the hash table.
iterator find(const value_type &key)
const value_type * const_pointer
A constant pointer to the data type stored in the table.
const_iterator begin() const
Returns an iterator to the start of the table.
hopscotch_table(size_t len, Hash hashfun=Hash(), KeyEqual equalfun=KeyEqual())
iterator insert(const value_type &newdata)
size_t size() const
Returns the number of elements in the table.
T value_type
The data type stored in the table.
value_type & reference
A reference to the data type stored in the table.
#define DASSERT_TRUE(cond)
Definition: assertions.hpp:364
const value_type & const_reference
A constant reference to the data type stored in the table.
const_iterator end() const
Returns an iterator to the end of the table.