6 #ifndef TURI_SKETCHES_SPACE_SAVING_SKETCH_HPP 7 #define TURI_SKETCHES_SPACE_SAVING_SKETCH_HPP 13 #include <core/generics/value_container_mapper.hpp> 52 init_data_structures();
66 m_max_capacity = size_t(std::ceil(1.0 / epsilon) + 1);
76 base_level_candidate_count = 0;
77 cached_insertion_key = hashkey();
78 cached_insertion_value =
nullptr;
85 void add(
const T& t,
size_t count = 1) {
86 add_impl(t, count, 0);
101 std::vector<std::pair<T, size_t> > ret;
103 if(n_entries < entries.size()) {
105 ret.resize(n_entries);
106 for(
size_t i = 0; i < n_entries; ++i) {
107 const entry& e = entries[i];
108 ret[i] = {e.value(), e.count};
112 size_t threshhold = std::max(
size_t(1),
size_t(m_epsilon *
size()));
114 for(
size_t i = 0; i < n_entries; ++i) {
115 const entry& e = entries[i];
117 if (e.count >= threshhold)
118 ret.push_back(std::make_pair(e.value(), e.count));
132 std::vector<std::pair<T, size_t> > ret;
134 if(n_entries < entries.size()) {
136 ret.resize(n_entries);
137 for(
size_t i = 0; i < n_entries; ++i) {
138 const entry& e = entries[i];
139 ret[i] = {e.value(), e.count};
142 size_t threshhold = std::max(
size_t(1),
size_t(m_epsilon *
size()));
144 for(
size_t i = 0; i < n_entries; ++i) {
145 const entry& e = entries[i];
147 if (e.count - e.error >= threshhold)
148 ret.push_back(std::make_pair(e.value(), e.count));
// combine(): merges another space_saving sketch into this one.
// SFINAE guard -- this overload participates only when the other
// sketch's element type U is convertible to T.
// NOTE(review): garbled extraction; the parameter list and body of
// combine() continue past this fragment in the full file.
158 template <
typename U>
159 typename std::enable_if<std::is_convertible<U, T>::value,
void>::type
175 size_t m_max_capacity = 0;
176 double m_epsilon = 1;
181 typedef typename global_map_type::hashkey hashkey;
193 entry(hashkey_and_value&& _kv)
197 hashkey_and_value kv;
199 inline const hashkey_and_value& get_hashkey_and_value()
const {
return kv; }
201 inline const T& value()
const {
return kv.value(); }
215 size_t n_entries = 0;
216 std::vector<entry> entries;
219 size_t base_level = 1;
221 std::vector<entry*> base_level_candidates;
222 size_t base_level_candidate_count = 0;
224 global_map_type global_map;
227 hashkey cached_insertion_key;
228 entry* cached_insertion_value;
232 std::vector<size_t> index_buffer;
237 void init_data_structures() {
238 entries.resize(m_max_capacity);
241 index_buffer.resize(m_max_capacity);
244 base_level_candidates.resize(m_max_capacity);
245 base_level_candidate_count = 0;
250 global_map.
reserve(5 * m_max_capacity / 4);
252 cached_insertion_value =
nullptr;
257 void regenerate_base_level() GL_HOT_NOINLINE_FLATTEN {
259 _debug_check_level_integrity();
261 DASSERT_EQ(base_level_candidate_count, 0);
266 size_t min_value = std::numeric_limits<size_t>::max();
268 for(
size_t i = 0; i < m_max_capacity; ++i) {
270 size_t count = entries[i].count;
272 if(count < min_value) {
273 base_level_candidate_count = 0;
275 base_level_candidates[base_level_candidate_count++] = &entries[i];
276 }
else if(count == min_value) {
277 base_level_candidates[base_level_candidate_count++] = &entries[i];
284 base_level = min_value;
286 _debug_check_level_integrity();
291 entry* increment_element(
294 _debug_check_level_integrity();
296 element->count += count_incr;
297 element->error += error_incr;
303 entry* insert_element(
const hashkey& key,
const T& t,
304 size_t count,
size_t error) GL_HOT_NOINLINE_FLATTEN {
306 DASSERT_LT(n_entries, m_max_capacity);
308 size_t dest_index = n_entries;
311 entries[dest_index] = entry(hashkey_and_value(key, t));
313 entries[dest_index].count = count;
314 entries[dest_index].error = error;
316 if(count == base_level)
317 base_level_candidates[base_level_candidate_count++] = &(entries[dest_index]);
319 global_map.
insert(entries[dest_index].get_hashkey_and_value(), &(entries[dest_index]));
322 _debug_check_level_integrity();
324 return &(entries[dest_index]);
329 entry* change_and_increment_value(
330 const hashkey& key,
const T& t,
size_t count,
size_t error) GL_HOT_NOINLINE_FLATTEN {
332 if(base_level_candidate_count > 0
333 && base_level_candidates.back()->count != base_level) {
335 while(base_level_candidate_count > 0
336 && base_level_candidates[base_level_candidate_count - 1]->count != base_level) {
337 --base_level_candidate_count;
341 if(UNLIKELY(base_level_candidate_count == 0)) {
342 regenerate_base_level();
346 DASSERT_GT(base_level_candidate_count, 0);
348 size_t idx = base_level_candidate_count - 1;
350 entry* e = base_level_candidates[idx];
352 DASSERT_EQ(e->count, base_level);
354 global_map.
invalidate(e->get_hashkey_and_value(), e);
356 e->error = e->count + error;
358 e->kv = hashkey_and_value(key, t);
360 global_map.
insert(key, e);
362 --base_level_candidate_count;
364 _debug_check_level_integrity();
371 void add_impl(
const T& t,
size_t count = 1,
size_t error = 0) GL_HOT {
372 _debug_check_level_integrity();
380 if(cached_insertion_value !=
nullptr 381 && cached_insertion_key == key
382 && (hashkey::key_is_exact()
383 || (cached_insertion_value->value() == t) ) ) {
385 increment_element(cached_insertion_value, count, error);
389 cached_insertion_key = key;
390 entry *ee_ptr = global_map.
find(key, t);
392 if(ee_ptr !=
nullptr) {
393 increment_element(ee_ptr, count, error);
394 cached_insertion_value = ee_ptr;
398 if(n_entries == m_max_capacity) {
399 cached_insertion_value = change_and_increment_value(key, t, count, error);
401 cached_insertion_value = insert_element(key, t, count, error);
407 if(cached_insertion_value !=
nullptr) {
408 const entry* e = global_map.
find(cached_insertion_key,
409 cached_insertion_value->value());
418 template <
typename OtherEntry>
419 inline entry* _find_eptr(
const OtherEntry& e,
420 typename std::enable_if<std::is_same<OtherEntry, entry>::value >::type* = 0) {
421 return global_map.
find(e.get_hashkey_and_value());
424 template <
typename OtherEntry>
425 inline entry* _find_eptr(
const OtherEntry& e,
426 typename std::enable_if<!std::is_same<OtherEntry, entry>::value >::type* = 0) {
427 return global_map.
find(hashkey(e.value()), T(e.value()));
433 template <
typename OtherEntry>
434 inline const entry& _cast_entry(
const OtherEntry& e,
435 typename std::enable_if<std::is_same<OtherEntry, entry>::value >::type* = 0) {
439 template <
typename OtherEntry>
440 inline entry _cast_entry(
const OtherEntry& e,
441 typename std::enable_if<!std::is_same<OtherEntry, entry>::value >::type* = 0) {
445 ret.kv = hashkey_and_value(T(e.value()));
// Merge routine: folds `other` (a sketch over a convertible element
// type U) into this sketch.  NOTE(review): this body is an
// extraction-garbled fragment -- the enclosing signature and several
// statements (early-return body, level-selection branch, write-position
// increment, map clearing) were dropped; comments below describe the
// visible logic only.
451 template <
typename U>
452 GL_HOT_NOINLINE_FLATTEN
// Debug-only consistency checks on both sketches before merging.
460 _debug_check_level_integrity(
true);
461 other._debug_check_level_integrity(
true);
// Nothing to merge from an empty sketch.
463 if(other.m_size == 0)
// Histogram: count level -> number of entries at that level, taken over
// both sketches (entries present in both are merged in place first).
468 std::map<size_t, size_t> part_sizes;
473 for(
size_t i = 0; i < other.n_entries; ++i) {
475 entry* e_ptr = _find_eptr(other.entries[i]);
477 if(e_ptr !=
nullptr) {
// Item already tracked here: accumulate count and error in place.
478 e_ptr->count += other.entries[i].count;
479 e_ptr->error += other.entries[i].error;
481 ++part_sizes[other.entries[i].count];
485 for(
size_t i = 0; i < n_entries; ++i) {
486 ++part_sizes[entries[i].count];
// Walk levels from highest to lowest to decide which levels fit in the
// m_max_capacity slots; the first level that no longer fits becomes the
// new base level (branch body dropped by the extraction).
492 size_t current_position = m_max_capacity;
494 for(
auto it = part_sizes.rbegin(); it != part_sizes.rend(); ++it) {
496 size_t level = it->first;
497 size_t part_size = it->second;
499 if(part_size >= current_position) {
503 current_position -= part_size;
// Everything fit: fall back to the lowest populated level.
507 if(base_level == 0) {
509 base_level = part_sizes.begin()->first;
510 current_position += part_sizes.begin()->second;
// Only `current_position` entries at the base level can be kept.
513 size_t num_base_entries_left = current_position;
514 size_t write_position = 0;
517 std::vector<entry> alt_entries(m_max_capacity);
// Copies e into alt_entries unless it is a surplus base-level entry.
519 auto write_entry = [&](
const entry& e) {
521 size_t lvl = e.count;
526 if(lvl == base_level) {
527 if(num_base_entries_left == 0)
529 --num_base_entries_left;
532 DASSERT_LT(write_position, m_max_capacity);
533 alt_entries[write_position] = e;
// Our (already-merged) entries first, then other's unmatched entries.
537 for(
size_t i = 0; i < n_entries; ++i)
538 write_entry(_cast_entry(entries[i]));
540 for(
size_t i = 0; i < other.n_entries; ++i) {
541 if(_find_eptr(other.entries[i]) ==
nullptr)
542 write_entry(_cast_entry(other.entries[i]));
// Swap in the merged slot array and rebuild the lookup map.
547 n_entries = write_position;
548 m_size += other.m_size;
550 entries = std::move(alt_entries);
555 for(
size_t i = 0; i < n_entries; ++i) {
556 global_map.
insert(entries[i].get_hashkey_and_value(), &(entries[i]));
// Invalidate the insertion cache and the candidate stack: both hold
// pointers into the old slot array.
559 cached_insertion_value =
nullptr;
560 base_level_candidate_count = 0;
// Post-merge consistency checks.
562 _debug_check_level_integrity(
true);
563 other._debug_check_level_integrity(
true);
// Debug-only invariant checks; compiled in only when
// ENABLE_SKETCH_CONSISTENCY_CHECKS is defined.  NOTE(review): garbled
// extraction fragment -- the function signature and closing braces were
// dropped.
571 #ifdef ENABLE_SKETCH_CONSISTENCY_CHECKS 575 DASSERT_LE(n_entries, m_size);
// Snapshot of the live base-level candidates for membership tests below.
580 std::set<const entry*> bl_set(base_level_candidates.begin(),
581 base_level_candidates.begin() + base_level_candidate_count);
585 for(
size_t i = 0; i < n_entries; ++i) {
// Every live entry must sit at or above the base level.
586 ASSERT_GE(entries[i].count, base_level);
// Entries exactly at the base level must be known candidates (when the
// candidate stack is non-empty).
590 if(entries[i].count == base_level && base_level_candidate_count != 0)
591 ASSERT_TRUE(bl_set.find(&entries[i]) != bl_set.end());
// The lookup map must point back at each slot.
593 const entry* e = global_map.
find(entries[i].get_hashkey_and_value());
596 ASSERT_EQ(e, &(entries[i]));
// --- Declaration index (auto-generated documentation summary; kept as
// --- comments so it no longer reads as code) --------------------------
// std::vector<std::pair<T, size_t>> guaranteed_frequent_items() const
// std::enable_if<std::is_convertible<U, T>::value, void>::type combine(const space_saving<U>& other)
// ValueContainer* find(const hashkey_and_value& hv) GL_HOT_INLINE_FLATTEN
// void initialize(double epsilon = 0.0001)
// #define GL_HOT_INLINE_FLATTEN
// #define ASSERT_TRUE(cond)
// std::vector<std::pair<T, size_t>> frequent_items() const
// void insert(const hashkey_and_value& hv, ValueContainer* v_ptr) GL_HOT_INLINE_FLATTEN
// space_saving(double epsilon = 0.0001)
// #define DASSERT_TRUE(cond)
// void invalidate(const hashkey_and_value& hv, ValueContainer* v_ptr) GL_HOT_INLINE_FLATTEN
// void add(const T& t, size_t count = 1)