Turi Create  4.0
histogram.hpp
1 /* Copyright © 2017 Apple Inc. All rights reserved.
2  *
3  * Use of this source code is governed by a BSD-3-clause license that can
4  * be found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause
5  */
6 #ifndef __TC_HISTOGRAM
7 #define __TC_HISTOGRAM
8 
9 #include <core/storage/sframe_data/groupby_aggregate_operators.hpp>
10 #include <visualization/server/batch_size.hpp>
11 #include <visualization/server/escape.hpp>
12 #include <visualization/server/histogram.hpp>
13 #include <visualization/server/plot.hpp>
14 #include <visualization/server/vega_spec.hpp>
15 #include <core/data/sframe/gl_sarray.hpp>
16 
17 #include "transformation.hpp"
18 
19 #include <sstream>
20 
21 namespace turi {
22 namespace visualization {
23 
24 /*
25  * histogram_bins()
26  * Represents bin values (typically rescaled from original bin contents)
27  * along with an effective range (min of first bin, last of max)
28  */
29 template<typename T>
30 struct histogram_bins {
31  flex_list bins;
32  T max;
33  T min;
34 };
35 
36 /*
37  * bin_specification_object
38  * Represents bin widths (typically rescaled from original bin contents) for
39  * compatibility with https://vega.github.io/vega/docs/scales/#bins
40  */
41 template<typename T>
42 struct bin_specification_object {
43  T start, stop, step;
44  bin_specification_object(T start, T stop, T step)
45  : start(start), stop(stop), step(step) {}
46  void serialize(std::stringstream& ss) {
47  ss << "{\"start\":" << start;
48  ss << ", \"stop\":" << stop;
49  ss << ", \"step\":" << step << "}";
50  }
51 };
52 
53 template<typename T>
54 static T get_value_at_bin(
55  T bin_idx,
56  T scale_min,
57  T scale_max,
58  T num_bins
59  ) {
60  double ret = (
61  ((double)bin_idx / (double)num_bins) *
62  (double)(scale_max - scale_min)
63  ) + scale_min;
64  if (std::is_same<T, flex_int>::value) {
65  DASSERT_EQ(ret, std::floor(ret));
66  }
67  return ret;
68 }
69 
70 /*
71  * histogram_result()
72  *
73  * Stores the intermediate or complete result of a histogram stream.
74  *
75  * Attributes
76  * ----------
77  * bins : flex_list<flex_int>
78  * The counts in each bin, in the range scale_min to scale_max.
79  * min : flexible_type
80  * The lowest value encountered in the source SArray.
81  * max : flexible_type
82  * The highest value encountered in the source SArray.
83  * scale_min : T
84  * The low end of the range represented by the bins.
85  * scale_max : T
86  * The high end of the range represented by the bins.
87  *
88  * Methods
89  * -------
90  * init(flexible_type value1, flexible_type value2)
91  * Initialize the result with the range represented by two values.
92  * init(flexible_type value1, flexible_type value2, T scale1, T scale2)
93  * Initialize the result with the range represented by two values and
94  * the scale range (should be >= value range) represented by two scale
95  * values.
96  * rescale(T new_min, T new_max)
97  * Rescale the result to the range represented by the provided values.
98  */
99 template<typename T>
100 struct histogram_result : public sframe_transformation_output {
101  public:
102  // also store and compute basic summary stats
103  groupby_operators::count m_count; // num rows
104  groupby_operators::count_distinct m_count_distinct; // num unique
105  groupby_operators::non_null_count m_non_null_count; // (inverse) num missing
106  groupby_operators::average m_average; // mean
107  groupby_operators::min m_min; // min
108  groupby_operators::max m_max; // max
109  groupby_operators::quantile m_median; // median (quantile at 0.5)
110  groupby_operators::stdv m_stdv; // stdev
111 
112  histogram_result() {
113  // initialize quantile (so we can use it as median)
114  m_median.init(std::vector<double>({0.5}));
115  }
116  flex_type_enum m_type;
117  const static size_t VISIBLE_BINS = 20;
118  const static size_t REAL_BINS = 1000;
119  std::array<flex_int, REAL_BINS> bins;
120  T min;
121  T max;
122  T scale_min;
123  T scale_max;
124  static size_t get_bin_idx(
125  T value,
126  T scale_min,
127  T scale_max
128  ) {
129  T range = scale_max - scale_min;
130  size_t bin = std::floor(
131  (((double)value - (double)scale_min) / (double)range) *
132  (double)histogram_result<T>::REAL_BINS
133  );
134  if (bin == histogram_result<T>::REAL_BINS) {
135  bin -= 1;
136  }
137  DASSERT_LT(bin, histogram_result<T>::REAL_BINS);
138  return bin;
139  }
140  void rescale(T new_min, T new_max) {
141  if (std::is_same<T, flex_int>::value) {
142  static_assert(REAL_BINS % 2 == 0, "Streaming int histogram expects REAL_BINS cleanly divisible by 2");
143  }
144  // collapse bins towards the center to expand range by 2x
145  while (new_min < scale_min || new_max > scale_max) {
146  // first, combine bins next to each other (every other bin)
147  for (ssize_t i=(REAL_BINS / 2) - 1; i>0; i-=2) {
148  bins[i] += bins[i-1];
149  }
150  for (size_t i=(REAL_BINS / 2); i<REAL_BINS; i+=2) {
151  bins[i] += bins[i+1];
152  }
153  // then, collapse them inward towards the center
154  for (size_t i=0; i<(REAL_BINS/4); i++) {
155  bins[(REAL_BINS/2) + i] = bins[(REAL_BINS/2) + (2 * i)];
156  bins[(REAL_BINS/2) - (i + 1)] = bins[(REAL_BINS/2) - ((2 * i) + 1)];
157  }
158  // finally, zero out the newly-unused bins
159  if (std::is_same<T, flex_int>::value) {
160  static_assert((REAL_BINS * 3) % 4 == 0, "Streaming int histogram expects (REAL_BINS*3) cleanly divisible by 4");
161  }
162  for (size_t i=((REAL_BINS * 3) / 4); i<REAL_BINS; i++) {
163  bins[i] = 0;
164  }
165  if (std::is_same<T, flex_int>::value) {
166  static_assert(REAL_BINS % 4 == 0, "Streaming int histogram expects REAL_BINS cleanly divisible by 4");
167  }
168  for (size_t i=0; i<(REAL_BINS/4); i++) {
169  bins[i] = 0;
170  }
171 
172  // bump up scale by 2x
173  T range = scale_max - scale_min;
174  if (std::is_same<T, flex_int>::value) {
175  DASSERT_EQ((flex_int)range % 2, 0);
176  }
177  scale_max += (range / 2);
178  scale_min -= (range / 2);
179  }
180  }
181  void init(flex_type_enum dtype, flexible_type value1, flexible_type value2) {
182  // use 2 initial values for both min/max and bin scale value1/value2
183  this->init(dtype, value1, value2, value1, value2);
184  }
185  void init(
186  flex_type_enum dtype, T value1, T value2, T scale1, T scale2
187  ) {
188  // initialize min/max to use dtype (for some reason it defaults to int, then crashes on float)
189  this->m_type = dtype;
190  this->m_min.set_input_type(dtype);
191  this->m_max.set_input_type(dtype);
192 
193  // initialize bins to 0
194  for (size_t i=0; i<this->bins.size(); i++) {
195  this->bins[i] = 0;
196  }
197 
198  T epsilon;
199  if (std::is_same<T, flex_int>::value) {
200  epsilon = 1;
201  } else {
202  epsilon = 1e-2;
203  }
204 
205  this->min = std::min(value1, value2);
206  this->max = std::max(value1, value2);
207  this->scale_min = std::min(scale1, scale2);
208  this->scale_max = std::max(scale1, scale2);
209  if (this->scale_max == this->scale_min) {
210  // make sure they are not the same value
211  if(this->scale_max>0){
212  this->scale_max *= (1.0+epsilon);
213  }else if(this->scale_max<0){
214  this->scale_max *= (1.0-epsilon);
215  }else{ // scale_max == 0
216  this->scale_max += epsilon;
217  }
218  }
219  if (std::is_same<T, flex_int>::value) {
220  // For int histogram, make sure our scale range is evenly divisible by 2 to start with
221  if ((flex_int)(this->scale_max - this->scale_min) % 2 != 0) {
222  this->scale_max += 1;
223  DASSERT_EQ((flex_int)(this->scale_max - this->scale_min) % 2, 0);
224  }
225 
226  // We also need to make sure we can evenly divide the scale range into REAL_BINS,
227  // so that we don't end up with non-integer bin boundaries
228  flex_int pad = (flex_int)(this->scale_max - this->scale_min) % REAL_BINS;
229  if (pad > 0) {
230  pad = REAL_BINS - pad;
231  flex_int pad_left = pad / 2;
232  flex_int pad_right = (pad / 2) + (pad % 2);
233  this->scale_min -= pad_left;
234  this->scale_max += pad_right;
235  }
236  DASSERT_EQ((flex_int)(this->scale_max - this->scale_min) % REAL_BINS, 0);
237  }
238  }
239  histogram_bins<T> get_bins(flex_int num_bins) const {
240  if (num_bins < 1) {
241  log_and_throw("num_bins must be positive.");
242  }
243  histogram_bins<T> ret;
244 
245  // determine the bin range that covers min to max
246  size_t first_bin = get_bin_idx(min, scale_min, scale_max);
247  size_t last_bin = get_bin_idx(max, scale_min, scale_max);
248  size_t effective_bins = (last_bin - first_bin) + 1;
249 
250  // Might end up with fewer effective bins due to very small
251  // number of unique values. For now, comment out this assert.
252  // TODO -- what should we assert here instead, to make sure we have enough
253  // effective range for the desired number of bins? Or should we force
254  // discrete histogram for very low cardinality? (At which point, we keep
255  // this assertion).
256  //DASSERT_GE(effective_bins, (REAL_BINS/4));
257 
258  if (static_cast<size_t>(num_bins) > (REAL_BINS/4)) {
259  log_and_throw("num_bins must be less than or equal to the effective number of bins available.");
260  }
261 
262  // rescale to desired bins, taking more than the effective range if
263  // necessary in order to get to num_bins total without resampling
264  size_t bins_per_bin = effective_bins / num_bins;
265  size_t overflow = effective_bins % num_bins;
266  size_t before = 0;
267  size_t after = 0;
268  if (overflow) {
269  overflow = num_bins - overflow;
270  bins_per_bin = (effective_bins + overflow) / num_bins;
271  before = overflow / 2;
272  after = (overflow / 2) + (overflow % 2);
273  }
274 
275  ret.bins = flex_list(num_bins, 0); // initialize empty
276  ret.min = get_value_at_bin<T>(std::max<ssize_t>(0, first_bin - before), scale_min, scale_max, REAL_BINS);
277  ret.max = get_value_at_bin<T>(std::min<ssize_t>(last_bin + after + 1, REAL_BINS), scale_min, scale_max, REAL_BINS);
278  for (size_t i=0; i<static_cast<size_t>(num_bins); i++) {
279  for (size_t j=0; j<bins_per_bin; j++) {
280  ssize_t idx = (i * bins_per_bin) + j + (first_bin - before);
281  if (idx < 0 || static_cast<size_t>(idx) >= REAL_BINS) {
282  // don't try to get values below 0, or past REAL_BINS, that would be silly
283  continue;
284  }
285  ret.bins[i] += this->bins[idx];
286  }
287  }
288  return ret;
289  }
290  flexible_type get_min_value() {
291  return this->min;
292  }
293  flexible_type get_max_value() const {
294  return this->max;
295  }
296  void add_element_simple(const flexible_type& value) {
297  /*
298  * add element to summary stats
299  */
300  m_count.add_element_simple(value);
301  m_count_distinct.add_element_simple(value);
302  m_non_null_count.add_element_simple(value);
303  m_average.add_element_simple(value);
304  m_min.add_element_simple(value);
305  m_max.add_element_simple(value);
306  m_median.add_element_simple(value);
307  m_stdv.add_element_simple(value);
308 
309  /*
310  * add element to histogram
311  */
312 
313 
314  if (value.get_type() == flex_type_enum::UNDEFINED) {
315  return;
316  }
317 
318  // ignore nan values
319  // ignore inf values
320  if (value.get_type() == flex_type_enum::FLOAT &&
321  !std::isfinite(value.get<flex_float>())) {
322  return;
323  }
324 
325  // assign min/max to return value
326  if (value < this->min) { this->min = value; }
327  if (value > this->max) { this->max = value; }
328 
329  // resize bins if needed
330  this->rescale(this->min, this->max);
331 
332  // update count in bin
333  size_t bin = get_bin_idx(value, this->scale_min, this->scale_max);
334  this->bins[bin] += 1;
335  }
336  virtual std::string vega_column_data(bool) const override {
337  auto bins = get_bins(VISIBLE_BINS);
338  T binWidth = (bins.max - bins.min)/VISIBLE_BINS;
339  bin_specification_object<T> binSpec(bins.min, bins.max, binWidth);
340 
341  std::stringstream ss;
342 
343  for (size_t i=0; i<bins.bins.size(); i++) {
344  if (i != 0) {
345  ss << ",";
346  }
347  const auto& value = bins.bins[i];
348  ss << "{\"left\": ";
349  ss << static_cast<T>(bins.min + (i * binWidth));
350  ss << ",\"right\": ";
351  ss << static_cast<T>(bins.min + ((i+1) * binWidth));
352  ss << ", \"count\": ";
353  ss << value;
354  ss << "}";
355  }
356 
357  // if there are null values, include them separately
358  size_t null_count = m_count.emit() - m_non_null_count.emit();
359  if (null_count > 0) {
360  ss << ",";
361  ss << "{\"missing\": true";
362  ss << ", \"count\": ";
363  ss << null_count;
364  ss << "}";
365  }
366 
367  // include metadata about bin ranges
368  ss << ",";
369  binSpec.serialize(ss);
370 
371  return ss.str();
372  }
373  virtual std::string vega_summary_data() const override {
374  std::stringstream ss;
375 
376  flex_vec median_vec = m_median.emit().template get<flex_vec>();
377  flex_float median = median_vec[0];
378  flex_int num_missing = m_count.emit() - m_non_null_count.emit();
379  std::string data = vega_column_data(true);
380  std::string typeName = flex_type_enum_to_name(m_type);
381 
382  ss << "\"type\": \"" << typeName << "\",";
383  ss << "\"num_unique\": " << m_count_distinct.emit() << ",";
384  ss << "\"num_missing\": " << num_missing << ",";
385  ss << "\"mean\": " << escape_float(m_average.emit()) << ",";
386  ss << "\"min\": " << escape_float(m_min.emit()) << ",";
387  ss << "\"max\": " << escape_float(m_max.emit()) << ",";
388  ss << "\"median\": " << escape_float(median) << ",";
389  ss << "\"stdev\": " << escape_float(m_stdv.emit()) << ",";
390  ss << "\"numeric\": [" << data << "],";
391  ss << "\"categorical\": []";
392 
393  return ss.str();
394 
395  }
396 
397 };
398 
399 /*
400  * histogram()
401  *
402  * Implements Optimal Streaming Histogram (sort-of) as described in
403  * http://blog.amplitude.com/2014/08/06/optimal-streaming-histograms/.
404  * dtype of sarray can be flex_int or flex_float
405  * histogram always gives bins as flex_ints (bin counts are positive integers).
406  *
407  * Attributes
408  * ----------
409  *
410  * Methods
411  * ----------
412  * init(const gl_sarray& source)
413  * Initialize the histogram with an SArray as input.
414  * eof()
415  * Returns true if the streaming histogram has covered all input, false
416  * otherwise.
417  * get()
418  * Bins (up to) the next BATCH_SIZE values from the input, and returns
419  * the histogram (result type, as shown below) representing the current
420  * distribution of values seen so far.
421  */
422 template<typename T>
423 class histogram : public transformation<gl_sarray, histogram_result<T>> {
424  public:
425  virtual std::vector<histogram_result<T>> split_input(size_t num_threads) override {
426  flexible_type current_min = this->m_transformer->min;
427  flexible_type current_max = this->m_transformer->max;
428  T current_scale_min = this->m_transformer->scale_min;
429  T current_scale_max = this->m_transformer->scale_max;
430  std::vector<histogram_result<T>> thread_results(num_threads);
431  for (auto& thread_result : thread_results) {
432  thread_result.init(this->m_source.dtype(), current_min, current_max, current_scale_min, current_scale_max);
433  }
434  return thread_results;
435  }
436  virtual void merge_results(std::vector<histogram_result<T>>& thread_results) override {
437  for (auto& thread_result : thread_results) {
438  // combine summary stats
439  this->m_transformer->m_count.combine(thread_result.m_count);
440  this->m_transformer->m_count_distinct.combine(thread_result.m_count_distinct);
441  this->m_transformer->m_non_null_count.combine(thread_result.m_non_null_count);
442  this->m_transformer->m_average.combine(thread_result.m_average);
443  this->m_transformer->m_min.combine(thread_result.m_min);
444  this->m_transformer->m_max.combine(thread_result.m_max);
445  this->m_transformer->m_stdv.combine(thread_result.m_stdv);
446 
447  // need to partial_finalize the quantile sketch before combining
448  thread_result.m_median.partial_finalize();
449  this->m_transformer->m_median.combine(thread_result.m_median);
450 
451  // combine histogram
452  flexible_type combined_min = std::min(this->m_transformer->min, thread_result.min);
453  flexible_type combined_max = std::max(this->m_transformer->max, thread_result.max);
454  this->m_transformer->min = combined_min;
455  this->m_transformer->max = combined_max;
456  this->m_transformer->rescale(combined_min, combined_max);
457  thread_result.rescale(combined_min, combined_max);
458  DASSERT_EQ(this->m_transformer->scale_min, thread_result.scale_min);
459  DASSERT_EQ(this->m_transformer->scale_max, thread_result.scale_max);
460  for (size_t i=0; i<histogram_result<T>::REAL_BINS; i++) {
461  this->m_transformer->bins[i] += thread_result.bins[i];
462  }
463  }
464  }
465  virtual void init(const gl_sarray& source, size_t batch_size) override {
466  transformation<gl_sarray, histogram_result<T>>::init(source, batch_size);
467  flex_type_enum dtype = this->m_source.dtype();
468  if (dtype != flex_type_enum::INTEGER &&
469  dtype != flex_type_enum::FLOAT) {
470  log_and_throw("dtype of the provided SArray is not valid for histogram. Only int and float are valid dtypes.");
471  }
472 
473  size_t input_size = this->m_source.size();
474  if (input_size >= 2 &&
475  this->m_source[0].get_type() != flex_type_enum::UNDEFINED &&
476  this->m_source[1].get_type() != flex_type_enum::UNDEFINED &&
477  std::isfinite(this->m_source[0].template to<flex_float>()) &&
478  std::isfinite(this->m_source[1].template to<flex_float>())) {
479  // start with a sane range for the bins (somewhere near the data)
480  // (it can be exceptionally small, since the doubling used in resize()
481  // will make it converge to the real range quickly)
482  this->m_transformer->init(dtype, this->m_source[0], this->m_source[1]);
483  } else if (input_size == 1 &&
484  this->m_source[0].get_type() != flex_type_enum::UNDEFINED &&
485  std::isfinite(this->m_source[0].template to<flex_float>())) {
486  // one value, not so interesting
487  this->m_transformer->init(dtype, this->m_source[0], this->m_source[0]);
488  } else {
489  // no data
490  this->m_transformer->init(dtype, 0.0, 0.0);
491  }
492  }
493 };
494 
495 std::shared_ptr<Plot> plot_histogram(
496  const gl_sarray& sa, const flexible_type& xlabel, const flexible_type& ylabel,
497  const flexible_type& title);
498 
499 }} // turi::visualization
500 
501 #endif // __TC_HISTOGRAM
std::vector< double > flex_vec
const char * flex_type_enum_to_name(flex_type_enum en)
std::vector< flexible_type > flex_list