9 #include <core/storage/sframe_data/groupby_aggregate_operators.hpp> 10 #include <visualization/server/batch_size.hpp> 11 #include <visualization/server/escape.hpp> 12 #include <visualization/server/histogram.hpp> 13 #include <visualization/server/plot.hpp> 14 #include <visualization/server/vega_spec.hpp> 15 #include <core/data/sframe/gl_sarray.hpp> 17 #include "transformation.hpp" 22 namespace visualization {
// Container for a down-sampled histogram. Elsewhere in this file its `bins`,
// `min` and `max` members are filled in by histogram_result::get_bins and read
// back by histogram_result::vega_column_data.
// NOTE(review): the member declarations themselves are not visible in this
// excerpt — confirm their types against the full header.
30 struct histogram_bins {
// Holds a (start, stop, step) triple describing a binning and knows how to
// write it to a stream as a brace-delimited key/value object.
42 struct bin_specification_object {
// Stores the three values as given; the constructor body is empty, so no
// validation or normalisation happens here.
44 bin_specification_object(T start, T stop, T step)
45 : start(start), stop(stop), step(step) {}
// Appends {"start":<start>, "stop":<stop>, "step":<step>} to `ss`, formatting
// each value with operator<< for T.
46 void serialize(std::stringstream& ss) {
47 ss <<
"{\"start\":" << start;
48 ss <<
", \"stop\":" << stop;
49 ss <<
", \"step\":" << step <<
"}";
// Converts a bin index into a value of type T on the histogram's scale.
// Call sites in this file pass (bin index, scale_min, scale_max, number of bins).
// NOTE(review): the parameter list, the start of the expression and the return
// statement are not visible in this excerpt.
54 static T get_value_at_bin(
// Visible part of the computation: the fraction bin_idx / num_bins multiplied
// by the scale width, all carried out in double precision.
// NOTE(review): presumably scale_min is added as an offset on a line not shown
// here — confirm.
61 ((double)bin_idx / (
double)num_bins) *
62 (
double)(scale_max - scale_min)
// For integer histograms the computed value is checked (via DASSERT_EQ) to be
// a whole number.
64 if (std::is_same<T, flex_int>::value) {
65 DASSERT_EQ(ret, std::floor(ret));
// Streaming histogram state for values of type T, exposed as an
// sframe_transformation_output.
100 struct histogram_result :
public sframe_transformation_output {
// Summary-statistic aggregators. Each one receives every value passed to
// add_element_simple and is combined across threads in
// histogram::merge_results.
103 groupby_operators::count m_count;
104 groupby_operators::count_distinct m_count_distinct;
105 groupby_operators::non_null_count m_non_null_count;
106 groupby_operators::average m_average;
107 groupby_operators::min m_min;
108 groupby_operators::max m_max;
109 groupby_operators::quantile m_median;
110 groupby_operators::stdv m_stdv;
// Configure the quantile aggregator with the single quantile 0.5, i.e. the
// median.
// NOTE(review): the enclosing function (presumably the constructor) is not
// visible in this excerpt.
114 m_median.init(std::vector<double>({0.5}));
// Number of bins requested from get_bins when producing visualization output
// (see vega_column_data).
117 const static size_t VISIBLE_BINS = 20;
// Number of fine-grained bins kept internally.
118 const static size_t REAL_BINS = 1000;
// Per-bin counts at REAL_BINS resolution.
119 std::array<flex_int, REAL_BINS> bins;
// Maps a value to the index of the internal bin it falls into.
// Call sites in this file pass (value, scale_min, scale_max).
// NOTE(review): the parameter list and return statement are not visible in
// this excerpt.
124 static size_t get_bin_idx(
129 T range = scale_max - scale_min;
// floor(((value - scale_min) / range) * REAL_BINS), evaluated in double
// precision and stored as size_t.
130 size_t bin = std::floor(
131 (((
double)value - (
double)scale_min) / (
double)range) *
132 (
double)histogram_result<T>::REAL_BINS
// A result of exactly REAL_BINS is special-cased here.
// NOTE(review): the body of this branch is not visible — presumably it clamps
// the index into the last bin; confirm.
134 if (bin == histogram_result<T>::REAL_BINS) {
// The final index must be a valid position in `bins`.
137 DASSERT_LT(bin, histogram_result<T>::REAL_BINS);
// Widens the histogram scale until [new_min, new_max] lies inside
// [scale_min, scale_max], merging existing bin counts so they stay consistent
// with the wider scale. Each pass of the loop grows the scale by half the
// current range on both sides.
140 void rescale(T new_min, T new_max) {
// Compile-time sanity check on REAL_BINS (the condition does not depend on T,
// so it is evaluated for every instantiation).
141 if (std::is_same<T, flex_int>::value) {
142 static_assert(REAL_BINS % 2 == 0,
"Streaming int histogram expects REAL_BINS cleanly divisible by 2");
145 while (new_min < scale_min || new_max > scale_max) {
// Step 1: sum adjacent pairs of bins. In the lower half the pair total lands
// in the higher-indexed bin; in the upper half it lands in the lower-indexed
// bin, i.e. in both halves the total ends up in the bin nearer the centre.
147 for (ssize_t i=(REAL_BINS / 2) - 1; i>0; i-=2) {
148 bins[i] += bins[i-1];
150 for (
size_t i=(REAL_BINS / 2); i<REAL_BINS; i+=2) {
151 bins[i] += bins[i+1];
// Step 2: pack the pair totals contiguously around the centre index
// REAL_BINS/2, working outwards.
154 for (
size_t i=0; i<(REAL_BINS/4); i++) {
155 bins[(REAL_BINS/2) + i] = bins[(REAL_BINS/2) + (2 * i)];
156 bins[(REAL_BINS/2) - (i + 1)] = bins[(REAL_BINS/2) - ((2 * i) + 1)];
159 if (std::is_same<T, flex_int>::value) {
160 static_assert((REAL_BINS * 3) % 4 == 0,
"Streaming int histogram expects (REAL_BINS*3) cleanly divisible by 4");
// Step 3: visit the top quarter of the bins.
// NOTE(review): loop body not visible — presumably resets these now-vacated
// bins to zero; confirm.
162 for (
size_t i=((REAL_BINS * 3) / 4); i<REAL_BINS; i++) {
165 if (std::is_same<T, flex_int>::value) {
166 static_assert(REAL_BINS % 4 == 0,
"Streaming int histogram expects REAL_BINS cleanly divisible by 4");
// Step 4: visit the bottom quarter of the bins.
// NOTE(review): loop body not visible — presumably resets these bins to zero
// as well; confirm.
168 for (
size_t i=0; i<(REAL_BINS/4); i++) {
// Step 5: extend the scale by range/2 in each direction.
173 T range = scale_max - scale_min;
// NOTE(review): body of this integer-only branch is not visible in this
// excerpt.
174 if (std::is_same<T, flex_int>::value) {
177 scale_max += (range / 2);
178 scale_min -= (range / 2);
// Convenience overload: forwards to the five-argument init, using value1 and
// value2 both as the observed range and as the initial scale endpoints.
181 void init(
flex_type_enum dtype, flexible_type value1, flexible_type value2) {
183 this->init(dtype, value1, value2, value1, value2);
// --- Body of the five-argument init overload. ---
// NOTE(review): its signature is not visible in this excerpt; from the call
// above it takes (dtype, value1, value2, scale1, scale2).
// Record the element type and tell the min/max aggregators what type to expect.
189 this->m_type = dtype;
190 this->m_min.set_input_type(dtype);
191 this->m_max.set_input_type(dtype);
// Visit every bin.
// NOTE(review): loop body not visible — presumably zeroes each count; confirm.
194 for (
size_t i=0; i<this->bins.size(); i++) {
// NOTE(review): body not visible; `epsilon`, used below, is presumably chosen
// here depending on whether T is an integer type — confirm.
199 if (std::is_same<T, flex_int>::value) {
// Order each pair of inputs so that min <= max and scale_min <= scale_max.
205 this->min = std::min(value1, value2);
206 this->max = std::max(value1, value2);
207 this->scale_min = std::min(scale1, scale2);
208 this->scale_max = std::max(scale1, scale2);
// A zero-width scale is widened by nudging scale_max upwards: multiplied by
// (1 + epsilon) when positive, by (1 - epsilon) when negative.
209 if (this->scale_max == this->scale_min) {
211 if(this->scale_max>0){
212 this->scale_max *= (1.0+epsilon);
213 }
else if(this->scale_max<0){
214 this->scale_max *= (1.0-epsilon);
// NOTE(review): presumably the remaining (scale_max == 0) case; the enclosing
// `else` is not visible in this excerpt.
216 this->scale_max += epsilon;
// Integer histograms: make the scale width even by bumping scale_max by one
// when it is odd.
219 if (std::is_same<T, flex_int>::value) {
221 if ((
flex_int)(this->scale_max - this->scale_min) % 2 != 0) {
222 this->scale_max += 1;
223 DASSERT_EQ((
flex_int)(this->scale_max - this->scale_min) % 2, 0);
// Then pad the scale so its width is an exact multiple of REAL_BINS. The
// padding is split between the two ends, with any odd unit going to the right.
// NOTE(review): the declarations of `pad` and `pad_left` are not visible in
// this excerpt.
230 pad = REAL_BINS - pad;
232 flex_int pad_right = (pad / 2) + (pad % 2);
233 this->scale_min -= pad_left;
234 this->scale_max += pad_right;
236 DASSERT_EQ((
flex_int)(this->scale_max - this->scale_min) % REAL_BINS, 0);
// Produces a coarse histogram of `num_bins` bins by summing runs of the
// fine-grained internal bins that cover the observed [min, max] range.
// Raises (via log_and_throw) for invalid `num_bins`.
239 histogram_bins<T> get_bins(
flex_int num_bins)
const {
// NOTE(review): the guarding condition for this error is not visible in this
// excerpt.
241 log_and_throw(
"num_bins must be positive.");
243 histogram_bins<T> ret;
// Internal bin indices of the smallest and largest observed values, and the
// number of internal bins they span (inclusive).
246 size_t first_bin = get_bin_idx(min, scale_min, scale_max);
247 size_t last_bin = get_bin_idx(max, scale_min, scale_max);
248 size_t effective_bins = (last_bin - first_bin) + 1;
// NOTE(review): the check compares against REAL_BINS/4, whereas the message
// refers to the "effective number of bins" — confirm which is intended.
258 if (static_cast<size_t>(num_bins) > (REAL_BINS/4)) {
259 log_and_throw(
"num_bins must be less than or equal to the effective number of bins available.");
// How many internal bins feed each output bin, and the remainder left over.
264 size_t bins_per_bin = effective_bins / num_bins;
265 size_t overflow = effective_bins % num_bins;
// `overflow` is turned into the number of extra internal bins needed to reach
// the next multiple of num_bins; bins_per_bin is recomputed for that padded
// span, and the padding is split into `before`/`after` with any odd unit going
// to `after`.
// NOTE(review): the condition guarding this adjustment and the declarations
// of `before`/`after` are not visible in this excerpt.
269 overflow = num_bins - overflow;
270 bins_per_bin = (effective_bins + overflow) / num_bins;
271 before = overflow / 2;
272 after = (overflow / 2) + (overflow % 2);
// Reported value range: the scale values at the padded first and last internal
// bin edges, with the bin positions clamped to [0, REAL_BINS].
276 ret.min = get_value_at_bin<T>(std::max<ssize_t>(0, first_bin - before), scale_min, scale_max, REAL_BINS);
277 ret.max = get_value_at_bin<T>(std::min<ssize_t>(last_bin + after + 1, REAL_BINS), scale_min, scale_max, REAL_BINS);
// Sum bins_per_bin consecutive internal bins into each output bin, starting at
// internal index (first_bin - before).
278 for (
size_t i=0; i<static_cast<size_t>(num_bins); i++) {
279 for (
size_t j=0; j<bins_per_bin; j++) {
280 ssize_t idx = (i * bins_per_bin) + j + (first_bin - before);
// Indices that fall outside the internal array are special-cased.
// NOTE(review): branch body not visible — presumably they are skipped; confirm.
281 if (idx < 0 || static_cast<size_t>(idx) >= REAL_BINS) {
285 ret.bins[i] += this->bins[idx];
// Accessor returning a flexible_type; by its name, the minimum value.
// NOTE(review): body not visible in this excerpt. Unlike get_max_value below,
// this accessor is not declared const — check whether that is intentional.
290 flexible_type get_min_value() {
// Const accessor returning a flexible_type; by its name, the maximum value.
// NOTE(review): body not visible in this excerpt.
293 flexible_type get_max_value()
const {
// Folds one value into the streaming state: updates every summary aggregator,
// extends the observed min/max, widens the scale if needed, and increments the
// matching bin count.
296 void add_element_simple(
const flexible_type& value) {
// Feed the value to each summary-statistic aggregator.
300 m_count.add_element_simple(value);
301 m_count_distinct.add_element_simple(value);
302 m_non_null_count.add_element_simple(value);
303 m_average.add_element_simple(value);
304 m_min.add_element_simple(value);
305 m_max.add_element_simple(value);
306 m_median.add_element_simple(value);
307 m_stdv.add_element_simple(value);
// NOTE(review): several lines between the aggregator updates and the min/max
// tracking are not visible in this excerpt.
// Track the smallest and largest values seen so far.
326 if (value < this->min) { this->min = value; }
327 if (value > this->max) { this->max = value; }
// Ensure the scale covers the (possibly extended) observed range before
// binning.
330 this->rescale(this->min, this->max);
// Count the value in its internal bin.
333 size_t bin = get_bin_idx(value, this->scale_min, this->scale_max);
334 this->bins[bin] += 1;
// Builds the per-bin data string for the visualization: one entry per visible
// bin, an optional entry for missing values, and the serialized bin
// specification. The unnamed bool parameter is not used in the visible lines.
336 virtual std::string vega_column_data(
bool)
const override {
// Down-sample to VISIBLE_BINS bins and derive the width of each one. The
// division is carried out in T, so it truncates if T is an integer type.
337 auto bins = get_bins(VISIBLE_BINS);
338 T binWidth = (bins.max - bins.min)/VISIBLE_BINS;
339 bin_specification_object<T> binSpec(bins.min, bins.max, binWidth);
341 std::stringstream ss;
// One entry per visible bin: its edge values followed by its count.
// NOTE(review): the opening of each entry (including the key that precedes the
// first edge value) is on lines not visible in this excerpt.
343 for (
size_t i=0; i<bins.bins.size(); i++) {
347 const auto& value = bins.bins[i];
349 ss << static_cast<T>(bins.min + (i * binWidth));
350 ss <<
",\"right\": ";
351 ss << static_cast<T>(bins.min + ((i+1) * binWidth));
352 ss <<
", \"count\": ";
// Missing values: total count minus non-null count. When there are any, an
// extra entry flagged "missing" is emitted with that count.
358 size_t null_count = m_count.emit() - m_non_null_count.emit();
359 if (null_count > 0) {
361 ss <<
"{\"missing\": true";
362 ss <<
", \"count\": ";
// Append the (start, stop, step) bin specification.
369 binSpec.serialize(ss);
// Builds the summary string for the column: type name, distinct/missing
// counts, mean/min/max/median/stdev, the binned data from vega_column_data
// under "numeric", and an empty "categorical" list.
373 virtual std::string vega_summary_data()
const override {
374 std::stringstream ss;
// The quantile aggregator emits a vector of quantile values.
// NOTE(review): the extraction of `median` from median_vec is on a line not
// visible in this excerpt.
376 flex_vec median_vec = m_median.emit().template get<flex_vec>();
// Missing values: total count minus non-null count.
378 flex_int num_missing = m_count.emit() - m_non_null_count.emit();
379 std::string data = vega_column_data(
true);
// NOTE(review): `typeName` is defined on a line not visible in this excerpt.
// Floating-point statistics are passed through escape_float before being
// written.
382 ss <<
"\"type\": \"" << typeName <<
"\",";
383 ss <<
"\"num_unique\": " << m_count_distinct.emit() <<
",";
384 ss <<
"\"num_missing\": " << num_missing <<
",";
385 ss <<
"\"mean\": " << escape_float(m_average.emit()) <<
",";
386 ss <<
"\"min\": " << escape_float(m_min.emit()) <<
",";
387 ss <<
"\"max\": " << escape_float(m_max.emit()) <<
",";
388 ss <<
"\"median\": " << escape_float(median) <<
",";
389 ss <<
"\"stdev\": " << escape_float(m_stdv.emit()) <<
",";
390 ss <<
"\"numeric\": [" << data <<
"],";
391 ss <<
"\"categorical\": []";
// Transformation that consumes a gl_sarray and produces a histogram_result<T>.
// The visible overrides are split_input, merge_results and init.
423 class histogram :
public transformation<gl_sarray, histogram_result<T>> {
// Creates one histogram_result per thread, each initialised with the
// transformer's current observed min/max and scale endpoints.
425 virtual std::vector<histogram_result<T>> split_input(
size_t num_threads)
override {
// Snapshot the transformer's current range and scale.
426 flexible_type current_min = this->m_transformer->min;
427 flexible_type current_max = this->m_transformer->max;
428 T current_scale_min = this->m_transformer->scale_min;
429 T current_scale_max = this->m_transformer->scale_max;
430 std::vector<histogram_result<T>> thread_results(num_threads);
431 for (
auto& thread_result : thread_results) {
432 thread_result.init(this->m_source.dtype(), current_min, current_max, current_scale_min, current_scale_max);
434 return thread_results;
// Folds each per-thread result into the transformer's running state:
// aggregators are combined, the observed range is extended, both sides are
// rescaled to that range, and bin counts are added index by index.
436 virtual void merge_results(std::vector<histogram_result<T>>& thread_results)
override {
437 for (
auto& thread_result : thread_results) {
// Combine the summary-statistic aggregators.
439 this->m_transformer->m_count.combine(thread_result.m_count);
440 this->m_transformer->m_count_distinct.combine(thread_result.m_count_distinct);
441 this->m_transformer->m_non_null_count.combine(thread_result.m_non_null_count);
442 this->m_transformer->m_average.combine(thread_result.m_average);
443 this->m_transformer->m_min.combine(thread_result.m_min);
444 this->m_transformer->m_max.combine(thread_result.m_max);
445 this->m_transformer->m_stdv.combine(thread_result.m_stdv);
// The quantile aggregator has partial_finalize() called on the thread-local
// copy before it is combined.
448 thread_result.m_median.partial_finalize();
449 this->m_transformer->m_median.combine(thread_result.m_median);
// Extend the observed range to cover both sides, then rescale both histograms
// to it so that their bins can be added element-wise.
452 flexible_type combined_min = std::min(this->m_transformer->min, thread_result.min);
453 flexible_type combined_max = std::max(this->m_transformer->max, thread_result.max);
454 this->m_transformer->min = combined_min;
455 this->m_transformer->max = combined_max;
456 this->m_transformer->rescale(combined_min, combined_max);
457 thread_result.rescale(combined_min, combined_max);
// Both histograms are expected to have ended up on the same scale.
458 DASSERT_EQ(this->m_transformer->scale_min, thread_result.scale_min);
459 DASSERT_EQ(this->m_transformer->scale_max, thread_result.scale_max);
460 for (
size_t i=0; i<histogram_result<T>::REAL_BINS; i++) {
461 this->m_transformer->bins[i] += thread_result.bins[i];
// Initialises the transformation for `source` and seeds the histogram's
// starting range from the first element(s) of the array. Raises (via
// log_and_throw) when the array's dtype is not valid for a histogram.
465 virtual void init(
const gl_sarray& source,
size_t batch_size)
override {
466 transformation<gl_sarray, histogram_result<T>>::init(source, batch_size);
// NOTE(review): the dtype lookup and the condition guarding this error are on
// lines not visible in this excerpt.
470 log_and_throw(
"dtype of the provided SArray is not valid for histogram. Only int and float are valid dtypes.");
473 size_t input_size = this->m_source.size();
// Two or more elements: seed the range from the first two values, provided
// both convert to finite floats.
// NOTE(review): additional conditions between these checks are not visible in
// this excerpt.
474 if (input_size >= 2 &&
477 std::isfinite(this->m_source[0].
template to<flex_float>()) &&
478 std::isfinite(this->m_source[1].
template to<flex_float>())) {
482 this->m_transformer->init(dtype, this->m_source[0], this->m_source[1]);
483 }
// Exactly one element: use it for both ends of the range when it is finite.
else if (input_size == 1 &&
485 std::isfinite(this->m_source[0].
template to<flex_float>())) {
487 this->m_transformer->init(dtype, this->m_source[0], this->m_source[0]);
// Otherwise start from a zero-width range at 0.0.
// NOTE(review): presumably the final `else` branch; the keyword itself is not
// visible in this excerpt.
490 this->m_transformer->init(dtype, 0.0, 0.0);
// Declaration only: takes an SArray plus x-axis label, y-axis label and title
// (each a flexible_type) and returns a shared pointer to a Plot. The
// definition is not part of this excerpt.
495 std::shared_ptr<Plot> plot_histogram(
496 const gl_sarray& sa,
const flexible_type& xlabel,
const flexible_type& ylabel,
497 const flexible_type& title);
501 #endif // __TC_HISTOGRAM
std::vector< double > flex_vec
const char * flex_type_enum_to_name(flex_type_enum en)
std::vector< flexible_type > flex_list