Turi Create  4.0
groupby_aggregate.hpp
1 /* Copyright © 2017 Apple Inc. All rights reserved.
2  *
3  * Use of this source code is governed by a BSD-3-clause license that can
4  * be found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause
5  */
6 #ifndef TURI_SFRAME_GROUPBY_AGGREGATE_HPP
7 #define TURI_SFRAME_GROUPBY_AGGREGATE_HPP
8 
9 #include <memory>
10 #include <vector>
11 #include <cstdint>
12 #include <functional>
13 #include <unordered_set>
14 #include <core/storage/sframe_data/sframe.hpp>
15 #include <core/storage/sframe_data/sframe_constants.hpp>
16 #include <core/storage/sframe_data/group_aggregate_value.hpp>
17 
18 namespace turi {
19 
20 const std::set<std::string> registered_arg_functions = {"argmax","argmin"}; ///< Names of the "arg" functions: argmax and argmin.
21 // NOTE(review): std::set and std::string are used above, but <set> and <string> are not included directly in this header; confirm a project header provides them.
22 
23 /**
24  * \ingroup sframe_physical
25  * \addtogroup groupby_aggregate Groupby Aggregation
26  * \{
27  */
28 
29 /**
30  * Groupby Aggregate function for an SFrame.
31  * Given the source SFrame, this function performs a group-by aggregate of the
32  * SFrame, using one or more columns to define the group key, and a descriptor
33  * for how to aggregate other non-key columns.
34  *
35  * For instance, given an SFrame:
36  * \verbatim
37  * user_id movie_id rating time
38  * 5 10 1 4pm
39  * 5 15 2 1pm
40  * 6 12 1 2pm
41  * 7 13 1 3am
42  * \endverbatim
43  * \code
44  * sframe output = turi::groupby_aggregate(input,
45  * {"user_id"},
46  * {"movie_count", "rating_sum"},
47  * {{{"movie_id"}, std::make_shared<groupby_operators::count>()},
48  * {{"rating"}, std::make_shared<groupby_operators::sum>()}});
49  * \endcode
50  *
51  * will generate groups based on the user_id column, and within each group,
52  * count the movie_id values and sum the ratings.
53  * \verbatim
54  * user_id movie_count rating_sum
55  * 5 2 3
56  * 6 1 1
57  * 7 1 1
58  * \endverbatim
59  *
60  * See groupby_aggregate_operators for operators that have been implemented.
61  *
62  * Describing a Group
63  * ------------------
64  * A group is a pair of column names and the operator to apply to them.
65  * The column names can be any existing columns in the table; there is no
66  * restriction. (You can group on user_id and aggregate on user_id, though the
67  * result is typically not very meaningful.) A special column name with the
68  * empty string "" is also defined, in which case the aggregator will be
69  * sent a flexible type of type FLEX_UNDEFINED for every row (this is useful
70  * for COUNT).
71  *
72  * \param source The input SFrame to group
73  * \param keys An array of column names to generate the group on
74  * \param group_output_columns The output column names for each aggregate.
75  * This must be the same length as the 'groups' parameter.
76  * Output column names must be unique and must not
77  * reuse any of the key column names. If there are any
78  * empty entries, their names will be automatically
79  * assigned.
80  * \param groups A collection of {column_names, group operator} pairs describing
81  * the aggregates to generate. You can have multiple aggregators
82  * for each set of columns. You do not need every column in the source
83  * to be represented. This must be the same length as the
84  * 'group_output_columns' parameter.
85  * \param max_buffer_size The maximum size of intermediate aggregation buffers; defaults to SFRAME_GROUPBY_BUFFER_NUM_ROWS
86  *
87  * \return The new aggregated SFrame. Throws a string exception on failure.
88  */
89 sframe groupby_aggregate(const sframe& source,
90  const std::vector<std::string>& keys,
91  const std::vector<std::string>& group_output_columns,
92  const std::vector<std::pair<std::vector<std::string>,
93  std::shared_ptr<group_aggregate_value>>>& groups,
94  size_t max_buffer_size = SFRAME_GROUPBY_BUFFER_NUM_ROWS);
95 
96 
97 /// \}
98 } // end of turi
99 #endif //TURI_SFRAME_GROUPBY_AGGREGATE_HPP
std::set< Key > keys(const std::map< Key, T > &map)
Definition: stl_util.hpp:358
sframe groupby_aggregate(const sframe &source, const std::vector< std::string > &keys, const std::vector< std::string > &group_output_columns, const std::vector< std::pair< std::vector< std::string >, std::shared_ptr< group_aggregate_value >>> &groups, size_t max_buffer_size=SFRAME_GROUPBY_BUFFER_NUM_ROWS)
size_t SFRAME_GROUPBY_BUFFER_NUM_ROWS