Turi Create  4.0
grouped_sframe.hpp
1 /* Copyright © 2017 Apple Inc. All rights reserved.
2  *
3  * Use of this source code is governed by a BSD-3-clause license that can
4  * be found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause
5  */
6 #ifndef TURI_GROUPED_SFRAME_HPP
7 #define TURI_GROUPED_SFRAME_HPP
8 
9 #include <model_server/lib/toolkit_class_macros.hpp>
10 #include <core/data/sframe/gl_sframe.hpp>
11 #include <model_server/lib/extensions/model_base.hpp>
12 #include <core/export.hpp>
13 #include <core/parallel/lambda_omp.hpp>
14 
15 namespace turi {
// Forward declaration only; no definition is visible in this header.
// NOTE(review): presumably returns the specifications of the toolkit classes
// registered by this module -- confirm against the defining source file.
16 std::vector<turi::toolkit_class_specification> get_toolkit_class_registration();
17 
18 /**
19  * Hash function for a row of flexible_type.
20  */
21 struct EXPORT GroupKeyHash {
22  inline size_t operator()(std::vector<flexible_type> f) const {
23  size_t key_hash = 0;
24  for(const auto &i : f) {
25  key_hash = hash64_combine(key_hash, i.hash());
26  }
27 
28  return key_hash;
29  }
30 };
31 
/**
 * An SFrame grouped by one or more key columns.
 *
 * Provides lookup of a single group by key, the list of group names,
 * block-wise iteration over (group name, SFrame) pairs, and a summary
 * SFrame. Derives from model_base and is registered under the name
 * "grouped_sframe" (see the member registration at the end of the class).
 */
32 class EXPORT grouped_sframe: public model_base {
33  public:
34  /**
35  * Groups an SFrame by the distinct values in one or more columns.
36  *
37  * Logically, this creates an SFrame for each "group" of values, where the
38  * new SFrames all have the same number of columns as the original SFrame.
39  * These SFrames are accessed through the interface of this data structure.
40  *
41  * If is_grouped is true, this function skips the grouping step and just sets
42  * up the data structure to provide an interface to the grouped SFrame.
43  *
44  * Throws if group has already been called on this object, or the column
45  * names are not valid.
 *
 * \param sf The SFrame to group
 * \param column_names Names of the columns whose values define the groups
 * \param is_grouped If true, sf is treated as already grouped (see above)
46  */
47  void group(const gl_sframe &sf, const std::vector<std::string> column_names,
48  bool is_grouped);
49 
50 
51  /**
52  * Get the SFrame that corresponds to the group named `key`.
53  *
54  * Each group's name is its distinct value, including its type. This means
55  * that an SFrame grouped by a column of integers that has some 1s and some
56  * 2s, the name of the group with ones is the integer 1, not the string '1'.
57  * The key is given as a vector because more than one column can be used to
58  * group.
 *
 * \param key The group name, one value per key column
 * \returns The SFrame holding the rows of that group
59  */
60  gl_sframe get_group(std::vector<flexible_type> key);
61 
62  /**
63  * The number of distinct groups found.
 *
 * Computed as the number of entries in the range directory.
64  */
65  inline size_t num_groups() const {
66  return m_range_directory.size();
67  }
68 
69  /**
70  * A list of all the group names.
71  */
72  gl_sarray groups();
73 
74  /**
75  * Begin iteration through the grouped SFrame.
76  *
 * Marks the object as iterating and resets the iteration cursor to 0.
 *
77  * Works together with \ref iterator_get_next(). The usage pattern
78  * is as follows:
79  * \code
80  * grouped_sframe.begin_iterator();
81  * while(1) {
82  * auto ret = grouped_sframe.iterator_get_next(64);
83  * // do stuff
84  * if (ret.size() < 64) {
85  * // we are done
86  * break;
87  * }
88  * }
89  * \endcode
90  */
91  inline void begin_iterator() {
92  m_iterating = true;
93  m_cur_iterator_idx = 0;
94  }
95 
96  /**
97  * Obtains the next block of elements of size len from the grouped SFrame.
98  * Works together with \ref begin_iterator(). See the code example
99  * in \ref begin_iterator() for details.
100  *
101  * This function will always return a vector of length 'len' unless
102  * at the end of the array, or if an error has occurred.
103  *
104  * The element value is a pair of <group name, SFrame>.
105  *
106  * \param len The number of elements to return
107  * \returns The next collection of elements in the array. Returns fewer than
108  * len elements on end of file or failure.
109  */
110  std::vector<std::pair<flexible_type,gl_sframe>> iterator_get_next(size_t len);
111 
112  /**
113  * Returns a single SFrame which contains all the data.
 *
 * This is a copy of the gl_sframe handle stored in m_grouped_sf.
114  */
115  gl_sframe get_sframe() const {
116  return m_grouped_sf;
117  }
118 
119  /**
120  * Return an SFrame with group_info, i.e. key columns + number of rows in each
121  * key column.
122  */
123  gl_sframe group_info() const;
124 
125  protected:
126  private:
127  /// Methods
128 
129  /**
130  * Get a group by its index in the range directory.
131  *
132  * Internal method
 *
 * \param range_dir_idx Index into m_range_directory
133  */
134  gl_sframe get_group_by_index(size_t range_dir_idx);
135 
136  /// Variables
 // The SFrame returned by get_sframe().
137  gl_sframe m_grouped_sf;
138 
139  // The first row in each range. The sequential order of the vector corresponds
140  // to where the group is located in the underlying sframe e.g. 1st group in
141  // the SFrame's last row is m_range_directory[0]. This data structure only
142  // exists to preserve the ORDER of groups: the order the SFrame is sorted in.
143  // This may have some significance.
 // NOTE(review): "first row" above and "last row" in the example read as
 // inconsistent -- confirm which boundary is stored against the .cpp.
144  std::vector<size_t> m_range_directory;
 // NOTE(review): presumably the column names passed to group() -- confirm.
145  std::vector<std::string> m_key_col_names;
 // NOTE(review): presumably one name per group, in range-directory order --
 // confirm against the .cpp.
146  std::vector<flexible_type> m_group_names;
147 
148  // Key: the "group" key itself (a vector of values, hashed by GroupKeyHash)
149  // Value: Index of m_range_directory
150  //TODO: This is what will run out of memory first when scaling up
151  std::unordered_map<std::vector<flexible_type>, size_t, GroupKeyHash> m_key2range;
 // NOTE(review): presumably the SArray returned by groups() -- confirm.
152  gl_sarray m_groups_sa;
 // NOTE(review): presumably set once group() has completed, backing the
 // "throws if group has already been called" contract -- confirm.
153  bool m_inited = false;
 // Set to true by begin_iterator().
155  bool m_iterating = false;
 // Iteration cursor; reset to 0 by begin_iterator().
156  size_t m_cur_iterator_idx = 0;
157 
158  public:
 // Member registration for this class.
 // NOTE(review): the embedded listing numbers skip 160 and 169, so the line
 // that opens the entry ending in "is_grouped") and presumably the closing
 // END_CLASS_MEMBER_REGISTRATION are not shown here -- consult the original
 // header before editing this section.
159  BEGIN_CLASS_MEMBER_REGISTRATION("grouped_sframe")
161  "is_grouped")
162  REGISTER_CLASS_MEMBER_FUNCTION(grouped_sframe::get_group, "key")
163  REGISTER_CLASS_MEMBER_FUNCTION(grouped_sframe::num_groups)
164  REGISTER_CLASS_MEMBER_FUNCTION(grouped_sframe::groups)
165  REGISTER_CLASS_MEMBER_FUNCTION(grouped_sframe::begin_iterator)
166  REGISTER_CLASS_MEMBER_FUNCTION(grouped_sframe::iterator_get_next, "num_items")
167 
168  REGISTER_GETTER("sframe", grouped_sframe::get_sframe)
170 };
171 
172 } // namespace turi
173 #endif // TURI_GROUPED_SFRAME_HPP
#define BEGIN_CLASS_MEMBER_REGISTRATION(python_facing_classname)
#define REGISTER_CLASS_MEMBER_FUNCTION(function,...)
sframe group(sframe sframe_in, std::string key_column)
static uint64_t hash64_combine(uint64_t h1, uint64_t h2)
#define END_CLASS_MEMBER_REGISTRATION
#define REGISTER_GETTER(propname, function)