6 #ifndef TURI_UNITY_GL_SFRAME_HPP 7 #define TURI_UNITY_GL_SFRAME_HPP 14 #include <core/data/flexible_type/flexible_type.hpp> 15 #include <core/storage/sframe_data/group_aggregate_value.hpp> 16 #include <core/storage/sframe_data/sframe_rows.hpp> 17 #include "gl_sarray.hpp" 21 class unity_sframe_base;
// Forward declarations of class types referenced by the declarations in this
// header. Their definitions are not part of this excerpt.
25 class sframe_reader_buffer;
27 class gl_sframe_range;
28 class gl_sarray_reference;
29 class const_gl_sarray_reference;
// Aliases for the std::map instantiations used by this header.
// String key -> flex_type_enum (type of `column_type_hints` in construct_from_csvs).
31 typedef std::map<std::string, flex_type_enum> str_flex_type_map;
// String key -> flexible_type value (type of `csv_config` in construct_from_csvs).
32 typedef std::map<std::string, flexible_type> csv_parsing_config_map;
// String -> string.
33 typedef std::map<std::string, std::string> string_map;
// String key -> shared pointer to a unity_sarray_base.
// NOTE(review): presumably keyed by CSV file name, holding unparsable lines -- confirm.
34 typedef std::map<std::string, std::shared_ptr<unity_sarray_base>> csv_parsing_errors;
57 const std::vector<std::string>& group_columns);
61 const std::vector<std::string>& group_columns);
98 template<
typename T,
typename... Args>
100 const Args&... args){
101 static_assert(std::is_base_of<group_aggregate_value, T>::value,
102 "T must inherit from group_aggregate_value");
103 auto aggregator = std::make_shared<T>(&args...);
/**
 * Constructs a gl_sframe from a single string, `directory`.
 * Marked `explicit`, so a std::string never converts to a gl_sframe implicitly.
 * NOTE(review): presumably loads a previously saved frame from `directory`;
 * the definition is not in this excerpt -- confirm.
 */
507 explicit gl_sframe(
const std::string& directory);
/**
 * Takes the same `directory` argument as the string constructor.
 * NOTE(review): looks like the worker that constructor delegates to -- confirm.
 */
509 void construct_from_sframe_index(
const std::string& directory);
/**
 * Declares construction of the frame contents from CSV input.
 *
 * \param csv_file           String naming the CSV input
 *                           (NOTE(review): path, URL or glob? -- confirm).
 * \param csv_config         String -> flexible_type map of parser options.
 * \param column_type_hints  String -> flex_type_enum map
 *                           (presumably column name -> desired type).
 *
 * All three arguments are taken by value, so each call copies or moves them.
 */
514 void construct_from_csvs(std::string csv_file, csv_parsing_config_map csv_config,
515 str_flex_type_map column_type_hints);
/**
 * Const member that takes `path_to_client` and returns nothing.
 * NOTE(review): presumably opens a visualization of this frame using the
 * client program at that path -- confirm against the definition.
 */
525 void show(
const std::string& path_to_client)
const;
/**
 * Const member returning a shared pointer to a model_base (declared elsewhere).
 * NOTE(review): presumably the plot object describing this frame -- confirm.
 */
530 std::shared_ptr<model_base> plot()
const;
/**
 * Converting constructor from a map of string -> vector of flexible_type.
 * Not marked `explicit`, so such a map converts implicitly to a gl_sframe.
 * NOTE(review): presumably each entry becomes one column (name -> values);
 * behaviour for vectors of unequal length is not visible here.
 */
548 gl_sframe(
const std::map<std::string, std::vector<flexible_type> >& data);
/**
 * Takes the same `data` map as the constructor declared just before it.
 * NOTE(review): looks like the worker that constructor delegates to -- confirm.
 */
550 void construct_from_dataframe(
const std::map<std::string, std::vector<flexible_type> >& data);
/**
 * Converting constructor from a map of string -> gl_sarray.
 * NOTE(review): presumably column name -> column data -- confirm.
 */
569 gl_sframe(
const std::map<std::string, gl_sarray>& data);
/**
 * Constructor from a braced list of (string, gl_sarray) pairs.
 * The parameter is unnamed in this declaration.
 */
580 gl_sframe(std::initializer_list<std::pair<std::string, gl_sarray>>);
/**
 * Converting constructor from a shared pointer to a unity_sframe_base.
 * Not `explicit`, so the pointer converts implicitly to a gl_sframe.
 * The pointer is taken by value.
 */
590 gl_sframe(std::shared_ptr<unity_sframe_base> sframe);
/**
 * Implicit (non-`explicit`) conversion to shared_ptr<unity_sframe>.
 * Const, so it is usable on const gl_sframe objects.
 */
599 operator std::shared_ptr<unity_sframe>()
const;
/**
 * Implicit (non-`explicit`) conversion to shared_ptr<unity_sframe_base>.
 */
603 operator std::shared_ptr<unity_sframe_base>()
const;
/**
 * Const member returning an `sframe` by value.
 * NOTE(review): judging by the name this forces materialization of the
 * underlying data -- confirm cost and semantics against the definition.
 */
608 sframe materialize_to_sframe()
const;
/**
 * Indexing by a signed 64-bit integer. Both the non-const and the const
 * overload return a vector of flexible_type *by value*, so the result
 * cannot be used to modify the frame.
 * NOTE(review): presumably returns row `i`; whether negative indices are
 * accepted is not visible here.
 */
620 std::vector<flexible_type> operator[](int64_t i);
621 std::vector<flexible_type> operator[](int64_t i)
const;
/**
 * Indexing by a braced list of int64_t values; returns a gl_sframe by value.
 * NOTE(review): the expected number and meaning of the list elements
 * (e.g. start/end or start/step/end) are not visible here -- confirm.
 */
659 gl_sframe operator[](
const std::initializer_list<int64_t>& slice);
660 gl_sframe operator[](
const std::initializer_list<int64_t>& slice)
const;
/**
 * Indexing by a list of strings (`columns`); returns a gl_sframe by value.
 * NOTE(review): presumably selects the named columns -- confirm.
 *
 * In this excerpt the std::vector form appears only as a const overload,
 * while the initializer_list form has both const and non-const overloads.
 */
760 gl_sframe operator[](
const std::vector<std::string>& columns)
const;
761 gl_sframe operator[](
const std::initializer_list<std::string>& columns);
762 gl_sframe operator[](
const std::initializer_list<std::string>& columns)
const;
/**
 * Non-const member that hands data to a caller-supplied callback.
 *
 * \param callback  Callable taking (size_t, const shared_ptr<sframe_rows>&)
 *                  and returning bool. Passed by value as a std::function.
 * \param nthreads  Defaults to (size_t)(-1), i.e. the largest size_t value.
 *
 * NOTE(review): presumably the size_t argument is a thread/segment index,
 * a `true` return requests early stop, and the default nthreads means
 * "choose automatically" -- none of this is visible here; confirm.
 */
798 void materialize_to_callback(
799 std::function<
bool(
size_t,
const std::shared_ptr<sframe_rows>&)> callback,
800 size_t nthreads = (
size_t)(-1));
/**
 * Const member returning a gl_sframe_range by value.
 *
 * \param start  Defaults to 0.
 * \param end    Defaults to (size_t)(-1), i.e. the largest size_t value.
 *
 * NOTE(review): presumably iterates rows in [start, end) with the default
 * `end` meaning "through the last row" -- confirm against the definition.
 */
851 gl_sframe_range range_iterator(
size_t start=0,
size_t end=(
size_t)(-1))
const;
/**
 * Virtual const member returning a size_t.
 * NOTE(review): presumably the number of rows -- confirm.
 */
861 virtual size_t size()
const;
/**
 * Const boolean query; non-virtual.
 * NOTE(review): presumably true once the frame's data has been computed
 * and stored rather than being lazily pending -- confirm.
 */
873 bool is_materialized()
const;
/**
 * Const boolean query; non-virtual.
 * NOTE(review): presumably true when size() can be answered without
 * further computation -- confirm.
 */
879 bool has_size()
const;
/**
 * Const member taking a destination `path` and an optional `format`.
 *
 * \param path    Destination string.
 * \param format  Defaults to the empty string.
 *
 * NOTE(review): accepted format names, and what the empty default selects
 * (possibly inference from the path), are not visible here -- confirm.
 */
910 void save(
const std::string& path,
const std::string& format=
"")
const;
/**
 * Const member taking a destination `path`.
 * NOTE(review): presumably writes references to existing data instead of
 * copying it -- confirm against the definition.
 */
920 void save_reference(
const std::string& path)
const;
/**
 * Virtual const accessor returning a vector of flex_type_enum by value.
 * NOTE(review): presumably one entry per column, in column order -- confirm.
 */
925 virtual std::vector<flex_type_enum> column_types()
const;
/**
 * Virtual const accessor returning a size_t
 * (presumably the number of columns).
 */
930 virtual size_t num_columns()
const;
/**
 * Virtual const accessor returning a vector of strings by value
 * (presumably the column names, in column order).
 */
935 virtual std::vector<std::string> column_names()
const;
/**
 * Const boolean query on a column name; non-virtual, unlike the three
 * accessors declared before it.
 */
941 bool contains_column(
const std::string& col_name)
const;
/**
 * Const member returning a new gl_sframe; takes only `fraction`.
 * NOTE(review): presumably a random sample keeping roughly `fraction` of
 * the rows, with an implementation-chosen seed -- confirm. The valid range
 * of `fraction` is not visible here.
 */
1017 gl_sframe sample(
double fraction)
const;
/**
 * Overload with an explicit `seed` and an `exact` flag (default false).
 * Because `seed` has no default, a one-argument call always selects the
 * overload above.
 * NOTE(review): presumably `exact` requests exactly fraction * size rows
 * rather than an approximate count -- confirm.
 */
1052 gl_sframe sample(
double fraction,
size_t seed,
bool exact=
false)
const;
/**
 * Const member returning a pair of gl_sframe objects; takes only `fraction`.
 * NOTE(review): presumably `first` receives about `fraction` of the rows
 * and `second` the remainder -- confirm.
 */
1080 std::pair<gl_sframe, gl_sframe> random_split(
double fraction)
const;
/**
 * Overload with an explicit `seed` and an `exact` flag (default false).
 * Because `seed` has no default, a one-argument call always selects the
 * overload above.
 */
1108 std::pair<gl_sframe, gl_sframe> random_split(
double fraction,
size_t seed,
bool exact=
false)
const;
/**
 * Const member returning a new gl_sframe.
 *
 * \param column_name  Column the selection is based on.
 * \param k            Defaults to 10.
 * \param reverse      Defaults to false.
 *
 * NOTE(review): presumably returns the k rows with the largest values in
 * `column_name`, or the smallest when `reverse` is true -- confirm.
 */
1161 gl_sframe topk(
const std::string& column_name,
size_t k=10,
bool reverse=
false)
const;
/**
 * Const lookup from a column name to a size_t.
 * NOTE(review): behaviour for an unknown name (throw vs. sentinel) is not
 * visible here -- confirm.
 */
1165 size_t column_index(
const std::string &column_name)
const;
/**
 * Const lookup from a size_t index to a column name.
 * Returns a const reference rather than a copy.
 * NOTE(review): how long that reference stays valid (e.g. across
 * rename/add/remove of columns) depends on the definition -- confirm
 * before holding on to it.
 */
1169 const std::string& column_name(
size_t index)
const;
/**
 * Const member returning a gl_sarray by value for the column `colname`.
 */
1186 gl_sarray select_column(
const std::string& colname)
const;
/**
 * Const member returning a gl_sframe by value for the columns `colnames`.
 * NOTE(review): ordering of the result and handling of duplicate or
 * unknown names are not visible here -- confirm.
 */
1199 gl_sframe select_columns(
const std::vector<std::string>& colnames)
const;
/**
 * Virtual, non-const member taking a gl_sarray and an optional name
 * (default: empty string).
 * NOTE(review): judging by the name, replaces an existing column called
 * `name` or adds a new one -- confirm.
 *
 * Why it matters: this is a virtual function with a default argument.
 * C++ picks default arguments from the static type at the call site, so
 * any override should repeat the same default.
 */
1243 virtual void replace_add_column(
const gl_sarray& data,
const std::string& name=
"");
/**
 * Virtual, non-const member taking a single flexible_type value and an
 * optional name (default: empty string).
 * NOTE(review): presumably adds a column filled with the constant `data`;
 * the meaning of the empty default name is not visible here -- confirm.
 */
1287 virtual void add_column(
const flexible_type& data,
const std::string& name=
"");
/**
 * Overload taking the column data as a gl_sarray.
 */
1332 virtual void add_column(
const gl_sarray& data,
const std::string& name=
"");
/**
 * Virtual, non-const member taking another gl_sframe.
 * NOTE(review): presumably appends every column of `data` to this frame;
 * handling of name clashes is not visible here -- confirm.
 */
1365 virtual void add_columns(
const gl_sframe& data);
/**
 * Virtual, non-const member taking the name of a column.
 */
1395 virtual void remove_column(
const std::string& name);
/**
 * Virtual, non-const member taking two column names.
 */
1427 virtual void swap_columns(
const std::string& column_1,
const std::string& column_2);
/**
 * Virtual, non-const member taking a string -> string map named
 * `old_to_new_names`, i.e. current column name -> replacement name.
 */
1459 virtual void rename(
const std::map<std::string, std::string>& old_to_new_names);
/**
 * Const member returning a new gl_sframe.
 *
 * \param groupkeys  Vector of strings (presumably the key column names).
 * \param operators  Map of string -> aggregate::groupby_descriptor_type;
 *                   defaults to an empty map.
 *
 * NOTE(review): presumably the map key is the output column name and the
 * value the aggregation to apply; the result when `operators` is empty is
 * not visible here -- confirm.
 */
1722 gl_sframe groupby(
const std::vector<std::string>& groupkeys,
1723 const std::map<std::string, aggregate::groupby_descriptor_type>& operators
1724 = std::map<std::string, aggregate::groupby_descriptor_type>())
const;
1806 const std::vector<std::string>& joinkeys,
1807 const std::string& how=
"inner")
const;
1857 const std::map<std::string, std::string>& joinkeys,
1858 const std::string& how=
"inner")
const;
1997 gl_sframe pack_columns(
const std::vector<std::string>& columns,
1998 const std::string& new_column_name,
2088 gl_sframe pack_columns(
const std::string& column_prefix,
2089 const std::string& new_column_name,
/**
 * Const member returning a new gl_sframe.
 *
 * \param expand_column       Name of the column to operate on.
 * \param column_name_prefix  Defaults to "X".
 * \param limit               Vector of strings; defaults to empty.
 * \param tzone               Defaults to false.
 *
 * NOTE(review): presumably expands a datetime column into one column per
 * component, with `limit` restricting which components are produced and
 * `tzone` adding a timezone column -- accepted `limit` values are not
 * visible here; confirm.
 */
2140 gl_sframe split_datetime(
const std::string& expand_column,
2141 const std::string& column_name_prefix =
"X",
2142 const std::vector<std::string>& limit = std::vector<std::string>(),
2143 bool tzone=
false)
const;
/**
 * Const member returning a new gl_sframe.
 *
 * \param unpack_column       Name of the column to operate on.
 * \param column_name_prefix  Defaults to "X".
 * \param column_types        Vector of flex_type_enum; defaults to empty.
 * \param limit               Vector of flexible_type; defaults to empty.
 *
 * NOTE(review): the listing numbers jump from 2241 to 2243, so a parameter
 * line may be missing from this excerpt between `column_types` and
 * `limit` -- check the full header before relying on this signature.
 */
2239 gl_sframe unpack(
const std::string& unpack_column,
2240 const std::string& column_name_prefix =
"X",
2241 const std::vector<flex_type_enum>& column_types = std::vector<flex_type_enum>(),
2243 const std::vector<flexible_type>& limit = std::vector<flexible_type>())
const;
/**
 * Const member returning a new gl_sframe. This overload takes a single
 * string for `new_column_names` despite the plural parameter name.
 *
 * \param drop_na  Defaults to false.
 *
 * NOTE(review): presumably converts each element of a container-typed
 * column into its own row -- confirm.
 */
2310 gl_sframe stack(
const std::string& column_name,
2311 const std::string& new_column_names,
2312 bool drop_na =
false)
const;
/**
 * Overload taking several output column names as a vector of strings.
 */
2383 gl_sframe stack(
const std::string& column_name,
2384 const std::vector<std::string>& new_column_names,
2385 bool drop_na =
false)
const;
/**
 * Const member returning a new gl_sframe. This overload takes a single
 * string for `columns` despite the plural parameter name.
 *
 * \param new_column_name  Defaults to the empty string.
 *
 * NOTE(review): what the empty default name selects (e.g. an
 * auto-generated name) is not visible here -- confirm.
 */
2422 gl_sframe unstack(
const std::string& columns,
2423 const std::string& new_column_name =
"")
const;
/**
 * Overload taking several column names as a vector of strings.
 */
2460 gl_sframe unstack(
const std::vector<std::string>& columns,
2461 const std::string& new_column_name =
"")
const;
/**
 * Sort by a single column. Const; returns a new gl_sframe by value.
 * `ascending` defaults to true.
 */
2543 gl_sframe sort(
const std::string& column,
bool ascending =
true)
const;
/**
 * Sort by several columns, with one `ascending` flag for all of them.
 */
2581 gl_sframe sort(
const std::vector<std::string>& columns,
bool ascending =
true)
const;
/**
 * Same as the vector overload, but accepts a braced list of column names.
 */
2586 gl_sframe sort(
const std::initializer_list<std::string>& columns,
bool ascending =
true)
const;
/**
 * Sort by several columns with a per-column flag: each element pairs a
 * string with a bool.
 * NOTE(review): per the parameter name the bool is "ascending" for that
 * column -- confirm.
 */
2619 gl_sframe sort(
const std::vector<std::pair<std::string, bool>>& column_and_ascending)
const;
/**
 * Const member returning a new gl_sframe.
 *
 * \param columns    Vector of strings; defaults to empty.
 * \param how        Defaults to "any". Taken by value, unlike the other
 *                   string parameters in this header.
 * \param recursive  Defaults to false.
 *
 * NOTE(review): presumably drops rows with missing values, an empty
 * `columns` meaning "consider every column"; values of `how` other than
 * "any" are not visible here -- confirm.
 */
2690 gl_sframe dropna(
const std::vector<std::string>& columns = std::vector<std::string>(),
2691 std::string how =
"any",
bool recursive =
false)
const;
/**
 * Same parameters and defaults as dropna, but returns a pair of frames.
 * NOTE(review): presumably (rows kept, rows dropped) -- confirm the order.
 */
2750 std::pair<gl_sframe, gl_sframe> dropna_split(
2751 const std::vector<std::string>& columns=std::vector<std::string>(),
2752 std::string how =
"any",
bool recursive =
false)
const;
/**
 * Const member returning a new gl_sframe; this object is not modified.
 *
 * \param column_name  Defaults to "id".
 * \param start        Defaults to 0.
 *
 * NOTE(review): presumably adds a column numbering rows from `start`;
 * the position of the new column is not visible here -- confirm.
 */
2824 gl_sframe add_row_number(
const std::string& column_name =
"id",
size_t start = 0)
const;
/**
 * Virtual const accessor returning a shared pointer to a unity_sframe.
 * NOTE(review): presumably returns m_sframe in this class, with derived
 * classes overriding it -- confirm.
 */
2828 virtual std::shared_ptr<unity_sframe> get_proxy()
const;
/**
 * Non-const helper with no parameters and no return value.
 * NOTE(review): judging by the name, resets m_sframe to a fresh object
 * -- confirm.
 */
2832 void instantiate_new();
// Shared handle to the unity_sframe this object wraps.
2834 std::shared_ptr<unity_sframe> m_sframe;
/**
 * Const accessor returning a shared pointer to an sframe_reader.
 */
2836 std::shared_ptr<sframe_reader> get_sframe_reader()
const;
2856 size_t start,
size_t end);
2864 public boost::iterator_facade<iterator,
2865 const sframe_rows::row&, boost::single_pass_traversal_tag> {
// Gives boost::iterator_core_access access to this iterator's non-public
// members, which is how boost::iterator_facade reaches the core operations.
2875 friend class boost::iterator_core_access;
/**
 * Takes a count `n`.
 * NOTE(review): presumably moves the iterator forward by `n` positions
 * -- confirm against the definition.
 */
2877 void advance(
size_t n);
2878 inline bool equal(
const iterator& other)
const {
2879 return m_counter == other.m_counter;
/**
 * Const member returning a const reference to `type`.
 * NOTE(review): the validity of that reference after the iterator moves
 * is not visible here -- confirm.
 */
2881 const type& dereference()
const;
// Position counter, initialised to 0; iterator equality compares this field.
2882 size_t m_counter = 0;
// Shared handle to an sframe_reader_buffer.
// NOTE(review): the listing jumps from 2882 to 2907, so this member may
// belong to a different class than m_counter -- confirm.
2907 std::shared_ptr<sframe_reader_buffer> m_sframe_reader_buffer;
/**
 * Virtual const accessor returning a shared pointer to a unity_sarray.
 * NOTE(review): the enclosing class header is not in this excerpt;
 * presumably gl_sarray_reference -- confirm.
 */
2927 virtual std::shared_ptr<unity_sarray> get_proxy()
const;
// Column name held by this object.
2934 std::string m_column_name;
/**
 * Virtual const accessor returning a shared pointer to a unity_sarray.
 * NOTE(review): the enclosing class header is not in this excerpt;
 * presumably const_gl_sarray_reference -- confirm.
 */
2951 virtual std::shared_ptr<unity_sarray> get_proxy()
const;
// Column name held by this object.
2958 std::string m_column_name;
// Forward declaration only; the type is held through
// std::unique_ptr<gl_sframe_writer_impl> (m_writer_impl) further down.
2963 class gl_sframe_writer_impl;
3014 const std::vector<flex_type_enum>& column_types,
3015 size_t num_segments = (
size_t)(-1));
/**
 * Writes one vector of flexible_type values to the segment `segmentid`.
 * The iterator-range overload of write() calls this once per element.
 * NOTE(review): presumably `f` is one row with one value per column, and
 * `segmentid` must be below num_segments() -- confirm.
 */
3035 void write(
const std::vector<flexible_type>& f,
size_t segmentid);
3054 template <
typename T>
3055 void write(T begin, T end,
size_t segmentid) {
3056 while (begin != end) {
3057 write((*begin), segmentid);
/**
 * Const accessor returning a size_t (presumably the number of segments
 * that can be written to).
 */
3071 size_t num_segments()
const;
// Exclusively owned pointer to the implementation object. Only a forward
// declaration of gl_sframe_writer_impl is visible in this header.
3076 std::unique_ptr<gl_sframe_writer_impl> m_writer_impl;
3081 #endif // TURI_UNITY_GL_SFRAME_HPP
groupby_descriptor_type MAX(const std::string &col)
groupby_descriptor_type QUANTILE(const std::string &col, double quantile)
groupby_descriptor_type VARIANCE(const std::string &col)
groupby_descriptor_type MIN(const std::string &col)
iterator const_iterator
const_iterator type
std::shared_ptr< sframe > sort(std::shared_ptr< planner_node > sframe_planner_node, const std::vector< std::string > column_names, const std::vector< size_t > &sort_column_indices, const std::vector< bool > &sort_orders)
groupby_descriptor_type SUM(const std::string &col)
static std::ostream & operator<<(std::ostream &out, const uint128_t &x)
Enables printing of uint128_t values.
groupby_descriptor_type ARGMAX(const std::string &agg, const std::string &out)
groupby_descriptor_type CONCAT(const std::string &col)
groupby_descriptor_type STDV(const std::string &col)
void write(T begin, T end, size_t segmentid)
groupby_descriptor_type COUNT_DISTINCT(const std::string &col)
groupby_descriptor_type AVG(const std::string &col)
std::shared_ptr< group_aggregate_value > m_aggregator
aggregator
groupby_descriptor_type MEAN(const std::string &col)
std::set< T > values(const std::map< Key, T > &map)
groupby_descriptor_type COUNT()
groupby_descriptor_type SELECT_ONE(const std::string &col)
groupby_descriptor_type VAR(const std::string &col)
static flexible_type FLEX_UNDEFINED
std::vector< std::string > m_group_columns
columns as input into the aggregator
groupby_descriptor_type STD(const std::string &col)
groupby_descriptor_type ARGMIN(const std::string &agg, const std::string &out)
groupby_descriptor_type make_aggregator(const std::vector< std::string > &group_columns, const Args &... args)