6 #ifndef TURI_UNITY_LIB_SFRAME_HPP 7 #define TURI_UNITY_LIB_SFRAME_HPP 13 #include <core/data/flexible_type/flexible_type.hpp> 14 #include <core/storage/sframe_data/sarray.hpp> 15 #include <core/storage/sframe_data/dataframe.hpp> 16 #include <core/storage/sframe_data/sframe_index_file.hpp> 17 #include <core/storage/sframe_data/sframe_constants.hpp> 18 #include <core/storage/sframe_data/output_iterator.hpp> 24 struct csv_line_tokenizer;
29 std::vector<flexible_type>,
30 std::function<void(const std::vector<flexible_type>&)>,
31 std::function<
void(std::vector<flexible_type>&&)>,
32 std::function<void(const sframe_rows&)> >
33 sframe_output_iterator;
103 (*this) = std::move(other);
128 explicit inline sframe(std::string frame_idx_file) {
162 bool fail_on_column_names=
true) {
185 std::map<std::string, std::shared_ptr<sarray<flexible_type>>>
init_from_csvs(
186 const std::string& path,
189 bool continue_on_failure,
191 std::map<std::string, flex_type_enum> column_type_hints,
192 std::vector<std::string> output_columns = std::vector<std::string>(),
193 size_t row_limit = 0,
194 size_t skip_rows = 0);
217 ASSERT_MSG(!inited,
"Attempting to init an SFrame " 218 "which has already been inited.");
220 create_arrays_for_reading(frame_index_info);
231 bool fail_on_column_names=
true) {
233 ASSERT_MSG(!inited,
"Attempting to init an SFrame " 234 "which has already been inited.");
236 create_arrays_for_reading(new_columns,
column_names, fail_on_column_names);
267 const std::string& frame_sidx_file =
"",
269 bool fail_on_column_names=
true) {
271 ASSERT_MSG(!inited,
"Attempting to init an SFrame " 272 "which has already been inited.");
273 if (column_names.size() != column_types.size()) {
274 log_and_throw(std::string(
"Names and Types array length mismatch"));
277 create_arrays_for_writing(column_names, column_types,
278 nsegments, frame_sidx_file, fail_on_column_names);
292 return (inited && !writing);
301 return (inited && writing);
318 inline bool get_metadata(
const std::string& key, std::string &val)
const {
330 inline std::pair<bool, std::string>
get_metadata(
const std::string& key)
const {
331 ASSERT_MSG(inited,
"Invalid SFrame");
332 if (index_info.
metadata.count(key)) {
333 return std::pair<bool, std::string>(
true, index_info.
metadata.at(key));
335 return std::pair<bool, std::string>(
false,
"");
355 return inited ? index_info.
nrows : 0;
364 log_and_throw(
"Column index out of range!");
376 if(i >= group_writer->get_index_info().columns.size()) {
377 log_and_throw(
"Column index out of range!");
380 (atoi(group_writer->get_index_info().columns[i].metadata[
"__type__"].c_str()));
382 if(i >= columns.size()) {
383 log_and_throw(
"Column index out of range!");
385 return columns[i]->get_type();
431 ASSERT_MSG(inited,
"Invalid SFrame");
433 return group_writer->num_segments();
435 if (index_info.
ncolumns == 0)
return 0;
436 return columns[0]->num_segments();
446 DASSERT_MSG(inited,
"Invalid SFrame");
447 if (index_info.
ncolumns == 0)
return 0;
448 else return columns[0]->segment_length(i);
464 log_and_throw(std::string(
"Column name " + column_name +
" does not exist."));
466 __builtin_unreachable();
487 std::unique_ptr<reader_type>
get_reader()
const;
500 std::unique_ptr<reader_type>
get_reader(
const std::vector<size_t>& segment_lengths)
const;
519 std::shared_ptr<sarray<flexible_type> >
select_column(
size_t column_id)
const;
526 std::shared_ptr<sarray<flexible_type> >
select_column(
const std::string &name)
const;
548 const std::string&
column_name=std::string(
""))
const;
656 bool set_metadata(
const std::string& key, std::string val);
662 void save(std::string index_file)
const;
683 bool delete_files_on_destruction();
690 std::shared_ptr<sarray_group_format_writer<flexible_type> >
707 void write(
size_t segmentid,
const std::vector<flexible_type>& t);
714 void write(
size_t segmentid, std::vector<flexible_type>&& t);
721 void write(
size_t segmentid,
const sframe_rows& t);
737 void create_arrays_for_reading(
740 bool fail_on_column_names=
true);
746 void create_arrays_for_writing(
const std::vector<std::string>&
column_names,
749 const std::string& frame_sidx_file,
750 bool fail_on_column_names);
752 void keep_array_file_ref();
756 std::string generate_valid_column_name(
const std::string &column_name)
const;
759 std::string index_file;
760 std::vector<std::shared_ptr<fileio::file_ownership_handle> > index_file_handle;
762 std::vector<std::shared_ptr<sarray<flexible_type> > > columns;
763 std::shared_ptr<sarray_group_format_writer<flexible_type> > group_writer;
768 bool writing =
false;
783 #include <core/storage/sframe_data/sframe_reader.hpp>
sframe_index_file_information read_sframe_index_file(std::string index_file)
sframe(sframe_index_file_information frame_index_info)
void set_column_name(size_t column_id, const std::string &name)
The serialization input archive object. Given a reference to an istream, it reads from that stream to provide deserialization capabilities.
void save(std::string index_file) const
std::pair< bool, std::string > get_metadata(const std::string &key) const
void open_for_read(const std::vector< std::shared_ptr< sarray< flexible_type > > > &new_columns, const std::vector< std::string > &column_names={}, bool fail_on_column_names=true)
flex_type_enum column_type(size_t i) const
std::shared_ptr< sarray_group_format_writer< flexible_type > > get_internal_writer()
size_t column_index(const std::string &column_name) const
const std::vector< std::string > & column_names() const
sframe remove_column(size_t column_id) const
bool set_metadata(const std::string &key, std::string val)
sframe swap_columns(size_t column_1, size_t column_2) const
void load(iarchive &iarc)
sframe(const std::vector< std::shared_ptr< sarray< flexible_type > > > &new_columns, const std::vector< std::string > &column_names={}, bool fail_on_column_names=true)
sframe add_column(std::shared_ptr< sarray< flexible_type > > sarr_ptr, const std::string &column_name=std::string("")) const
size_t num_rows() const
Returns the length of each sarray.
iterator get_output_iterator(size_t segmentid)
sframe_output_iterator iterator
The iterator type which get_output_iterator returns.
bool is_opened_for_read() const
const std::string & get_index_file() const
size_t SFRAME_DEFAULT_NUM_SEGMENTS
size_t segment_length(size_t i) const
bool contains_column(const std::string &column_name) const
sframe replace_column(std::shared_ptr< sarray< flexible_type >> sarr_ptr, const std::string &column_name) const
sframe & operator=(const sframe &other)
sframe select_columns(const std::vector< std::string > &names) const
const sframe_index_file_information get_index_info() const
std::shared_ptr< sarray< flexible_type > > select_column(size_t column_id) const
#define ASSERT_TRUE(cond)
size_t num_segments() const
std::map< std::string, std::shared_ptr< sarray< flexible_type > > > init_from_csvs(const std::string &path, csv_line_tokenizer &tokenizer, bool use_header, bool continue_on_failure, bool store_errors, std::map< std::string, flex_type_enum > column_type_hints, std::vector< std::string > output_columns=std::vector< std::string >(), size_t row_limit=0, size_t skip_rows=0)
void open_for_write(const std::vector< std::string > &column_names, const std::vector< flex_type_enum > &column_types, const std::string &frame_sidx_file="", size_t nsegments=SFRAME_DEFAULT_NUM_SEGMENTS, bool fail_on_column_names=true)
std::string column_name(size_t i) const
sframe_reader reader_type
The reader type.
std::unique_ptr< reader_type > get_reader() const
bool get_metadata(const std::string &key, std::string &val) const
size_t num_columns() const
Returns the number of columns in the SFrame. Does not throw.
void open_for_read(sframe_index_file_information frame_index_info)
bool set_num_segments(size_t numseg)
std::vector< flex_type_enum > column_types() const
The serialization output archive object. Given a reference to an ostream, it writes to that stream to provide serialization capabilities.
void flush_write_to_segment(size_t segment)
bool is_opened_for_write() const
dataframe_t to_dataframe()
sframe(std::string frame_idx_file)
void save_as_csv(std::string csv_file, csv_writer &writer)
std::vector< flexible_type > value_type
The type contained in the sframe.
sframe append(const sframe &other) const
flex_type_enum column_type(const std::string &column_name) const