Turi Create  4.0
general_fstream.hpp
1 /* Copyright © 2017 Apple Inc. All rights reserved.
2  *
3  * Use of this source code is governed by a BSD-3-clause license that can
4  * be found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause
5  */
6 #ifndef FILEIO_GENERAL_ISTREAM_HPP
7 #define FILEIO_GENERAL_ISTREAM_HPP
8 #include <iostream>
9 #include <string>
10 #include <fstream>
11 #include <boost/iostreams/stream.hpp>
12 #include <boost/iostreams/filtering_stream.hpp>
13 #include <core/storage/fileio/general_fstream_source.hpp>
14 #include <core/storage/fileio/general_fstream_sink.hpp>
15 #include <core/export.hpp>
16 
17 namespace turi {
18 typedef boost::iostreams::stream<fileio_impl::general_fstream_source>
19  general_ifstream_base; ///< Boost.Iostreams stream over fileio_impl::general_fstream_source; base class of general_ifstream.
20 /**
21  * \ingroup fileio
22  * A generic input file stream interface that provides unified access to
23  * local filesystem, HDFS, S3, in memory files, and can automatically
24  * perform gzip decoding.
25  *
26  * Usage:
27  * \code
28  * general_ifstream fin("file");
29  * // after which fin behaves like a regular std::ifstream object.
30  * \endcode
31  *
32  * file can be:
33  * - local filesystem
34  * - S3 (in which case the filename must be of the form s3://...; see below)
35  * - HDFS (filename must be of the form hdfs://...)
36  * - In memory / disk paged (filename must be of the form cache://...)
37  *
38  * In all filesystems, random seek is allowed.
39  *
40  * If the file is gzip compressed, it will automatically be decoded on the fly,
41  * but random seeks will be disabled.
42  *
43  * S3 access keys are mediated by having the filename be of the form
44  * s3://[access_key_id]:[secret_key]:[endpoint/][bucket]/[object_name]
45  *
46  * Endpoint URLs, however, are set globally via the global variable S3_ENDPOINT.
47  */
48 class EXPORT general_ifstream : public general_ifstream_base {
49  private:
50  std::string opened_filename; // NOTE(review): presumably the name reported by filename(); confirm in the implementation file.
51  public:
52  /**
53  * Constructs a general ifstream object which opens the specified filename.
54  * The file may be on HDFS and may be gzip compressed. If the file
55  * is gzip compressed, the file must have the ".gz" suffix for it to be
56  * properly identified.
57  *
58  * Throws a std::ios_base::failure exception if the stream cannot be constructed.
59  */
60  general_ifstream(std::string filename);
61 
62  /**
63  * Constructs a general ifstream object which opens the specified filename.
64  * The file may be on HDFS and may be gzip compressed.
65  * This overloaded constructor allows you to explicitly specify whether the
66  * file is gzip compressed, regardless of the filename.
67  *
68  * Throws a std::ios_base::failure exception if the stream cannot be constructed.
69  */
70  general_ifstream(std::string filename, bool gzip_compressed);
71 
72  /**
73  * Returns the file size of the opened file.
74  * Returns (size_t)(-1) if there is no file opened, or if there is an
75  * error obtaining the file size.
76  */
77  size_t file_size();
78 
79  /**
80  * Returns the number of bytes read from disk so far. Due to file
81  * compression and buffering this can be very different from how many bytes
82  * were read from the stream.
83  */
84  size_t get_bytes_read();
85 
86  /**
87  * Returns the local file name used by the stream.
88  */
89  std::string filename() const;
90 
91  /**
92  * Returns the underlying stream object.
93  */
94  std::shared_ptr<std::istream> get_underlying_stream();
95 };
96 
97 
98 
99 typedef boost::iostreams::stream<fileio_impl::general_fstream_sink>
100  general_ofstream_base; ///< Boost.Iostreams stream over fileio_impl::general_fstream_sink; base class of general_ofstream.
101 
102 /**
103  * \ingroup fileio
104  * A generic output file stream interface that provides unified access to
105  * local filesystem, HDFS, S3, in memory files, and can automatically
106  * perform gzip compression.
107  *
108  * Usage:
109  * \code
110  * general_ofstream fout("file");
111  * // after which fout almost behaves like a regular std::ofstream object.
112  * \endcode
113  *
114  * file can be:
115  * - local filesystem
116  * - S3 (in which case the filename must be of the form s3://...; see below)
117  * - HDFS (filename must be of the form hdfs://...)
118  * - In memory / disk paged (filename must be of the form cache://...)
119  *
120  * Unlike standard std::ofstream, random seek is \b not allowed. In other words,
121  * only sequential write is permitted.
122  *
123  * If the filename ends with ".gz", gzip compression is automatically performed.
124  *
125  * S3 access keys are mediated by having the filename be of the form
126  * s3://[access_key_id]:[secret_key]:[endpoint/][bucket]/[object_name]
127  *
128  * Endpoint URLs, however, are set globally via the global variable S3_ENDPOINT.
129  */
130 class EXPORT general_ofstream: public general_ofstream_base {
131  private:
132  std::string opened_filename; // NOTE(review): presumably the name reported by filename(); confirm in the implementation file.
133  public:
134 
135  /**
136  * Constructs a general ofstream object which opens the specified filename.
137  * The file may be on HDFS. If the filename has the ".gz" suffix, it will be
138  * gzip compressed.
139  *
140  * Throws a std::ios_base::failure exception if the stream cannot be constructed.
141  */
142  general_ofstream(std::string filename);
143 
144 
145  /**
146  * Constructs a general ofstream object which opens the specified filename.
147  * The file may be on HDFS.
148  * This overloaded constructor allows you to explicitly specify whether
149  * the file is to be gzip compressed, regardless of the filename.
150  *
151  * Throws a std::ios_base::failure exception if the stream cannot be constructed.
152  */
153  general_ofstream(std::string filename, bool gzip_compress);
154 
155  /**
156  * Returns true if the stream is good. See std::ios_base
157  */
158  bool good() const;
159 
160  /**
161  * Returns true if the stream is bad. See std::ios_base
162  */
163  bool bad() const;
164 
165  /**
166  * Returns true if the last stream operation has failed. See std::ios_base
167  */
168  bool fail() const;
169 
170  /**
171  * Returns the number of bytes written to disk so far. Due to file
172  * compression and buffering this can be very different from how many bytes
173  * were written to the stream.
174  */
175  size_t get_bytes_written() const;
176 
177  /**
178  * Returns the local file name used by the stream.
179  */
180  std::string filename() const;
181 };
182 
183 } // namespace turi
184 
185 #endif // FILEIO_GENERAL_ISTREAM_HPP