Turi Create
4.0
|
The fileio library provides a collection of generic libraries to unify access to HDFS, S3, local filesystem and in-memory filesystem. More...
Classes | |
class | turi::block_cache |
class | turi::buffered_writer< ValueType, OutIterator > |
class | turi::file_download_cache |
class | turi::fileio::file_handle_pool |
struct | turi::fileio::file_ownership_handle |
struct | turi::fileio::cache_block |
class | turi::fileio::fixed_size_cache_manager |
class | turi::general_ifstream |
class | turi::general_ofstream |
class | turi::fileio_impl::general_fstream_sink |
class | turi::fileio_impl::general_fstream_source |
class | turi::read_caching_device< T > |
struct | turi::s3url |
struct | turi::list_objects_response |
class | turi::s3_device |
class | turi::union_fstream |
Typedefs | |
typedef boost::iostreams::stream< turi::fileio_impl::cache_stream_source > | turi::fileio::icache_stream |
typedef boost::iostreams::stream< turi::fileio_impl::cache_stream_sink > | turi::fileio::ocache_stream |
Enumerations | |
enum | turi::fileio::file_status |
Functions | |
int | turi::download_url (std::string url, std::string output_file) |
std::tuple< int, bool, std::string > | turi::download_url (std::string url) |
std::string | turi::fileio::get_system_temp_directory () |
std::string | turi::fileio::get_cache_prefix () |
std::string | turi::fileio::get_temp_cache_prefix () |
std::string | turi::fileio::get_cache_file_locations () |
void | turi::fileio::set_cache_file_locations (std::string) |
std::string | turi::fileio::get_cache_file_hdfs_location () |
const std::string & | turi::fileio::get_alternative_ssl_cert_dir () |
const std::string & | turi::fileio::get_alternative_ssl_cert_file () |
const bool | turi::fileio::insecure_ssl_cert_checks () |
std::pair< file_status, std::string > | turi::fileio::get_file_status (const std::string &path) |
std::vector< std::pair< std::string, file_status > > | turi::fileio::get_directory_listing (const std::string &path) |
bool | turi::fileio::create_directory (const std::string &path) |
bool | turi::fileio::create_directory_or_throw (const std::string &path) |
bool | turi::fileio::delete_path (const std::string &path, file_status status=file_status::FS_UNAVAILABLE) |
bool | turi::fileio::delete_path_impl (const std::string &path, file_status status=file_status::FS_UNAVAILABLE) |
bool | turi::fileio::delete_path_recursive (const std::string &path) |
std::tuple< std::string, std::string, std::string > | turi::fileio::parse_hdfs_url (std::string url) |
std::string | turi::fileio::remove_protocol (std::string path) |
std::string | turi::fileio::get_filename (std::string path) |
std::string | turi::fileio::get_dirname (std::string path) |
std::string | turi::fileio::convert_to_generic (const std::string &path) |
std::string | turi::fileio::make_relative_path (std::string root_directory, std::string path) |
std::string | turi::fileio::make_absolute_path (std::string root_directory, std::string path) |
std::vector< std::pair< std::string, file_status > > | turi::fileio::get_glob_files (const std::string &url) |
size_t | turi::fileio::get_io_parallelism_id (const std::string url) |
bool | turi::fileio::try_to_open_file (const std::string url) |
void | turi::fileio::copy (const std::string src, const std::string dest) |
bool | turi::fileio::change_file_mode (const std::string path, short mode) |
std::string | turi::fileio::make_canonical_path (const std::string &path) |
std::vector< std::string > | turi::fileio::get_s3_endpoints () |
std::string | turi::fileio::get_region_name_from_endpoint (std::string endpoint) |
std::string | turi::fileio::get_bucket_path (const std::string &bucket) |
Aws::S3::S3Client | turi::init_aws_sdk_with_turi_env (s3url &parsed_url) |
std::string | turi::get_s3_file_last_modified (const std::string &url) |
list_objects_response | turi::list_objects (std::string s3_url, std::string proxy="") |
list_objects_response | turi::list_directory (std::string s3_url, std::string proxy="") |
std::string | turi::delete_object (std::string s3_url, std::string proxy="") |
std::string | turi::delete_prefix (std::string s3_url, std::string proxy="") |
std::string | turi::sanitize_s3_url (const std::string &url) |
bool | turi::parse_s3url (const std::string &url, s3url &ret, std::string &err_msg) |
void | turi::set_upload_timeout (long timeout) |
void | turi::set_download_timeout (long timeout) |
std::string | turi::sanitize_url (std::string url) |
void | turi::fileio::set_curl_options (void *ecurl) |
std::string | turi::get_system_user_name () |
std::string | turi::get_temp_name (const std::string &prefix="", bool _prefer_hdfs=false) |
std::string | turi::get_temp_name_prefer_hdfs (const std::string &prefix="") |
bool | turi::delete_temp_file (std::string s) |
void | turi::delete_temp_files (std::vector< std::string > files) |
void | turi::reap_unused_temp_files () |
void | turi::reap_current_process_temp_files () |
std::vector< std::string > | turi::get_temp_directories () |
size_t | turi::num_temp_directories () |
Variables | |
const size_t | turi::fileio::FILEIO_INITIAL_CAPACITY_PER_FILE |
size_t | turi::fileio::FILEIO_MAXIMUM_CACHE_CAPACITY_PER_FILE |
size_t | turi::fileio::FILEIO_MAXIMUM_CACHE_CAPACITY |
size_t | turi::fileio::FILEIO_READER_BUFFER_SIZE |
size_t | turi::fileio::FILEIO_WRITER_BUFFER_SIZE |
std::string | turi::fileio::S3_ENDPOINT |
std::string | turi::fileio::S3_REGION |
int64_t | turi::fileio::NUM_GPUS |
The fileio library provides a collection of generic libraries to unify access to HDFS, S3, local filesystem and in-memory filesystem.
The two key objects everybody should care about are turi::general_ifstream and turi::general_ofstream. Almost everything else is implementation detail. boost::stream is used liberally to provide compatibility.
Generally,
typedef boost::iostreams::stream<turi::fileio_impl::cache_stream_source> turi::fileio::icache_stream |
icache_stream provides an input stream to a cache object; this should not be used directly.
Definition at line 22 of file cache_stream.hpp.
typedef boost::iostreams::stream<turi::fileio_impl::cache_stream_sink> turi::fileio::ocache_stream |
ocache_stream provides an output stream to a cache object; this should not be used directly.
Definition at line 32 of file cache_stream.hpp.
|
strong |
Return values of get_file_status
Definition at line 24 of file fs_utils.hpp.
bool turi::fileio::change_file_mode | ( | const std::string | path, |
short | mode | ||
) |
Changes the file mode bits of the given file or directory in the url
std::string turi::fileio::convert_to_generic | ( | const std::string & | path | ) |
Converts the path to a generic format for operation.
Currently, all this means is that backslashes are converted to forward slashes.
void turi::fileio::copy | ( | const std::string | src, |
const std::string | dest | ||
) |
Copies a file from src to dest
bool turi::fileio::create_directory | ( | const std::string & | path | ) |
Creates a directory and all parent required directories (like mkdir -p). Path can be hdfs, s3, or regular filesystem. Returns true on creation, false on failure or if the directory already exists. To get meaningful error messages thrown on failure, use create_directory_or_throw.
bool turi::fileio::create_directory_or_throw | ( | const std::string & | path | ) |
Creates a directory and all parent required directories (like mkdir -p). Path can be hdfs, s3, or regular filesystem. Returns true on creation, false if the directory already exists. Throws std::ios_base::failure on failure.
std::string turi::delete_object | ( | std::string | s3_url, |
std::string | proxy = "" |
||
) |
Where url points to a single object, this deletes the object. Returns an empty string on success, and an error string on failure.
bool turi::fileio::delete_path | ( | const std::string & | path, |
file_status | status = file_status::FS_UNAVAILABLE |
||
) |
Tries to delete a given path. Path can be hdfs, s3, or regular filesystem. If the path is a directory, this tries to remove all files under the directory. If the path is a file, the file will be deleted immediately if it is not currently in use; otherwise the file will be removed later, once it is no longer used by anyone. If the path doesn't exist, this returns true. Returns true on success, false on failure.
path | The path to delete |
status | The file status if known. (Optional) |
bool turi::fileio::delete_path_impl | ( | const std::string & | path, |
file_status | status = file_status::FS_UNAVAILABLE |
||
) |
Deletes a path. Internal function not meant to be called by external components
If the path is a directory, this tries to remove all files under the directory. If the path is a file, the file will be removed. If the path doesn't exist, this returns true. Returns true on success, false on failure.
path | The path to delete |
status | The file status if known. (Optional) |
bool turi::fileio::delete_path_recursive | ( | const std::string & | path | ) |
Deletes a path. If path is a directory, deletion will delete all files and directories it contains. Path can be hdfs, s3, or regular filesystem. If path doesn't exist, this returns true. Returns true on success, false on failure.
std::string turi::delete_prefix | ( | std::string | s3_url, |
std::string | proxy = "" |
||
) |
Where url points to a prefix, this deletes all objects with the specified prefix. Returns an empty string on success, and an error string on failure.
bool turi::delete_temp_file | ( | std::string | s | ) |
Deletes the temporary file with the name s. Returns true on success, false on failure (file does not exist, or cannot be deleted). The file will only be deleted if a prefix of s was previously returned by get_temp_name(). This is done for safety to prevent this function from being used to delete arbitrary files.
For instance, if get_temp_name() previously returned /tmp/file51apTO , delete_temp_file will succeed on /tmp/file51apTO.csv . delete_temp_file will fail on stuff like /usr/bin/bash
void turi::delete_temp_files | ( | std::vector< std::string > | files | ) |
Deletes a collection of temporary files. Each file will only be deleted if a prefix of its name was previously returned by get_temp_name(). This is done for safety to prevent this function from being used to delete arbitrary files.
For instance, if get_temp_name() previously returned /tmp/file51apTO , delete_temp_files will succeed on a collection of files {/tmp/file51apTO.csv, /tmp/file51apTO.txt}. delete_temp_files will fail on stuff like /usr/bin/bash
int turi::download_url | ( | std::string | url, |
std::string | output_file | ||
) |
Downloads a given URL into a given output file
Returns 0 on success, non-zero (a curl error code) on failure.
std::tuple<int, bool, std::string> turi::download_url | ( | std::string | url | ) |
Downloads a given URL, returning the local filename it has been downloaded to. If the url is a remote URL, the URL will be downloaded to a temporary local file (created using tmpnam), and the local file name returned. If the url is a local file, the local filename will be returned directly.
Returns 0 on success, non-zero (a curl error code) on failure.
const std::string& turi::fileio::get_alternative_ssl_cert_dir | ( | ) |
Gets the alternative ssl certificate file and directory.
const std::string& turi::fileio::get_alternative_ssl_cert_file | ( | ) |
Gets the alternative ssl certificate file and directory.
std::string turi::fileio::get_bucket_path | ( | const std::string & | bucket | ) |
Returns an S3 bucket specific path. On regular S3 this returns the virtualhosting style bucket. On other explicitly specified endpoints, this returns $S3_ENDPOINT/[bucket]/
For consistency, the returned bucket path will always end with a "/"
std::string turi::fileio::get_cache_file_hdfs_location | ( | ) |
Additional HDFS location for storing large temp files.
std::string turi::fileio::get_cache_file_locations | ( | ) |
Gets the physical directories (e.g. /var/tmp) in which all cached files are located, as a colon-separated list.
std::string turi::fileio::get_cache_prefix | ( | ) |
The protocol prefix cache:// to identify a cached file.
std::vector<std::pair<std::string, file_status> > turi::fileio::get_directory_listing | ( | const std::string & | path | ) |
Enumerates the contents of a directory, listing all the files as well as the file type. Path can be hdfs, s3, or regular filesystem.
std::string turi::fileio::get_dirname | ( | std::string | path | ) |
Extracts the directory name from a fully qualified path. So given: s3://bucket/data/123 This will return "s3://bucket/data"
In short, this will return everything to the left of the last trailing "/".
std::pair<file_status, std::string> turi::fileio::get_file_status | ( | const std::string & | path | ) |
Checks a path (can be hdfs, s3, or regular) to see if it is a local path, or a remote path.
If the file is missing, the second value of the returned pair holds the error message, if one is provided.
std::string turi::fileio::get_filename | ( | std::string | path | ) |
Extracts the file name from a fully qualified path. So given: s3://bucket/data/123 This will return "123".
In short, this will return everything to the right of the last trailing "/".
std::vector<std::pair<std::string, file_status> > turi::fileio::get_glob_files | ( | const std::string & | url | ) |
Where URL is a glob of the form directory1/directory2/[glob] (glob must only be on the file portion), returns a list of files matching the glob pattern.
size_t turi::fileio::get_io_parallelism_id | ( | const std::string | url | ) |
Given a URL, returns an ID value where URLs which return different ID values are ok to be read in parallel, and URLs which return the same ID value are probably sub-optimal if read in parallel; An ID of (size_t)(-1) indicates that it can be read in parallel with everything.
std::string turi::fileio::get_region_name_from_endpoint | ( | std::string | endpoint | ) |
Gets a region name from the endpoint url.
std::vector<std::string> turi::fileio::get_s3_endpoints | ( | ) |
Returns a complete list of all available S3 region-specific endpoints.
std::string turi::get_s3_file_last_modified | ( | const std::string & | url | ) |
Get the last modified time stamp of file.
Throw exception if the url cannot be fetched.
Return empty string if last modified is not available, e.g. the url is a directory path or file does not exist.
std::string turi::fileio::get_system_temp_directory | ( | ) |
Returns the system temporary directory
std::string turi::get_system_user_name | ( | ) |
Get the current system user name.
std::string turi::fileio::get_temp_cache_prefix | ( | ) |
The "directory" (cache://tmp/) which all cached files are located in
std::vector<std::string> turi::get_temp_directories | ( | ) |
Returns the set of temp directories
std::string turi::get_temp_name | ( | const std::string & | prefix = "" , |
bool | _prefer_hdfs = false |
||
) |
Returns a file name which can be used for a temp file. Returns an empty string on failure, a temporary file name on success. The file name returned is allowed to be a "prefix", i.e. arbitrary extensions can be attached to the tail of the file name. For instance, if get_temp_name() returns /tmp/file51apTO, you can use /tmp/file51apTO.csv
prefix | Optional. If specified, this exact prefix will be returned in the temporary path. ex: /var/tmp/turicreate-user/12345/[prefix]. If an empty string or not specified, a random unique prefix will be generated. |
_prefer_hdfs | Optional, defaults to false. If true, prefers to use HDFS if available. |
Note that if you specify your own prefix it is up to you to manage collisions, i.e. multiple parts of the program using the same prefix for instance.
std::string turi::get_temp_name_prefer_hdfs | ( | const std::string & | prefix = "" | ) |
Same as get_temp_name but returns the temp file on hdfs if available. The hdfs temp file location is a runtime configurable variable TURI_CACHE_FILE_HDFS_LOCATION defined in fileio_constant.hpp.
prefix | Optional. If specified, this exact prefix will be returned in the temporary path. ex: /var/tmp/turicreate-user/12345/[prefix]. If an empty string or not specified, a random unique prefix will be generated. |
Note that if you specify your own prefix it is up to you to manage collisions, i.e. multiple parts of the program using the same prefix for instance.
Aws::S3::S3Client turi::init_aws_sdk_with_turi_env | ( | s3url & | parsed_url | ) |
Initializes the SDK with Turi customized environment variables.
This will set the endpoint/region that is used to configure the client.
this call will modify optional sdk_* members
const bool turi::fileio::insecure_ssl_cert_checks | ( | ) |
If true, ssl certificate checks are disabled.
list_objects_response turi::list_directory | ( | std::string | s3_url, |
std::string | proxy = "" |
||
) |
Lists all objects prefixed by a given s3 url.
If s3_url points to a valid prefix, it returns the prefix's contents like a directory. For instance, if an S3 bucket contains
foo/hello.txt
list_directory("s3://foo") will return "foo/hello.txt"
If s3_url points to an object it will just return the object.
list_objects_response turi::list_objects | ( | std::string | s3_url, |
std::string | proxy = "" |
||
) |
Lists objects or prefixes prefixed by a given s3 url.
This is a thin wrapper around the S3 API http://docs.aws.amazon.com/AmazonS3/latest/API/RESTBucketGET.html and may not quite do what you think it does.
If s3_url points to a valid prefix, it will return only the prefix as a directory. For instance, if an S3 bucket contains
foo/hello.txt
list_objects("s3://foo") will return simply "foo/" as a directory.
See list_directory() and is_directory() for a more sensible implementation which behaves somewhat more file system like.
std::string turi::fileio::make_absolute_path | ( | std::string | root_directory, |
std::string | path | ||
) |
Given a root directory and a relative path, tries to convert the relative path to an absolute path. If the path is already an absolute path, returns the original path with no changes.
This function is quite limited. It will not handle "../" structures in the relative path. In other words, the output path must point to a file/folder inside of the root directory.
Example: make_absolute_path("s3://bucket/data", "123") returns "s3://bucket/data/123".
make_absolute_path("s3://bucket/data", "s3://foo/123") returns "s3://foo/123".
std::string turi::fileio::make_canonical_path | ( | const std::string & | path | ) |
Return canonical absolute path, eliminating dots, and symlinks
std::string turi::fileio::make_relative_path | ( | std::string | root_directory, |
std::string | path | ||
) |
Given a root directory and an absolute path, tries to create a relative path address between root_directory and the path; if not possible, returns the original path with no changes.
This function is relatively limited. It will not add "../" structures to the returned relative path. In other words, the path must point to a file/folder inside of the root directory for this to return a relative path.
Example: make_relative_path("s3://bucket/data", "s3://bucket/data/123") returns "123".
make_relative_path("s3://bucket/data", "s3://foo/123") returns "s3://foo/123".
size_t turi::num_temp_directories | ( | ) |
Returns the number of temp directories
std::tuple<std::string, std::string, std::string> turi::fileio::parse_hdfs_url | ( | std::string | url | ) |
A helper function to parse the hdfs url. Return a tuple of host, port, and path.
bool turi::parse_s3url | ( | const std::string & | url, |
s3url & | ret, | ||
std::string & | err_msg | ||
) |
This splits a URL of the form s3://[access_key_id]:[secret_key]:[endpoint/][bucket]/[object_name] into several pieces.
endpoint and object_name are optional.
Returns true on success, false on failure.
void turi::reap_current_process_temp_files | ( | ) |
Deletes all temp files created by the current process
void turi::reap_unused_temp_files | ( | ) |
Deletes all temporary directories in the temporary turicreate/ directory (/var/tmp/turicreate) which are no longer used, i.e. were created by a process which no longer exists.
std::string turi::fileio::remove_protocol | ( | std::string | path | ) |
Returns the path with the protocol header removed, if there is one.
remove_protocol("http://www.google.com") == "www.google.com" remove_protocol("s3://www.google.com") == "www.google.com" remove_protocol("/root/test") == "/root/test" remove_protocol("file:///root/test") == "/root/test"
std::string turi::sanitize_s3_url | ( | const std::string & | url | ) |
Given an S3 URL of the form expected by parse_s3url, this function drops the access_key_id and the secret_key from the string returning s3://[bucket]/[object_name]
If the url cannot be parsed, we try our best to remove information associated with ':'.
If the url does not begin with s3://, return as is.
std::string turi::sanitize_url | ( | std::string | url | ) |
Sanitizes a general_fstream URL so that it is suitable for printing; right now, all it does is to drop all credential information when the protocol is s3.
void turi::fileio::set_cache_file_locations | ( | std::string | ) |
Sets the physical directories (e.g. /var/tmp) in which all cached files are located, as a colon-separated list.
void turi::fileio::set_curl_options | ( | void * | ecurl | ) |
Sets curl options for everywhere curl is used.
void turi::set_download_timeout | ( | long | timeout | ) |
Set the timeout for S3 download.
timeout | Timeout value in secs. |
void turi::set_upload_timeout | ( | long | timeout | ) |
Set the timeout for S3 upload.
timeout | Timeout value in secs. |
bool turi::fileio::try_to_open_file | ( | const std::string | url | ) |
Returns true if the file can be opened. False otherwise.
const size_t turi::fileio::FILEIO_INITIAL_CAPACITY_PER_FILE |
The initial memory capacity assigned to caches
size_t turi::fileio::FILEIO_MAXIMUM_CACHE_CAPACITY |
The maximum memory capacity used by all cached files before they have to be flushed.
size_t turi::fileio::FILEIO_MAXIMUM_CACHE_CAPACITY_PER_FILE |
The maximum memory capacity assigned to a cached file until it has to be flushed.
size_t turi::fileio::FILEIO_READER_BUFFER_SIZE |
The default fileio reader buffer size
size_t turi::fileio::FILEIO_WRITER_BUFFER_SIZE |
The default fileio writer buffer size
int64_t turi::fileio::NUM_GPUS |
The number of GPUs.
std::string turi::fileio::S3_ENDPOINT |
The S3 connection endpoint; if empty string, S3 is assumed.
std::string turi::fileio::S3_REGION |
The S3 connection region; if empty string, region will be guessed by: