Turi Create  4.0
File IO Library

The fileio library provides a collection of generic libraries to unify access to HDFS, S3, local filesystem and in-memory filesystem. More...

Classes

class  turi::block_cache
 
class  turi::buffered_writer< ValueType, OutIterator >
 
class  turi::file_download_cache
 
class  turi::fileio::file_handle_pool
 
struct  turi::fileio::file_ownership_handle
 
struct  turi::fileio::cache_block
 
class  turi::fileio::fixed_size_cache_manager
 
class  turi::general_ifstream
 
class  turi::general_ofstream
 
class  turi::fileio_impl::general_fstream_sink
 
class  turi::fileio_impl::general_fstream_source
 
class  turi::read_caching_device< T >
 
struct  turi::s3url
 
struct  turi::list_objects_response
 
class  turi::s3_device
 
class  turi::union_fstream
 

Typedefs

typedef boost::iostreams::stream< turi::fileio_impl::cache_stream_source > turi::fileio::icache_stream
 
typedef boost::iostreams::stream< turi::fileio_impl::cache_stream_sink > turi::fileio::ocache_stream
 

Enumerations

enum  turi::fileio::file_status
 

Functions

int turi::download_url (std::string url, std::string output_file)
 
std::tuple< int, bool, std::string > turi::download_url (std::string url)
 
std::string turi::fileio::get_system_temp_directory ()
 
std::string turi::fileio::get_cache_prefix ()
 
std::string turi::fileio::get_temp_cache_prefix ()
 
std::string turi::fileio::get_cache_file_locations ()
 
void turi::fileio::set_cache_file_locations (std::string)
 
std::string turi::fileio::get_cache_file_hdfs_location ()
 
const std::string & turi::fileio::get_alternative_ssl_cert_dir ()
 
const std::string & turi::fileio::get_alternative_ssl_cert_file ()
 
const bool turi::fileio::insecure_ssl_cert_checks ()
 
std::pair< file_status, std::string > turi::fileio::get_file_status (const std::string &path)
 
std::vector< std::pair< std::string, file_status > > turi::fileio::get_directory_listing (const std::string &path)
 
bool turi::fileio::create_directory (const std::string &path)
 
bool turi::fileio::create_directory_or_throw (const std::string &path)
 
bool turi::fileio::delete_path (const std::string &path, file_status status=file_status::FS_UNAVAILABLE)
 
bool turi::fileio::delete_path_impl (const std::string &path, file_status status=file_status::FS_UNAVAILABLE)
 
bool turi::fileio::delete_path_recursive (const std::string &path)
 
std::tuple< std::string, std::string, std::string > turi::fileio::parse_hdfs_url (std::string url)
 
std::string turi::fileio::remove_protocol (std::string path)
 
std::string turi::fileio::get_filename (std::string path)
 
std::string turi::fileio::get_dirname (std::string path)
 
std::string turi::fileio::convert_to_generic (const std::string &path)
 
std::string turi::fileio::make_relative_path (std::string root_directory, std::string path)
 
std::string turi::fileio::make_absolute_path (std::string root_directory, std::string path)
 
std::vector< std::pair< std::string, file_status > > turi::fileio::get_glob_files (const std::string &url)
 
size_t turi::fileio::get_io_parallelism_id (const std::string url)
 
bool turi::fileio::try_to_open_file (const std::string url)
 
void turi::fileio::copy (const std::string src, const std::string dest)
 
bool turi::fileio::change_file_mode (const std::string path, short mode)
 
std::string turi::fileio::make_canonical_path (const std::string &path)
 
std::vector< std::string > turi::fileio::get_s3_endpoints ()
 
std::string turi::fileio::get_region_name_from_endpoint (std::string endpoint)
 
std::string turi::fileio::get_bucket_path (const std::string &bucket)
 
Aws::S3::S3Client turi::init_aws_sdk_with_turi_env (s3url &parsed_url)
 
std::string turi::get_s3_file_last_modified (const std::string &url)
 
list_objects_response turi::list_objects (std::string s3_url, std::string proxy="")
 
list_objects_response turi::list_directory (std::string s3_url, std::string proxy="")
 
std::string turi::delete_object (std::string s3_url, std::string proxy="")
 
std::string turi::delete_prefix (std::string s3_url, std::string proxy="")
 
std::string turi::sanitize_s3_url (const std::string &url)
 
bool turi::parse_s3url (const std::string &url, s3url &ret, std::string &err_msg)
 
void turi::set_upload_timeout (long timeout)
 
void turi::set_download_timeout (long timeout)
 
std::string turi::sanitize_url (std::string url)
 
void turi::fileio::set_curl_options (void *ecurl)
 
std::string turi::get_system_user_name ()
 
std::string turi::get_temp_name (const std::string &prefix="", bool _prefer_hdfs=false)
 
std::string turi::get_temp_name_prefer_hdfs (const std::string &prefix="")
 
bool turi::delete_temp_file (std::string s)
 
void turi::delete_temp_files (std::vector< std::string > files)
 
void turi::reap_unused_temp_files ()
 
void turi::reap_current_process_temp_files ()
 
std::vector< std::string > turi::get_temp_directories ()
 
size_t turi::num_temp_directories ()
 

Variables

const size_t turi::fileio::FILEIO_INITIAL_CAPACITY_PER_FILE
 
size_t turi::fileio::FILEIO_MAXIMUM_CACHE_CAPACITY_PER_FILE
 
size_t turi::fileio::FILEIO_MAXIMUM_CACHE_CAPACITY
 
size_t turi::fileio::FILEIO_READER_BUFFER_SIZE
 
size_t turi::fileio::FILEIO_WRITER_BUFFER_SIZE
 
std::string turi::fileio::S3_ENDPOINT
 
std::string turi::fileio::S3_REGION
 
int64_t turi::fileio::NUM_GPUS
 

Detailed Description

The fileio library provides a collection of generic libraries to unify access to HDFS, S3, local filesystem and in-memory filesystem.

The two key objects everybody should care about are turi::general_ifstream and turi::general_ofstream. Almost everything else is implementation detail. boost::stream is used liberally to provide compatibility.

Generally,

Typedef Documentation

◆ icache_stream

icache_stream provides an input stream to a cache object; this should not be used directly.

See also
general_ifstream

Definition at line 22 of file cache_stream.hpp.

◆ ocache_stream

ocache_stream provides an output stream to a cache object; this should not be used directly.

See also
general_ofstream

Definition at line 32 of file cache_stream.hpp.

Enumeration Type Documentation

◆ file_status

Return values of get_file_status

Definition at line 24 of file fs_utils.hpp.

Function Documentation

◆ change_file_mode()

bool turi::fileio::change_file_mode ( const std::string  path,
short  mode 
)

Changes the file mode bits of the given file or directory in the url

◆ convert_to_generic()

std::string turi::fileio::convert_to_generic ( const std::string &  path)

Converts the path to a generic format for operation.

Currently, all this means is that backslashes are converted to forward slashes.

◆ copy()

void turi::fileio::copy ( const std::string  src,
const std::string  dest 
)

Copies a file from src to dest

◆ create_directory()

bool turi::fileio::create_directory ( const std::string &  path)

Creates a directory and all parent required directories (like mkdir -p). Path can be hdfs, s3, or regular filesystem. Returns true on creation, false on failure or if the directory already exists. To get meaningful error messages thrown on failure, use create_directory_or_throw.

◆ create_directory_or_throw()

bool turi::fileio::create_directory_or_throw ( const std::string &  path)

Creates a directory and all parent required directories (like mkdir -p). Path can be hdfs, s3, or regular filesystem. Returns true on creation, false if the directory already exists. Throws std::ios_base::failure on failure.

◆ delete_object()

std::string turi::delete_object ( std::string  s3_url,
std::string  proxy = "" 
)

Where url points to a single object, this deletes the object. Returns an empty string on success, and an error string on failure.

◆ delete_path()

bool turi::fileio::delete_path ( const std::string &  path,
file_status  status = file_status::FS_UNAVAILABLE 
)

Tries to delete a given path. Path can be hdfs, s3, or regular filesystem. If the path is a directory, tries to remove all files under the directory. If the path is a file, the file is deleted immediately if it is not currently in use; otherwise it is removed later, once it is no longer used by anyone. If the path doesn't exist, this returns true. Returns true on success, false on failure.

Parameters
path: The path to delete
status: The file status if known. (Optional)

◆ delete_path_impl()

bool turi::fileio::delete_path_impl ( const std::string &  path,
file_status  status = file_status::FS_UNAVAILABLE 
)

Deletes a path. Internal function not meant to be called by external components

If the path is a directory, tries to remove all files under the directory. If the path is a file, the file is removed. If the path doesn't exist, this returns true. Returns true on success, false on failure.

Parameters
path: The path to delete
status: The file status if known. (Optional)

◆ delete_path_recursive()

bool turi::fileio::delete_path_recursive ( const std::string &  path)

Deletes a path. If path is a directory, deletion will delete all files and directories it contains. Path can be hdfs, s3, or regular filesystem. If path doesn't exist, this returns true. Returns true on success, false on failure.

◆ delete_prefix()

std::string turi::delete_prefix ( std::string  s3_url,
std::string  proxy = "" 
)

Where url points to a prefix, this deletes all objects with the specified prefix. Returns an empty string on success, and an error string on failure.

◆ delete_temp_file()

bool turi::delete_temp_file ( std::string  s)

Deletes the temporary file with the name s. Returns true on success, false on failure (file does not exist, or cannot be deleted). The file will only be deleted if a prefix of s was previously returned by get_temp_name(). This is done for safety to prevent this function from being used to delete arbitrary files.

For instance, if get_temp_name() previously returned /tmp/file51apTO , delete_temp_file will succeed on /tmp/file51apTO.csv . delete_temp_file will fail on stuff like /usr/bin/bash

◆ delete_temp_files()

void turi::delete_temp_files ( std::vector< std::string >  files)

Deletes a collection of temporary files. Each file will only be deleted if a prefix of its name was previously returned by get_temp_name(). This is done for safety to prevent this function from being used to delete arbitrary files.

For instance, if get_temp_name() previously returned /tmp/file51apTO , delete_temp_files will succeed on a collection of files {/tmp/file51apTO.csv, /tmp/file51apTO.txt}. delete_temp_files will fail on stuff like /usr/bin/bash

◆ download_url() [1/2]

int turi::download_url ( std::string  url,
std::string  output_file 
)

Downloads a given URL into a given output file

retcode = download_url("http://google.com", "google.html");

Returns 0 on success, non-zero (a curl error code) on failure.

◆ download_url() [2/2]

std::tuple<int, bool, std::string> turi::download_url ( std::string  url)

Downloads a given URL, returning the local filename it has been downloaded to. If the url is a remote URL, the URL will be downloaded to a temporary local file (created using tmpnam), and the local file name returned. If the url is a local file, the local filename will be returned directly.

Returns
A tuple of (error_code, is_temporary, local_file_name) Error_code is non-zero on failure, in which case the other arguments should be ignored. is_temporary is true if the URL is a remote URL. local_file_name contains the local file name in which the data can be accessed.
std::tie(status, is_temporary, filename) = download_url("http://google.com");

Returns 0 on success, non-zero (a curl error code) on failure.

◆ get_alternative_ssl_cert_dir()

const std::string& turi::fileio::get_alternative_ssl_cert_dir ( )

Gets the alternative ssl certificate file and directory.

◆ get_alternative_ssl_cert_file()

const std::string& turi::fileio::get_alternative_ssl_cert_file ( )

Gets the alternative ssl certificate file and directory.

◆ get_bucket_path()

std::string turi::fileio::get_bucket_path ( const std::string &  bucket)

Returns a S3 bucket specific path. On regular S3 this returns the virtualhosting style bucket. On other explicitly specified endpoints, this returns $S3_ENDPOINT/[bucket]/

For consistency, the returned bucket path will always end with a "/"

◆ get_cache_file_hdfs_location()

std::string turi::fileio::get_cache_file_hdfs_location ( )

Additional HDFS location for storing large temp files.

◆ get_cache_file_locations()

std::string turi::fileio::get_cache_file_locations ( )

Gets the physical directory (/var/tmp) which all cached files are located in, as a colon-separated list.

◆ get_cache_prefix()

std::string turi::fileio::get_cache_prefix ( )

The protocol prefix cache:// to identify a cached file.

◆ get_directory_listing()

std::vector<std::pair<std::string, file_status> > turi::fileio::get_directory_listing ( const std::string &  path)

Enumerates the contents of a directory, listing all the files as well as the file type. Path can be hdfs, s3, or regular filesystem.

◆ get_dirname()

std::string turi::fileio::get_dirname ( std::string  path)

Extracts the directory name from a fully qualified path. So given: s3://bucket/data/123 This will return "s3://bucket/data"

In short, this will return everything to the left of the last trailing "/".

◆ get_file_status()

std::pair<file_status, std::string> turi::fileio::get_file_status ( const std::string &  path)

Checks a path (can be hdfs, s3, or regular) to see if it is a local path, or a remote path.

If the file is missing, the second value of the returned pair holds the error message, if one is provided.

◆ get_filename()

std::string turi::fileio::get_filename ( std::string  path)

Extracts the file name from a fully qualified path. So given: s3://bucket/data/123 This will return "123".

In short, this will return everything to the right of the last trailing "/".

◆ get_glob_files()

std::vector<std::pair<std::string, file_status> > turi::fileio::get_glob_files ( const std::string &  url)

Where URL is a glob of the form directory1/directory2/[glob] (glob must only be on the file portion), returns a list of files matching the glob pattern.

◆ get_io_parallelism_id()

size_t turi::fileio::get_io_parallelism_id ( const std::string  url)

Given a URL, returns an ID value where URLs which return different ID values are ok to be read in parallel, and URLs which return the same ID value are probably sub-optimal if read in parallel; An ID of (size_t)(-1) indicates that it can be read in parallel with everything.

◆ get_region_name_from_endpoint()

std::string turi::fileio::get_region_name_from_endpoint ( std::string  endpoint)

Gets a region name from the endpoint url.

◆ get_s3_endpoints()

std::vector<std::string> turi::fileio::get_s3_endpoints ( )

Returns a complete list of all available S3 region-specific endpoints.

◆ get_s3_file_last_modified()

std::string turi::get_s3_file_last_modified ( const std::string &  url)

Get the last modified time stamp of file.

Throw exception if the url cannot be fetched.

Return empty string if last modified is not available, e.g. the url is a directory path or file does not exist.

◆ get_system_temp_directory()

std::string turi::fileio::get_system_temp_directory ( )

Returns the system temporary directory

◆ get_system_user_name()

std::string turi::get_system_user_name ( )

Get the current system user name.

◆ get_temp_cache_prefix()

std::string turi::fileio::get_temp_cache_prefix ( )

The "directory" (cache://tmp/) which all cached files are located in

◆ get_temp_directories()

std::vector<std::string> turi::get_temp_directories ( )

Returns the set of temp directories

◆ get_temp_name()

std::string turi::get_temp_name ( const std::string &  prefix = "",
bool  _prefer_hdfs = false 
)

Returns a file name which can be used for a temp file. Returns an empty string on failure, a temporary file name on success. The file name returned is allowed to be a "prefix". i.e. arbitrary extensions can be attached to be tail of the file. For instance, if get_temp_name() returns /tmp/file51apTO, you can use /tmp/file51apTO.csv

Parameters
prefix: Optional. If specified, this exact prefix will be returned in the temporary path. ex: /var/tmp/turicreate-user/12345/[prefix]. If an empty string or not specified, a random unique prefix will be generated.
_prefer_hdfs: Optional, defaults to false. If true, prefers to use HDFS if available.

Note that if you specify your own prefix it is up to you to manage collisions, i.e. multiple parts of the program using the same prefix for instance.

◆ get_temp_name_prefer_hdfs()

std::string turi::get_temp_name_prefer_hdfs ( const std::string &  prefix = "")

Same as get_temp_name but returns the temp file on hdfs if available. The hdfs temp file location is a runtime configurable variable TURI_CACHE_FILE_HDFS_LOCATION defined in fileio_constant.hpp.

Parameters
prefix: Optional. If specified, this exact prefix will be returned in the temporary path. ex: /var/tmp/turicreate-user/12345/[prefix]. If an empty string or not specified, a random unique prefix will be generated.

Note that if you specify your own prefix it is up to you to manage collisions, i.e. multiple parts of the program using the same prefix for instance.

◆ init_aws_sdk_with_turi_env()

Aws::S3::S3Client turi::init_aws_sdk_with_turi_env ( s3url &  parsed_url)

Initializes the SDK with Turi customized environment variables.

This will set the endpoint/region that is used to configure the client.

This call will modify the optional sdk_* members.

◆ insecure_ssl_cert_checks()

const bool turi::fileio::insecure_ssl_cert_checks ( )

If true, ssl certificate checks are disabled.

◆ list_directory()

list_objects_response turi::list_directory ( std::string  s3_url,
std::string  proxy = "" 
)

Lists all objects prefixed by a given s3 url.

If s3_url points to a valid prefix, it returns the prefix's contents like a directory.

foo/hello.txt

list_objects("s3://foo") will return "foo/hello.txt"

If s3_url points to an object it will just return the object.

Returns
A list_objects_response object. If list_objects_response.error is an empty string, it indicates success. Otherwise, it contains an error code. list_objects_response.directories indicate all "directories" stored with the requested prefix. And list_objects_response.objects indicates all regular objects stored with the requested prefix.

◆ list_objects()

list_objects_response turi::list_objects ( std::string  s3_url,
std::string  proxy = "" 
)

Lists objects or prefixes prefixed by a given s3 url.

This is a thin wrapper around the S3 API http://docs.aws.amazon.com/AmazonS3/latest/API/RESTBucketGET.html and may not quite do what you think it does.

if s3_url points to a valid prefix, it will return only the prefix as a directory. For instance if I have an S3 bucket containing

foo/hello.txt

list_objects("s3://foo") will return simply "foo/" as a directory.

See list_directory() and is_directory() for a more sensible implementation which behaves somewhat more file system like.

Returns
A list_objects_response object. If list_objects_response.error is an empty string, it indicates success. Otherwise, it contains an error code. list_objects_response.directories indicate all "directories" stored with the requested prefix. And list_objects_response.objects indicates all regular objects stored with the requested prefix.

◆ make_absolute_path()

std::string turi::fileio::make_absolute_path ( std::string  root_directory,
std::string  path 
)

Given a root directory and a relative path, tries to convert the relative path to an absolute path. If the path is already an absolute path, returns the original path with no changes.

This function is absolutely limited. It will not handle "../" structures in the relative path. In other words, the output path must point to a file/folder inside of the root directory.

Example: make_absolute_path("s3://bucket/data", "123") returns "s3://bucket/data/123".

make_absolute_path("s3://bucket/data", "s3://foo/123") returns "s3://foo/123".

◆ make_canonical_path()

std::string turi::fileio::make_canonical_path ( const std::string &  path)

Return canonical absolute path, eliminating dots, and symlinks

◆ make_relative_path()

std::string turi::fileio::make_relative_path ( std::string  root_directory,
std::string  path 
)

Given a root directory and an absolute path, tries to create a relative path address between root_directory and the path; if not possible, returns the original path with no changes.

This function is relatively limited. It will not add "../" structures to the returned relative path. In other words, the path must point to a file/folder inside of the root directory for this to return a relative path.

Example: make_relative_path("s3://bucket/data", "s3://bucket/data/123") returns "123".

make_relative_path("s3://bucket/data", "s3://foo/123") returns "s3://foo/123".

◆ num_temp_directories()

size_t turi::num_temp_directories ( )

Returns the number of temp directories

◆ parse_hdfs_url()

std::tuple<std::string, std::string, std::string> turi::fileio::parse_hdfs_url ( std::string  url)

A helper function to parse the hdfs url. Return a tuple of host, port, and path.

◆ parse_s3url()

bool turi::parse_s3url ( const std::string &  url,
s3url &  ret,
std::string &  err_msg 
)

This splits a URL of the form s3://[access_key_id]:[secret_key]:[endpoint/][bucket]/[object_name] into several pieces.

endpoint and object_name are optional.

Returns true on success, false on failure.

◆ reap_current_process_temp_files()

void turi::reap_current_process_temp_files ( )

Deletes all temp files created by the current process

◆ reap_unused_temp_files()

void turi::reap_unused_temp_files ( )

Deletes all temporary directories in the temporary turicreate/ directory (/var/tmp/turicreate) which are no longer used. i.e. was created by a process which no longer exists.

◆ remove_protocol()

std::string turi::fileio::remove_protocol ( std::string  path)

Returns the path removing the protocol header if there is one.

remove_protocol("http://www.google.com") == "www.google.com" remove_protocol("s3://www.google.com") == "www.google.com" remove_protocol("/root/test") == "/root/test" remove_protocol("file:///root/test") == "/root/test"

◆ sanitize_s3_url()

std::string turi::sanitize_s3_url ( const std::string &  url)

Given an S3 URL of the form expected by parse_s3url, this function drops the access_key_id and the secret_key from the string returning s3://[bucket]/[object_name]

If the url cannot be parsed, we try the best to remove information associated with ':'.

If the url does not begin with s3://, return as is.

◆ sanitize_url()

std::string turi::sanitize_url ( std::string  url)

Sanitizes a general_fstream URL so that it is suitable for printing; right now, all it does is to drop all credential information when the protocol is s3.

◆ set_cache_file_locations()

void turi::fileio::set_cache_file_locations ( std::string  )

Sets the physical directory (/var/tmp) which all cached files are located in, as a colon-separated list.

◆ set_curl_options()

void turi::fileio::set_curl_options ( void *  ecurl)

Sets curl options for everywhere curl is used.

◆ set_download_timeout()

void turi::set_download_timeout ( long  timeout)

Set the timeout for S3 download.

Parameters
timeout: Timeout value in secs.

◆ set_upload_timeout()

void turi::set_upload_timeout ( long  timeout)

Set the timeout for S3 upload.

Parameters
timeout: Timeout value in secs.

◆ try_to_open_file()

bool turi::fileio::try_to_open_file ( const std::string  url)

Returns true if the file can be opened. False otherwise.

Variable Documentation

◆ FILEIO_INITIAL_CAPACITY_PER_FILE

const size_t turi::fileio::FILEIO_INITIAL_CAPACITY_PER_FILE

The initial memory capacity assigned to caches

◆ FILEIO_MAXIMUM_CACHE_CAPACITY

size_t turi::fileio::FILEIO_MAXIMUM_CACHE_CAPACITY

The maximum memory capacity used by all cached files until they have to be flushed.

◆ FILEIO_MAXIMUM_CACHE_CAPACITY_PER_FILE

size_t turi::fileio::FILEIO_MAXIMUM_CACHE_CAPACITY_PER_FILE

The maximum memory capacity assigned to a cached file until it has to be flushed.

◆ FILEIO_READER_BUFFER_SIZE

size_t turi::fileio::FILEIO_READER_BUFFER_SIZE

The default fileio reader buffer size

◆ FILEIO_WRITER_BUFFER_SIZE

size_t turi::fileio::FILEIO_WRITER_BUFFER_SIZE

The default fileio writer buffer size

◆ NUM_GPUS

int64_t turi::fileio::NUM_GPUS

The number of GPUs.

◆ S3_ENDPOINT

std::string turi::fileio::S3_ENDPOINT

The S3 connection endpoint; if empty string, S3 is assumed.

◆ S3_REGION

std::string turi::fileio::S3_REGION

The S3 connection region; if empty string, region will be guessed by:

  1. TURI_S3_REGION environment variable
  2. AWS_DEFAULT_REGION environment variable
  3. known region to endpoint mappings. If none of the above works, an empty region string will be set and AWS will guess the bucket region from the endpoint.