Turi Create  4.0
file_line_count_estimator.hpp
1 /* Copyright © 2017 Apple Inc. All rights reserved.
2  *
3  * Use of this source code is governed by a BSD-3-clause license that can
4  * be found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause
5  */
6 #ifndef TURI_UNITY_FILE_LINE_COUNT_ESTIMATOR_HPP
7 #define TURI_UNITY_FILE_LINE_COUNT_ESTIMATOR_HPP
8 #include <cstdlib>
9 namespace turi {
10 
11 /**
12  * Estimate the number of lines in a file and the number of bytes used
13  * to represent each line.
14  *
15  * We estimate the number of lines in a file by making continuous observations
16  * of the current file position, and the number of lines read so far, and
17  * making simple assumptions about buffering behavior.
18  *
19  * \code
20  * ifstream fin;
21  * file_line_count_estimator estimator;
22  * while(...) {
23  * read_lines.
24  * estimator.observe(lines_read since we last called observe,
25  * fin.tellg());
26  * estimator.number_of_lines() contains an estimate of the number
27  * of lines in the file
28  * }
29  * \endcode
30  *
31  */
33  public:
34  /**
35  * The default constructor.
36  * If used, set_file_size must be used to set the filesize in bytes.
37  */
39 
40  /**
41  * Constructs a file line count estimator.
42  * \param file_size_in_bytes The file size in bytes.
43  */
44  inline file_line_count_estimator(size_t file_size_in_bytes):
45  file_size(file_size_in_bytes) {}
46 
47  /**
48  * Sets the file size in bytes.
49  */
50  inline void set_file_size(size_t file_size_in_bytes) {
51  file_size = file_size_in_bytes;
52  }
53 
54 
55  /**
56  * Integrates statistics from another estimator
57  */
58  inline void observe(file_line_count_estimator& other_estimator) {
59  accumulated_bytes += other_estimator.accumulated_bytes;
60  accumulated_lines += other_estimator.accumulated_lines;
61  num_observations += other_estimator.num_observations;
62  }
63 
64  /**
65  * This should be called for every block of read operations performed on the
66  * file. Missing observations will cause the estimate to drift.
67  * The more frequently this is called (preferably once for every line),
68  * the more accurate the estimate.
69  */
70  inline void observe(size_t line_count, size_t file_pos) {
71  if (file_pos == 0) {
72  // no reads have been performed yet. How can line_count have anything?
73  return;
74  }
75 
76  if (file_pos != 0 && last_file_pos == 0) {
77  // first read has been performed. buffer is now filled.
78  last_file_pos = file_pos;
79  last_buffer_size = file_pos;
80  current_lines_from_buffer += line_count;
81  } else if (file_pos == last_file_pos) {
82  // we are now reading from the buffer.
83  current_lines_from_buffer += line_count;
84  } else if (file_pos != last_file_pos) {
85  // we have now switched buffers
86  accumulated_lines += current_lines_from_buffer + line_count;
87  accumulated_bytes += last_buffer_size;
88 
89  current_lines_from_buffer = 0;
90  last_buffer_size = file_pos - last_file_pos;
91  last_file_pos = file_pos;
92  ++num_observations;
93  }
94 
95  }
96 
97  /**
98  * The current estimate of the total number of lines in the file.
99  * This returns 0 if the estimate is not available. One call to observe
100  * is sufficient to get a rough estimate.
101  */
102  inline double number_of_lines() const {
103  if (accumulated_lines == 0) {
104  return (double)file_size / last_buffer_size * current_lines_from_buffer;
105  } else {
106  // say we have on average half a buffer excess in the accumulated bytes
107  return (double)file_size / accumulated_bytes * accumulated_lines;
108  }
109  }
110 
111  /**
112  * Total number of lines observed so far
113  */
114  inline size_t num_lines_observed() const {
115  return accumulated_lines + current_lines_from_buffer;
116  }
117 
118 
119  private:
120  /// the size of the file in bytes
size_t file_size{0};

/// Lines counted for buffers that have already been retired.
size_t accumulated_lines{0};

/// Bytes counted for buffers that have already been retired.
size_t accumulated_bytes{0};

/// Lines counted since the file position last changed.
size_t current_lines_from_buffer{0};

/// The most recent non-zero file position passed to observe().
size_t last_file_pos{0};

/// The size of the latest jump in file position, taken as the buffer size.
size_t last_buffer_size{0};

/** Number of file-position changes recorded after the first read, plus
 *  any counts merged in from other estimators.
 */
size_t num_observations{0};
142 };
143 
144 
145 } // namespace turi
146 #endif
file_line_count_estimator(size_t file_size_in_bytes)
void observe(size_t line_count, size_t file_pos)
void observe(file_line_count_estimator &other_estimator)
void set_file_size(size_t file_size_in_bytes)