Turi Create  4.0
transform_utils.hpp
1 /* Copyright © 2017 Apple Inc. All rights reserved.
2  *
3  * Use of this source code is governed by a BSD-3-clause license that can
4  * be found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause
5  */
6 #ifndef TURI_TRANSFORM_UTILS_H_
7 #define TURI_TRANSFORM_UTILS_H_
8 
9 #include <boost/regex.hpp>
10 
11 #include <core/data/sframe/gl_sframe.hpp>
12 #include <core/util/try_finally.hpp>
13 #include <core/parallel/lambda_omp.hpp>
14 
15 #include <core/data/sframe/gl_sframe.hpp>
16 #include <core/data/sframe/gl_sarray.hpp>
17 
18 #include <toolkits/feature_engineering/topk_indexer.hpp>
19 #include <toolkits/feature_engineering/statistics_tracker.hpp>
20 
21 namespace turi{
22 namespace transform_utils{
23 
24 /**
25  * Validate if the set of columns provided by the user is present in the
26  * input SFrame.
27  *
28  * \param[in] data_column_names Columns in the dataset.
29  * \param[in] feature_column_names Features provided by user.
30  *
31  * \notes Check if set(data_column_names) - set(feature_column_names) is
32  * non empty.
33  */
34 inline void validate_feature_columns(
35  const std::vector<std::string>& data_column_names,
36  const std::vector<std::string>& feature_column_names,
37  bool verbose = true){
38 
39  if(feature_column_names.empty()) {
40  log_and_throw("No input features are specified.");
41  }
42 
43  std::set<std::string> data_column_set(data_column_names.begin(),
44  data_column_names.end());
45  std::set<std::string> feature_column_set(feature_column_names.begin(),
46  feature_column_names.end());
47 
48  std::vector<std::string> result;
49  std::set_difference(feature_column_set.begin(), feature_column_set.end(),
50  data_column_set.begin(), data_column_set.end(),
51  inserter(result, result.end()));
52 
53  if (result.size() > 0 && verbose){
54  std::stringstream err_msg;
55  err_msg << "Feature(s) ";
56  for (size_t i=0; i < result.size()-1; i++) {
57  err_msg << result[i] << ", ";
58  }
59  err_msg << result[result.size()-1]
60  << " are missing from the dataset." << std::endl;
61  log_and_throw(err_msg.str());
62  }
63 }
64 
65 
/**
 * Derive an output column name that does not clash with existing columns.
 *
 * \param[in] feature_columns    Column names that are already taken.
 * \param[in] output_column_name Name proposed by the user.
 *
 * \returns output_column_name itself when it is unused; otherwise the first
 * of output_column_name + ".1", ".2", ... that is absent from
 * feature_columns.
 */
inline std::string get_unique_feature_name(
    const std::vector<std::string>& feature_columns,
    const std::string& output_column_name) {

  const auto is_taken = [&feature_columns](const std::string& name) {
    return std::find(feature_columns.begin(), feature_columns.end(), name)
        != feature_columns.end();
  };

  std::string candidate = output_column_name;
  for (int suffix = 1; is_taken(candidate); ++suffix) {
    candidate = output_column_name + "." + std::to_string(suffix);
  }
  return candidate;
}
89 
90 /**
91  * Validate if the types of the features are compatible during fit and
92  * transform mode.
93  *
94  * \param[in] feature_columns A list of feature_column names to check.
95  * \param[in] feature_types Fit mode feature types.
96  * \param[in] data Dataset during transform mode.
97  *
98  * \note Assumes that feature_names is a subset of feature_types.keys().
99  */
100 inline void validate_feature_types(
101  const std::vector<std::string>& feature_names,
102  const std::map<std::string, flex_type_enum>& feature_types,
103  const gl_sframe& data) {
104 
105  for (auto& col_name : feature_names){
106  DASSERT_TRUE(feature_types.count(col_name) > 0);
107  auto fit_type = feature_types.at(col_name);
108  auto transform_type = data[col_name].dtype();
109 
110  if (fit_type != transform_type) {
111  log_and_throw("Column '" + col_name + "' was of type " +
112  flex_type_enum_to_name(fit_type) + " when fitted using .fit(), but is of type " +
113  flex_type_enum_to_name(transform_type) + "during .transform()");
114  }
115  }
116 }
117 
118 /**
119  * Checks if particular type is numeric
120  */
121 inline bool is_numeric_type(flex_type_enum type) {
122  return (type == flex_type_enum::INTEGER || type == flex_type_enum::FLOAT);
123 }
124 /**
125  * Checks if particular type is categorical.
126  */
127 inline bool is_categorical_type(flex_type_enum type) {
128  return (type == flex_type_enum::INTEGER || type == flex_type_enum::STRING);
129 }
130 
131 /**
132  * Returns string vector of column names to perform transformation on.
133  *
134  * \param[in] data SFrame
135  * \param[in] exclude Flag which determines if feature_columns is an exclude set.
136  * \param[in] feature_columns Include/Excluded features.
137  *
138  * \returns Set of features to work with.
139  *
140  */
141 inline std::vector<std::string> get_column_names(const gl_sframe& data,
142  bool exclude,
143  const flexible_type& feature_columns) {
144 
145  std::vector<std::string> feature_columns_vector;
146  if (feature_columns.get_type() == flex_type_enum::UNDEFINED){
147  feature_columns_vector = data.column_names();
148  } else if (!feature_columns.get<flex_list>().size()) {
149  feature_columns_vector = data.column_names();
150  } else {
151  feature_columns_vector = variant_get_value<std::vector<std::string>>(
152  to_variant(feature_columns));
153  }
154 
155  if (exclude){
156  std::vector<std::string> data_column_names = data.column_names();
157  std::set<std::string> total_set(data_column_names.begin(),
158  data_column_names.end());
159  std::set<std::string> exclude_set(feature_columns_vector.begin(),
160  feature_columns_vector.end());
161  std::set<std::string> result;
162  std::set_difference(total_set.begin(), total_set.end(),
163  exclude_set.begin(), exclude_set.end(),
164  inserter(result, result.begin()));
165  return std::vector<std::string>(result.begin(),result.end());
166  } else {
167  return feature_columns_vector;
168  }
169 }
170 
171 /**
172  * Subselect features based on input features.
173  *
174  * \param[in] data SFrame
175  * \param[in] feature_columns Include/Excluded features.
176  *
177  * \returns feature_columns \intersect data.column_names()
178  *
179  */
180 inline std::vector<std::string> select_feature_subset(const gl_sframe& data,
181  const std::vector<std::string>& feature_columns) {
182 
183  std::vector<std::string> data_column_names = data.column_names();
184  std::set<std::string> total_set(data_column_names.begin(),
185  data_column_names.end());
186  std::set<std::string> feature_set(feature_columns.begin(),
187  feature_columns.end());
188  std::set<std::string> result;
189  std::set_intersection(total_set.begin(), total_set.end(),
190  feature_set.begin(), feature_set.end(),
191  inserter(result, result.begin()));
192 
193  if (result.size() != feature_columns.size()) {
194  logprogress_stream << "Warning: The model was fit with "
195  << feature_columns.size() << " feature columns but only "
196  << result.size() << " were present during transform(). "
197  << "Proceeding with transform by ignoring the missing columns."
198  << std::endl;
199  }
200 
201  // Need to preserve order.
202  std::vector<std::string> ret;
203  ret.reserve(result.size());
204 
205  for(const auto& s : feature_columns) {
206  if(result.count(s)) {
207  ret.push_back(s);
208  }
209  }
210 
211  return ret;
212 }
213 
214 /**
215  * Takes any flexible type and turns it into a flex dict
216  * In flex_dict -> out same flex_dict
217  * Flex dict output = input
218  * In String "x" -> {"x":1}
219  * String becomes key, 1 becomes value
220  * In list/vec [1,2,3] -> {0:1, 1:2, 2:3}
221  * Index becomes key, element becomes value
222  * In numeric type ie. 5 -> {0:5}
223  * 0 becomes key, numeric value becomes value
224  */
225 
226 inline flex_dict flexible_type_to_flex_dict(const flexible_type& in){
227  flex_dict out = {};
228  if (in.get_type() == flex_type_enum::DICT){
229  out = in;
230  } else if (in.get_type() == flex_type_enum::UNDEFINED){
231  out.resize(1);
232  out[0] = std::make_pair(0,in);
233  } else if (in.get_type() == flex_type_enum::STRING){
234  out.resize(1);
235  out[0] = std::make_pair(in,1);
236  } else if (in.get_type() == flex_type_enum::LIST){
237  flex_list list = in.get<flex_list>();
238  out.resize(list.size());
239  for (size_t i = 0 ; i < list.size(); i++){
240  out[i] = std::make_pair(i,list[i]);
241  }
242  } else if (in.get_type() == flex_type_enum::VECTOR) {
243  flex_vec vec = in.get<flex_vec>();
244  out.resize(vec.size());
245  for (size_t i = 0 ; i < vec.size(); i++){
246  out[i] = std::make_pair(i,vec[i]);
247  }
248  } else if (is_numeric_type(in.get_type())){
249  out.resize(1);
250  out[0] = std::make_pair(0, in);
251  }
252  return out;
253 }
254 
255 /**
256  * Takes training_data, feature columns to include/exclude, and exclude bool.
257  * Returns columns to perform transformations on.
258  */
259 inline gl_sframe extract_columns(const gl_sframe& training_data,
260  std::vector<std::string>& feature_columns,
261  bool exclude) {
262  if (!feature_columns.size()) {
263  feature_columns = training_data.column_names();
264  }
265  if (exclude){
266  std::vector<std::string> training_data_column_names =
267  training_data.column_names();
268  std::set<std::string> total_set(training_data_column_names.begin(),
269  training_data_column_names.end());
270  std::set<std::string> exclude_set(feature_columns.begin(),
271  feature_columns.end());
272 
273  std::set<std::string> result;
274  std::set_difference(total_set.begin(), total_set.end(),
275  exclude_set.begin(), exclude_set.end(),
276  inserter(result, result.begin()));
277  return training_data.select_columns(
278  std::vector<std::string>(result.begin(),result.end()));
279  } else {
280  return training_data.select_columns(feature_columns);
281  }
282 }
283 
284 
285 /**
286  * Utility function for selecting columns of only valid feature types.
287  *
288  * \param[in] dataset
289  * The input SFrame containing columns of potential features.
290  *
291  * \param[in] features
292  * List of feature column names. The list cannot be empty.
293  *
294  * \param[in] valid_feature_types
295  * List of Python types that represent valid features. If type is array.array,
296  * then an extra check is done to ensure that the individual elements of the array
297  * are of numeric type. If type is dict, then an extra check is done to ensure
298  * that dictionary values are numeric.
299  *
300  * \returns
301  * List of valid feature column names. Warnings are given for each candidate
302  * feature column that is excluded.
303  *
304  */
305 inline std::vector<std::string> select_valid_features_nothrow(const gl_sframe&
306  dataset,
307  const std::vector<std::string>& features,
308  const std::vector<flex_type_enum>& valid_feature_types,
309  bool verbose = true){
310 
311  // Create a map of col types from the col-names.
312  std::vector<flex_type_enum> col_types;
313  std::map<std::string, flex_type_enum> col_type_map;
314  for (size_t i=0; i < features.size(); i++) {
315  col_types.push_back(dataset[features[i]].dtype());
316  col_type_map[features[i]] = dataset[features[i]].dtype();
317  }
318 
319  // Check types for valid features.
320  std::vector<std::string> valid_features;
321  for (size_t i=0; i < features.size(); i++) {
322  auto col = features[i];
323  auto coltype = col_types[i];
324 
325  // Not a valid type. Warn the user.
326  if (std::find(valid_feature_types.begin(), valid_feature_types.end(),
327  coltype) == valid_feature_types.end()) {
328  if (verbose){
329  logprogress_stream << "WARNING: Column '" << col
330  << "' is excluded due to invalid column type ("
331  << flex_type_enum_to_name(coltype) <<")." << std::endl;
332  }
333  // Valid type. Include.
334  } else {
335  valid_features.push_back(col);
336  }
337  }
338 
339  return valid_features;
340 }
341 
342 /**
343  * Utility function for selecting columns of only valid feature types. Throws an
344  * exception if no features match.
345  *
346  * \param[in] dataset
347  * The input SFrame containing columns of potential features.
348  *
349  * \param[in] features
350  * List of feature column names. The list cannot be empty.
351  *
352  * \param[in] valid_feature_types
353  * List of Python types that represent valid features. If type is array.array,
354  * then an extra check is done to ensure that the individual elements of the array
355  * are of numeric type. If type is dict, then an extra check is done to ensure
356  * that dictionary values are numeric.
357  *
358  * \returns
359  * List of valid feature column names. Warnings are given for each candidate
360  * feature column that is excluded.
361  *
362  */
363 inline std::vector<std::string> select_valid_features(const gl_sframe& dataset,
364  const std::vector<std::string>& features,
365  const std::vector<flex_type_enum>& valid_feature_types,
366  bool verbose = true){
367 
368  // Check types for valid features.
369  std::vector<std::string> valid_features =
370  select_valid_features_nothrow(
371  dataset, features, valid_feature_types, verbose);
372 
373  // Throw an error if nothing was found to be valid.
374  if (valid_features.size() == 0 && verbose ) {
375  std::string err_msg = "The input data does not contain any usable feature"
376  " columns. This model only supports features of type: ";
377  for (size_t k = 0; k < valid_feature_types.size() - 1; ++k){
378  err_msg += std::string(
379  flex_type_enum_to_name(valid_feature_types[k])) + ", ";
380  }
381  err_msg += std::string(
382  flex_type_enum_to_name(valid_feature_types.back())) + ".";
383  log_and_throw(err_msg);
384  }
385  return valid_features;
386 }
387 
388 
389 /**
390  * Feeds the values of an SArray into a topk_indexer, choosing how values
391  * are inserted from the column's dtype.
392  *
393  * STRING: each value is passed to indexer->insert_or_update() as-is.
394  *
395  * LIST: every element of each list value is inserted individually.
396  *
397  * DICT: every (key, value) pair is inserted as the single string
398  * "<key>:<value>", built with flex_string() conversions.
399  *
400  * Any other dtype reaching the switch hits DASSERT_TRUE(false).
401  * NOTE(review): the embedded listing numbers skip 433-434 inside the
402  * switch, so further case labels may be absent from this view; confirm
403  * against the upstream header before relying on the list above.
404  *
405  * Rows are split into contiguous [start_idx, end_idx) ranges, one per
406  * in_parallel() worker; thread_idx is forwarded to the indexer on every
407  * insert. initialize() is called before and finalize() after the pass.
408  *
409  * \param[in] src The SArray whose values are indexed.
410  * \param[in,out] indexer Indexer that receives the values.
411  *
412  */
413 inline void create_topk_index_mapping(const gl_sarray& src,
414  std::shared_ptr<topk_indexer> indexer) {
415 
416  // The column dtype decides which switch branch handles every row.
417  flex_type_enum run_mode = src.dtype();
418 
419  // Setup the indexer.
420  indexer->initialize();
421  size_t src_size = src.size();
422 
423  // Perform the indexing.
424  in_parallel([&](size_t thread_idx, size_t num_threads) {
425 
426  // Contiguous slice of rows handled by this worker.
427  size_t start_idx = src_size * thread_idx / num_threads;
428  size_t end_idx = src_size * (thread_idx + 1) / num_threads;
429 
430  for (const auto& v: src.range_iterator(start_idx, end_idx)) {
431  switch(run_mode) {
432  // Categorical cols. NOTE(review): listing lines 433-434 are not visible here.
435  case flex_type_enum::STRING: {
436  indexer->insert_or_update(v, thread_idx);
437  break;
438  }
439 
440  // List column: every element is indexed on its own.
441  case flex_type_enum::LIST: {
442  const flex_list& vv = v.get<flex_list>();
443  size_t n_values = vv.size();
444 
445  for(size_t k = 0; k < n_values; ++k) {
446  indexer->insert_or_update(vv[k], thread_idx);
447  }
448  break;
449  }
450 
451  // Dictionary: each pair is indexed under the combined key "key:value".
452  case flex_type_enum::DICT: {
453 
454  const flex_dict& dv = v.get<flex_dict>();
455  size_t n_values = dv.size();
456 
457  for(size_t k = 0; k < n_values; ++k) {
458  const std::pair<flexible_type, flexible_type>& kvp = dv[k];
459  flexible_type out_key =
460  flex_string(kvp.first) + ":" + flex_string(kvp.second);
461  indexer->insert_or_update(out_key, thread_idx);
462  }
463  break;
464  }
465 
466  // Should not be here.
467  default:
468  DASSERT_TRUE(false);
469  break;
470 
471  } // End switch
472  } // End range iterator.
473  }); // End parallel evaluation
474 
475  indexer->finalize();
476 }
477 
478 /**
479  * Calculates length of list/vectors in a column src. If not constant length,
480  * errors out.
481  *
482  * \param[in] src The SArray to computer mean of.
483  * \param[in] column_name Name of column
484  *
485  */
486 inline size_t validate_list_vec_length(const gl_sarray& src,const std::string& column_name){
487 
488  size_t src_size = src.size();
489  flex_list length_list;
490  length_list.resize(thread::cpu_count());
491 
492  in_parallel([&](size_t thread_idx, size_t num_threads) {
493 
494  size_t start_idx = src_size * thread_idx / num_threads;
495  size_t end_idx = src_size * (thread_idx + 1) / num_threads;
496 
497  flexible_type length = flex_undefined();
498  flexible_type old_length = flex_undefined();
499 
500  for (const auto& v: src.range_iterator(start_idx, end_idx)){
501  if(v.get_type() == flex_type_enum::LIST){
502  length = v.get<flex_list>().size();
503  } else if (v.get_type() == flex_type_enum::VECTOR){
504  length = v.get<flex_vec>().size();
505  }
506  if (old_length.get_type() != flex_type_enum::UNDEFINED && old_length != length){
507  log_and_throw("All list/vectors in column" + column_name + "must be of same length or None.");
508  } else {
509  old_length = length;
510  }
511 
512  }
513 
514  length_list[thread_idx] = length;
515  });
516 
517  flexible_type total_length = flex_undefined();
518  flexible_type old_total_length = flex_undefined();
519  for (const auto& l: length_list){
520  if (l.get_type() != flex_type_enum::UNDEFINED){
521  total_length = l;
522  }
523  if (old_total_length.get_type() != flex_type_enum::UNDEFINED && old_total_length != total_length){
524  log_and_throw("All list/vectors in column" + column_name + "must be of same length or None.");
525  } else {
526  old_total_length = total_length;
527  }
528  }
529 
530  if (total_length.get_type() == flex_type_enum::UNDEFINED){
531  log_and_throw("At least one value in column_name" + column_name + "must have"
532  " a non-None value");
533  }
534  return total_length;
535 
536 }
537 
538 /**
539  * Computes set of all features in a sparse dictionary column
540  *
541  * \param[in] src The SArray to computer mean of.
542  * \param[in] column_name Name of column
543  * \param[out] out_set The set that will contain all features
544  *
545  */
546 
547 inline void num_sparse_features(const gl_sarray& src,const std::string& column_name, std::set<flexible_type>& out_set){
548 
549  out_set.clear();
550  size_t src_size = src.size();
551  std::vector<std::set<flexible_type>> threadlocal_key_set;
552  threadlocal_key_set.resize(thread::cpu_count());
553 
554  in_parallel([&](size_t thread_idx, size_t num_threads) {
555 
556  size_t start_idx = src_size * thread_idx / num_threads;
557  size_t end_idx = src_size * (thread_idx + 1) / num_threads;
558 
559 
560 
561  for (const auto& d: src.range_iterator(start_idx, end_idx)){
562  if(d.get_type() == flex_type_enum::DICT){
563  flex_dict dd = d.get<flex_dict>();
564  for (const auto& kvp: dd){
565  threadlocal_key_set[thread_idx].insert(kvp.first);
566  }
567  }
568  }
569  });
570 
571  for (const auto& t: threadlocal_key_set){
572  for (const auto& k: t){
573  out_set.insert(k);
574  }
575  }
576 
577  if(out_set.size() == 0 ){
578  log_and_throw("There must be at least one non-None value in dictionary"
579  " column for mean imputation");
580  }
581 
582 }
583 
584  /**
585  * Computes mean of a column. Columns of recursive types have behaviour that
586  * is equivalent to unpacking, computing means, then repacking(while preserving
587  * sparse interpretation of dictionary columns).
588  *
589  * \param[in] src The SArray to computer mean of.
590  * \param[in,out] tracker Unique statistics tracker.
591  * \param[in] column_name Name of column
592  *
593  */
594 inline void create_mean_mapping(const gl_sarray& src,
595  const std::string& column_name,
596  std::shared_ptr<statistics_tracker> tracker) {
597 
598  // Get the column mode from the dtype.
599  flex_type_enum run_mode = src.dtype();
600 
601  // Setup the indexer.
602  tracker->initialize();
603  size_t src_size = src.size();
604  size_t vec_list_length;
605  std::set<flexible_type> sparse_features;
606 
607 
608 
609  if (run_mode == flex_type_enum::LIST || run_mode == flex_type_enum::VECTOR){
610  vec_list_length = validate_list_vec_length(src, column_name);
611  } else if(run_mode == flex_type_enum::DICT){
612  num_sparse_features(src, column_name, sparse_features);
613  }
614 
615  // Perform the indexing.
616  in_parallel([&](size_t thread_idx, size_t num_threads) {
617 
618  // Break the SArray into various sizes.
619  size_t start_idx = src_size * thread_idx / num_threads;
620  size_t end_idx = src_size * (thread_idx + 1) / num_threads;
621 
622  for (const auto& v: src.range_iterator(start_idx, end_idx)) {
623  switch(run_mode) {
624  // Numerical cols.
626  case flex_type_enum::FLOAT: {
627  flexible_type key = 0;
628  tracker->insert_or_update(key, v, thread_idx);
629  break;
630  }
631 
632  // Categorical vector
633  case flex_type_enum::LIST: {
634  flex_list vv;
635  if (v.get_type() != flex_type_enum::UNDEFINED){
636  vv = v.get<flex_list>();
637  }
638  size_t n_values = vec_list_length;
639  for(size_t k = 0; k < n_values; ++k) {
640  if (v.get_type() != flex_type_enum::UNDEFINED){
641  if (!transform_utils::is_numeric_type(vv[k].get_type())
642  && vv[k].get_type() != flex_type_enum::UNDEFINED){
643  log_and_throw("All list elements must be numeric for mean"
644  " imputation");
645  }
646  tracker->insert_or_update(k, vv[k], thread_idx);
647  } else {
648  tracker->insert_or_update(k, flex_undefined(), thread_idx);
649  }
650  }
651  break;
652  }
653 
654  // Numerical vector
655  case flex_type_enum::VECTOR: {
656  flex_vec vv = flex_vec();
657  if (v.get_type() != flex_type_enum::UNDEFINED){
658  vv = v.get<flex_vec>();
659  }
660  size_t n_values = vec_list_length;
661  for(size_t k = 0; k < n_values; ++k) {
662  if(v.get_type() != flex_type_enum::UNDEFINED){
663  tracker->insert_or_update(k, vv[k], thread_idx);
664  } else{
665  tracker->insert_or_update(k, flex_undefined(), thread_idx);
666  }
667  }
668  break;
669  }
670 
671 
672  // Dictionary
673  case flex_type_enum::DICT: {
674 
675  if (v.get_type() != flex_type_enum::UNDEFINED) {
676  const flex_dict& dv = v.get<flex_dict>();
677  size_t n_values = dv.size();
678 
679  for(size_t k = 0; k < n_values; ++k) {
680  const std::pair<flexible_type, flexible_type>& kvp = dv[k];
681  if (!transform_utils::is_numeric_type(kvp.second.get_type())
682  && kvp.second.get_type() != flex_type_enum::UNDEFINED){
683  log_and_throw("All dictionary entries must be numeric for mean"
684  "imputation");
685  }
686  tracker->insert_or_update(kvp.first,kvp.second, thread_idx);
687  }
688  } else {
689 
690  for(const auto& v : sparse_features){
691  tracker->insert_or_update(v, flex_undefined(), thread_idx);
692  }
693 
694  }
695  break;
696  }
697 
698  // Should not be here.
699  default:
700  DASSERT_TRUE(false);
701  break;
702 
703  } // End switch
704  } // End range iterator.
705  }); // End parallel evaluation
706 
707  tracker->finalize(src_size);
708 }
709 
710 /////////////////////////////////////
711 // Utilities for string tokenization
712 /////////////////////////////////////
713 
714 // A funtion that checks whether or not the input string should be filtered.
715 typedef std::function<bool(const std::string&)> string_filter_condition;
716 // A list of filtering conditions and regex filtering patterns.
717 typedef std::vector<std::pair<boost::regex, string_filter_condition> > string_filter_list;
718 
719 /**
720  * An approximate Penn Tree Bank tokenization filter.
721  *
722  * TODO: this should
723  * 1) account for multi-word proper nouns
724  * 2) separate scientific units from values
725  * 3) keep periods at the end of abbreviations
726  * 4) keep slashes at the end of urls
727  * 5) capture sequences of punctuation like emoticons, ellipses, and "?!?!?!?!"
728  * 6) bug: children's is being tokenized as "childre n's"
729  */
730 static const string_filter_list ptb_filters = {
731  std::make_pair(
732  boost::regex(
733  std::string("([+.-]?[0-9]+([.,()-]+[0-9]+)*)|") + // positive and negative real numbers, and phone numbers with no spaces
734  std::string("([^\\w\\s])|") + // separates individual punctuation marks
735  std::string("(\\b[^\\w\\s]+)|") + // leading punctuation (e.g. leading quotation or dollar sign)
736  std::string("([\\w]([^\\s]*[\\w])?)|") + // sequences of non-space characters with leading and trailing letters/numbers (e.g. urls, emails)
737  std::string("([^\\w\\s]+\\b)")), // trailing punctuation (e.g. trailing quotations, sentence-final punctuation)
738  [](const std::string& current){return true;}),
739  std::make_pair(
740  boost::regex(
741  std::string("([nN]?'\\w*)|([^\\s']*[^nN\\s'])")), // separate contractions and possessives
742  [](const std::string& current){return current.find("'") != std::string::npos;})
743 };
744 
745 /**
746  * Tokenizes the input string according to the input filtering patterns,
747  * returns a flex_list of the token strings.
748  *
749  * \param[in] to_tokenize The string to tokenize.
750  * \param[in] filter_list A list of regex patterns and string_filter_conditions.
751  * \param[out] A list of string tokens, tokenized according to the tokenization patterns.
752  *
753  * The filter_list offers a way to take some logic that might overcomplicate a
754  * regex and export it to a filter list comprehension. For each filter,
755  * for each item in the current token list (at the beginning, a singleton
756  * list with the full doc string), the filter condition is checked. If the
757  * condition is satisfied, the regex is applied, and the resulting list is
758  * inserted in place of the original token. Otherwise, the original token
759  * is placed back into the list.
760  *
761  */
762 inline flex_list tokenize_string(const std::string& to_tokenize,
763  const string_filter_list& filter_list,
764  const bool to_lower) {
765  flex_list previous = {to_tokenize};
766  boost::sregex_token_iterator end;
767 
768  if (to_lower) {
769  std::string str_copy = std::string(to_tokenize);
770  std::transform(str_copy.begin(), str_copy.end(), str_copy.begin(), ::tolower);
771  previous = {str_copy};
772  }
773 
774  // The algorithm operates recursively:
775  // - start with the original single string to be tokenized,
776  // - tokenize that string according to the first pattern in the filter list,
777  // - further tokenize each token according to subsequent patterns in the filter list.
778  //
779  // For each filter pattern in the given filter_list:
780  // - Check the condition on the current token
781  // - if condition is satisfied, tokenize the token using the regex pattern
782  // - otherwise, place the original token back into the list
783  //for (auto filter = filter_list.begin(); filter != filter_list.end(); ++filter)
784  for (const auto& filter : filter_list)
785  {
786  std::vector<std::string> current;
787  boost::smatch matches;
788  const boost::regex& expr = filter.first;
789  string_filter_condition condition = filter.second;
790 
791  for (const auto& token : previous)
792  {
793  const std::string& token_string = token.to<std::string>();
794 
795  if ( condition(token_string) )
796  {
797  boost::sregex_token_iterator match(token_string.begin(), token_string.end(), expr, 0);
798 
799  for ( ; match != end; ++match)
800  {
801  current.push_back(*match);
802  }
803  }
804  else
805  {
806  current.push_back(token);
807  }
808  }
809 
810  previous = flex_list(current.begin(), current.end());
811  }
812 
813  return previous;
814 }
815 
816 
817 }// transform_utils
818 }//turicreate
819 #endif
std::vector< double > flex_vec
const char * flex_type_enum_to_name(flex_type_enum en)
static size_t cpu_count()
#define logprogress_stream
Definition: logger.hpp:325
std::string flex_string
variant_type to_variant(const T &f)
Definition: variant.hpp:308
std::set< T > set_difference(const std::set< T > &a, const std::set< T > &b)
Definition: stl_util.hpp:91
void in_parallel(const std::function< void(size_t thread_id, size_t num_threads)> &fn)
Definition: lambda_omp.hpp:35
std::vector< std::pair< flexible_type, flexible_type > > flex_dict
std::vector< flexible_type > flex_list
#define DASSERT_TRUE(cond)
Definition: assertions.hpp:364