Turi Create  4.0
tokenizer.hpp
1 /* Copyright © 2017 Apple Inc. All rights reserved.
2  *
3  * Use of this source code is governed by a BSD-3-clause license that can
4  * be found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause
5  */
6 #ifndef TURI_TEXT_TOKENIZER_H_
7 #define TURI_TEXT_TOKENIZER_H_
8 #include <model_server/lib/toolkit_class_macros.hpp>
9 #include <toolkits/feature_engineering/transformer_base.hpp>
10 #include <toolkits/feature_engineering/transform_utils.hpp>
11 #include <core/export.hpp>
12 
13 namespace turi {
14 namespace sdk_model {
15 namespace feature_engineering {
16 
17 class EXPORT tokenizer: public transformer_base {
18  // NOTE: the members below precede any access specifier, so they are private (the default for a class).
19  static constexpr size_t TOKENIZER_VERSION = 1;
20  bool fitted = false;
21  bool to_lower = false;
22  bool exclude = false;
23  std::map<std::string, flex_type_enum> feature_types;
24  std::vector<std::string> feature_columns;
25  flexible_type unprocessed_features; // List of feature columns provided by the user.
26  flexible_type delimiters; // List of delimiters provided by the user.
27 
28  private:
29  transform_utils::string_filter_list string_filters;
30 
31  void set_string_filters();
32 
33  public:
34 
35  /**
36  * Methods that must be implemented in a new transformer model.
37  * -------------------------------------------------------------------------
38  */
39 
40  virtual inline ~tokenizer() {}
41 
42  /**
43  * Set one of the options in the model. Use the option manager to set
44  * these options. If an option does not satisfy the conditions that the
45  * option manager has imposed on it, an error is thrown.
46  *
47  * \param[in] _options Options to set
48  */
49  void init_options(const std::map<std::string, flexible_type>&_options) override;
50  /**
51  * Get the version number of this object.
52  *
53  * \return Version number (presumably TOKENIZER_VERSION; defined out of line -- TODO confirm)
54  */
55  size_t get_version() const override;
56 
57  /**
58  * Save the object using Turi's oarc.
59  */
60  void save_impl(turi::oarchive& oarc) const override;
61 
62  /**
63  * Load the object using Turi's iarc.
64  */
65  void load_version(turi::iarchive& iarc, size_t version) override;
66 
67  /**
68  * Initialize the transformer from the given options.
69  */
70  void init_transformer(const std::map<std::string,
71  flexible_type>& _options) override;
72 
73  /**
74  * Fit the transformer to the given data.
75  *
76  * \param[in] data (SFrame of data)
77  */
78  void fit(gl_sframe data) override;
79 
80  /**
81  * Transform the given data.
82  *
83  * \param[in] data (SFrame of data)
84  *
85  * Python side interface
86  * ------------------------
87  * This function directly interfaces with "transform" in python.
88  *
89  */
90  gl_sframe transform(gl_sframe data) override;
91 
92  /**
93  * Fit and transform the given data. Intended as an optimization because
94  * fit and transform are usually called together. The default
95  * implementation calls fit and then transform.
96  *
97  * \param[in] data (SFrame of data)
98  */
99  gl_sframe fit_transform(gl_sframe data) {
100  data.materialize(); // presumably so fit and transform both work on already-materialized data -- TODO confirm
101  fit(data);
102  return transform(data);
103  }
104 
105 
106  // Functions that all transformers need to register. Can be copied verbatim
107  // for other classes.
108  // --------------------------------------------------------------------------
109  BEGIN_CLASS_MEMBER_REGISTRATION("_Tokenizer")
110  REGISTER_CLASS_MEMBER_FUNCTION(tokenizer::init_transformer, "_options");
111  REGISTER_CLASS_MEMBER_FUNCTION(tokenizer::fit, "data");
112  REGISTER_CLASS_MEMBER_FUNCTION(tokenizer::fit_transform, "data");
113  REGISTER_CLASS_MEMBER_FUNCTION(tokenizer::transform, "data");
114  REGISTER_CLASS_MEMBER_FUNCTION(tokenizer::get_current_options);
115  REGISTER_CLASS_MEMBER_FUNCTION(tokenizer::list_fields);
116  REGISTER_NAMED_CLASS_MEMBER_FUNCTION("_get_default_options",
117  tokenizer::get_default_options);
119  tokenizer::get_value_from_state,
120  "key");
122  // NOTE(review): listing lines 118 and 121 are missing from this extract -- presumably the opening of the REGISTER_NAMED_CLASS_MEMBER_FUNCTION call whose arguments appear on lines 119-120, and END_CLASS_MEMBER_REGISTRATION; restore from the upstream header.
123 };
124 
125 } // feature_engineering
126 } // sdk_model
127 } // turi
128 
129 #endif
#define BEGIN_CLASS_MEMBER_REGISTRATION(python_facing_classname)
#define REGISTER_CLASS_MEMBER_FUNCTION(function,...)
The serialization input archive object which, provided with a reference to an istream, will read from the istream, providing deserialization capabilities.
Definition: iarchive.hpp:60
#define END_CLASS_MEMBER_REGISTRATION
#define REGISTER_NAMED_CLASS_MEMBER_FUNCTION(name, function,...)
The serialization output archive object which, provided with a reference to an ostream, will write to the ostream, providing serialization capabilities.
Definition: oarchive.hpp:80
void transform(S &&input, T &&output, TransformFn transformfn, std::set< size_t > constraint_segments=std::set< size_t >())
Definition: algorithm.hpp:64