Turi Create  4.0
tfidf.hpp
1 /* Copyright © 2017 Apple Inc. All rights reserved.
2  *
3  * Use of this source code is governed by a BSD-3-clause license that can
4  * be found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause
5  */
6 #ifndef _TFIDF_H_
7 #define _TFIDF_H_
8 
9 #include <core/export.hpp>
10 #include <string>
11 #include <unordered_map>
12 
13 #include <model_server/lib/toolkit_class_macros.hpp>
14 #include <toolkits/feature_engineering/transformer_base.hpp>
15 #include <toolkits/feature_engineering/topk_indexer.hpp>
16 
17 namespace turi {
18 namespace sdk_model {
19 namespace feature_engineering {
20 
21 using indexer_type = std::unordered_map<std::string, std::shared_ptr<topk_indexer>>; // String key -> shared topk_indexer.
22 
23 class EXPORT tfidf : public transformer_base {
24  // tfidf transformer: holds per-key topk_indexer state (index_map) and a document count (num_documents).
25  static constexpr size_t TFIDF_VERSION = 0; // Version constant; presumably what get_version() returns - confirm in the implementation file.
26  indexer_type index_map; // String key -> shared topk_indexer; exposed read-only through get_indexer().
27  bool exclude = false; // NOTE(review): presumably marks feature_columns as columns to skip rather than use - confirm.
28  std::map<std::string, flex_type_enum> feature_types; // String key -> flex_type_enum; presumably column name -> column type - confirm.
29  flexible_type feature_columns; // Input provided by the user.
30  size_t num_documents; // NOTE(review): no in-class initializer; presumably assigned in fit()/load_version() - confirm.
31 
32  public:
33 
34  /**
35  * Methods that must be implemented in a new transformer model.
36  * -------------------------------------------------------------------------
37  */
38 
39  virtual inline ~tfidf() {}
40 
41  /**
42  * Set one of the options in the model. Use the option manager to set
43  * these options. If an option does not satisfy the conditions that the
44  * option manager has imposed on it, an error is thrown.
45  *
46  * \param[in] _options Options to set
47  */
48  void init_options(const std::map<std::string, flexible_type>&_options) override;
49 
50  /**
51  * Get a version for the object. \return The version number as a size_t.
52  */
53  size_t get_version() const override;
54 
55  /**
56  * Save the object using Turi's oarc. \param[in,out] oarc Output archive to write into.
57  */
58  void save_impl(turi::oarchive& oarc) const override;
59 
60  /**
61  * Load the object using Turi's iarc. \param[in,out] iarc Input archive to read from. \param[in] version Version of the saved object (presumably as reported by get_version() at save time - confirm).
62  */
63  void load_version(turi::iarchive& iarc, size_t version) override;
64 
65 
66  /**
67  * Initialize the transformer. \param[in] _options Options map (string key -> flexible_type).
68  */
69  void init_transformer(const std::map<std::string,
70  flexible_type>& _options) override;
71 
72  /**
73  * Fit the transformer to the given data (the implementation is not in this header).
74  *
75  * \param[in] data (SFrame of data)
76  */
77  void fit(gl_sframe data) override;
78 
79  /**
80  * Transform the given data.
81  *
82  * \param[in] data (SFrame of data)
83  *
84  * Python side interface
85  * ------------------------
86  * This function directly interfaces with "transform" in python.
87  * \return A gl_sframe holding the result of the transformation.
88  */
89  gl_sframe transform(gl_sframe data) override;
90 
91  /**
92  * Fit and transform the given data. Intended as an optimization because
93  * fit and transform are usually called together. The default
94  * implementation calls fit and then transform.
95  * This version materializes the data first, then calls fit and returns transform(data).
96  * \param[in] data (SFrame of data)
97  */
98  gl_sframe fit_transform(gl_sframe data) {
99  data.materialize(); // Done before both passes; presumably so fit and transform do not each re-evaluate a lazy SFrame - confirm.
100  fit(data);
101  return transform(data);
102  }
103 
104  /**
105  * Returns a const reference to the internal document-frequency map
106  */
107  const indexer_type& get_indexer() const {
108  return index_map;
109  }
110 
111  // Functions that all transformers need to register. Can be copied verbatim
112  // for other classes. NOTE(review): this listing's numbering skips 114, 118, 123 and 126, so some registration lines (presumably including the BEGIN/END_CLASS_MEMBER_REGISTRATION macros) appear to be missing from this copy; restore them from the upstream header.
113  // --------------------------------------------------------------------------
115  REGISTER_CLASS_MEMBER_FUNCTION(tfidf::init_transformer, "_options");
116  REGISTER_CLASS_MEMBER_FUNCTION(tfidf::fit, "data");
117  REGISTER_CLASS_MEMBER_FUNCTION(tfidf::fit_transform, "data");
119  REGISTER_CLASS_MEMBER_FUNCTION(tfidf::get_current_options);
120  REGISTER_CLASS_MEMBER_FUNCTION(tfidf::list_fields);
121  REGISTER_NAMED_CLASS_MEMBER_FUNCTION("_get_default_options",
122  tfidf::get_default_options);
124  tfidf::get_value_from_state,
125  "key");
127 
128 };
129 
130 
131 } // feature_engineering
132 } // sdk_model
133 } // turi
134 #endif
#define BEGIN_CLASS_MEMBER_REGISTRATION(python_facing_classname)
#define REGISTER_CLASS_MEMBER_FUNCTION(function,...)
The serialization input archive object which, provided with a reference to an istream, will read from the istream, providing deserialization capabilities.
Definition: iarchive.hpp:60
#define END_CLASS_MEMBER_REGISTRATION
#define REGISTER_NAMED_CLASS_MEMBER_FUNCTION(name, function,...)
The serialization output archive object which, provided with a reference to an ostream, will write to the ostream, providing serialization capabilities.
Definition: oarchive.hpp:80
void transform(S &&input, T &&output, TransformFn transformfn, std::set< size_t > constraint_segments=std::set< size_t >())
Definition: algorithm.hpp:64