Turi Create  4.0
unity_sgraph.hpp
1 /* Copyright © 2017 Apple Inc. All rights reserved.
2  *
3  * Use of this source code is governed by a BSD-3-clause license that can
4  * be found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause
5  */
6 #ifndef TURI_UNITY_SGRAPH_HPP
7 #define TURI_UNITY_SGRAPH_HPP
8 #include <model_server/lib/api/unity_graph_interface.hpp>
9 #include <model_server/lib/api/unity_sframe_interface.hpp>
10 #include <model_server/lib/sgraph_triple_apply_typedefs.hpp>
11 #include <core/storage/sgraph_data/sgraph_constants.hpp>
12 #include <core/storage/sframe_data/dataframe.hpp>
13 #include <memory>
14 #include <core/parallel/mutex.hpp>
15 
16 namespace turi {
17 
18 // forward declarations
19 template <typename T>
20 class lazy_eval_operation_dag;
21 template <typename T>
22 class lazy_eval_future;
23 
24 class sframe;
25 class sgraph;
26 
27 typedef lazy_eval_operation_dag<sgraph> unity_graph_dag_type;
28 typedef lazy_eval_future<sgraph> sgraph_future;
29 
30 /**
31  * \ingroup unity
32  * The \ref turi::unity_sgraph and \ref turi::unity_sgraph_base classes
33  * implement a graph object on the server side which is exposed to the
34  * client via the cppipc system.
35  *
36  * The unity_sgraph is a lazily evaluated, immutable
37  * graph datastructure where most operations do not take time, and instead,
38  * the graph is only fully constructed when accessed. Furthermore we can
39  * further exploit immutability for efficiency, by allowing graphs to share
40  * data/structure/etc through the use of shared_ptr, etc.
41  */
43  public:
44  /// Global lazy evaluation DAG object
45  static unity_graph_dag_type* dag_singleton;
46 
47  /// Gets the lazy evaluation DAG object
48  static unity_graph_dag_type* get_dag();
49 
50  static const char* GRAPH_MAGIC_HEADER;
51 
52  /// Default constructor
53  explicit unity_sgraph(size_t npartitions = SGRAPH_DEFAULT_NUM_PARTITIONS);
54 
55  /**
56  * Constructs a unity_sgraph by taking over an existing sgraph
57  * object.
58  */
59  unity_sgraph(std::shared_ptr<sgraph>);
60 
61  /**
62  * Makes a copy of a graph. (This is a shallow copy. The resultant graphs
63  * share pointers. But since graphs are immutable, this is safe and can be
64  * treated like a deep copy.)
65  */
66  explicit unity_sgraph(const unity_sgraph&) = default;
67 
68  /**
69  * Copy assignment.
70  */
71  unity_sgraph& operator=(const unity_sgraph&) = default;
72 
73  /// Destructor
74  virtual ~unity_sgraph();
75 
76  /**
77  * Returns a new copy of this graph object
78  */
79  std::shared_ptr<unity_sgraph_base> clone();
80 
81  /**************************************************************************/
82  /* */
83  /* UNITY GRAPH BASE INTERFACE API */
84  /* */
85  /**************************************************************************/
86 
87  /**
88  * Returns a sframe of vertices satisfying certain constraints.
89  *
90  * \param vid_vec A list of vertices. If provided, will only return vertices
91  * in the list. Otherwise all vertices will be considered.
92  * \param field_constraint A mapping of string-->value. Only vertices which
93  * contain a field with the particular value will be returned.
94  * value can be UNDEFINED, in which case vertices simply must
95  * contain the particular field.
96  * \param group The vertex group id.
97  */
98  std::shared_ptr<unity_sframe_base> get_vertices(
99  const std::vector<flexible_type>& vid_vec = std::vector<flexible_type>(),
100  const options_map_t& field_constraint = options_map_t(),
101  size_t group = 0);
102 
103  /**
104  * Returns a sframe of edges satisfying certain constraints.
105  * source_vids and target_vids arrays must match up in length, and denote
106  * source->target edges. For instance: i-->j will return the edge i-->j if it
107  * exists. Wildcards are supported by setting the flexible_type to UNDEFINED.
108  * For instance, i-->UNDEFINED will match every edge with source i. And
109  * UNDEFINED-->j will match every edge with target j.
110  *
111  * The returned edge will have source vertices from one group and
112  * target vertices from another group (can be the same).
113  *
114  * The edge must further match the values specified by the field_constraint.
115  *
116  * \param source_vids A list of source vertices or wildcards (UNDEFINED).
117  * Must match the length of target_vids. See above for
118  * semantics.
119  * \param target_vids A list of target vertices or wildcards (UNDEFINED).
120  * Must match the length of source_vids. See above for
121  * semantics.
122  * \param field_constraint A mapping of string-->value. Only edges which
123  * contain a field with the particular value will be returned.
124  * value can be UNDEFINED, in which case edges simply must
125  * contain the particular field.
126  *
127  * \param groupa The vertex group id for source vertices.
128  *
129  * \param groupb The vertex group id for target vertices.
130  */
131  std::shared_ptr<unity_sframe_base> get_edges(const std::vector<flexible_type>& source_vids=std::vector<flexible_type>(),
132  const std::vector<flexible_type>& target_vids=std::vector<flexible_type>(),
133  const options_map_t& field_constraint=options_map_t(),
134  size_t groupa = 0, size_t groupb = 0);
135 
136  /**
137  * Returns a summary of the basic graph information such as the number of
138  * vertices / number of edges.
139  */
141 
142  /**
143  * Adds each row of the sframe as a new vertex; A new graph
144  * corresponding to the current graph with new vertices added will be
145  * returned. Columns of the dataframes are treated as fields (new fields
146  * will be created as appropriate). The column with name 'id_field_name'
147  * will be treated as the vertex ID field. The column with name
148  * 'id_field_name' must therefore exist in the 'vertices' dataframe. If the
149  * vertex with the given ID already exists, all field values will
150  * overwrite. This function can therefore be also used to perform
151  * modification of graph data. An exception is thrown on failure.
152  */
153  std::shared_ptr<unity_sgraph_base> add_vertices(
154  std::shared_ptr<unity_sframe_base> vertices,
155  const std::string& id_field_name, size_t group = 0);
156 
157 
158  /**
159  * Adds each row of the sframe as a new edge; A new graph
160  * corresponding to the current graph with new edges added will be
161  * returned. Columns of the dataframes are treated as fields (new fields
162  * will be created as appropriate). The columns with name 'source_field_name'
163  * and 'target_field_name' will be used to denote the source
164  * and target vertex IDs for the edge. These columns must therefore exist
165  * in the 'edges' dataframe. If the vertex does not exist, it will be
166  * created. Unlike add_vertices, edge "merging" is not performed, but
167  * instead, multiple edges can be created between any pair of vertices.
168  * Throws an exception on failure.
169  */
170  std::shared_ptr<unity_sgraph_base> add_edges(
171  std::shared_ptr<unity_sframe_base> edges,
172  const std::string& source_field_name,
173  const std::string& target_field_name,
174  size_t groupa = 0, size_t groupb = 0);
175 
176  /**
177  * Returns a list of the vertex fields in the graph
178  */
179  std::vector<std::string> get_vertex_fields(size_t group = 0);
180 
181  /**
182  * Returns a list of the vertex field types in the graph
183  */
184  std::vector<flex_type_enum> get_vertex_field_types(size_t group = 0);
185 
186  /**
187  * Returns a list of the edge fields in the graph
188  */
189  std::vector<std::string> get_edge_fields(size_t groupa = 0, size_t groupb = 0);
190 
191  /**
192  * Returns a list of the edge field types in the graph
193  */
194  std::vector<flex_type_enum> get_edge_field_types(size_t groupa = 0, size_t groupb = 0);
195 
196  /**
197  * Returns a new graph corresponding to the current graph with only
198  * the fields listed in "fields".
199  */
200  std::shared_ptr<unity_sgraph_base> select_vertex_fields(
201  const std::vector<std::string>& fields, size_t group=0);
202 
203  /**
204  * Returns a new graph corresponding to the current graph with the field
205  * "field" copied to a new field "newfield".
206  */
207  std::shared_ptr<unity_sgraph_base> copy_vertex_field(
208  std::string field, std::string newfield, size_t group=0);
209 
210  /**
211  * Returns a new graph corresponding to the current graph with the field
212  * "field" deleted.
213  */
214  std::shared_ptr<unity_sgraph_base> delete_vertex_field(
215  std::string field, size_t group=0);
216 
217  /**
218  * Add a new vertex field with column_data and return as a new graph.
219  */
220  std::shared_ptr<unity_sgraph_base> add_vertex_field(
221  std::shared_ptr<unity_sarray_base> column_data, std::string field);
222 
223  /**
224  * Rename the vertex fields whose names are in oldnames to the corresponding new names.
225  * Return the new graph.
226  */
227  std::shared_ptr<unity_sgraph_base> rename_vertex_fields(
228  const std::vector<std::string>& oldnames,
229  const std::vector<std::string>& newnames);
230 
231  /**
232  * Switch the column order of field1 and field2 in the vertex data.
233  * Return the new graph.
234  */
235  std::shared_ptr<unity_sgraph_base> swap_vertex_fields(
236  const std::string& field1, const std::string& field2);
237 
238 
239  /**
240  * Returns a new graph corresponding to the current graph with only
241  * the fields listed in "fields".
242  */
243  std::shared_ptr<unity_sgraph_base> select_edge_fields(
244  const std::vector<std::string>& fields,
245  size_t groupa=0, size_t groupb=0);
246 
247  /**
248  * Returns a new graph corresponding to the current graph with the field
249  * "field" copied to a new field "newfield".
250  */
251  std::shared_ptr<unity_sgraph_base> copy_edge_field(
252  std::string field, std::string newfield,
253  size_t groupa=0, size_t groupb=0);
254 
255  /**
256  * Returns a new graph corresponding to the current graph with the field
257  * "field" deleted.
258  */
259  std::shared_ptr<unity_sgraph_base> delete_edge_field(
260  std::string field, size_t groupa=0, size_t groupb=0);
261 
262  /**
263  * Add a new edge field with column_data and return as a new graph.
264  */
265  std::shared_ptr<unity_sgraph_base> add_edge_field(
266  std::shared_ptr<unity_sarray_base> column_data, std::string field);
267 
268  /**
269  * Rename the edge fields whose names are in oldnames to the corresponding new names.
270  * Return the new graph.
271  */
272  std::shared_ptr<unity_sgraph_base> rename_edge_fields(
273  const std::vector<std::string>& oldnames,
274  const std::vector<std::string>& newnames);
275 
276  /**
277  * Switch the column order of field1 and field2 in the edge data.
278  * Return the new graph.
279  */
280  std::shared_ptr<unity_sgraph_base> swap_edge_fields(const std::string& field1, const std::string& field2);
281 
282 
283  std::shared_ptr<unity_sgraph_base> lambda_triple_apply(const std::string& lambda_str,
284  const std::vector<std::string>& mutated_fields);
285 
286  std::shared_ptr<unity_sgraph_base> lambda_triple_apply_native(
287  const lambda_triple_apply_fn& lambda,
288  const std::vector<std::string>& mutated_fields);
289 
290  std::shared_ptr<unity_sgraph_base> lambda_triple_apply_native(
291  const function_closure_info& toolkit_fn_name,
292  const std::vector<std::string>& mutated_fields);
293 
294  /**************************************************************************/
295  /* */
296  /* Internal Functions */
297  /* */
298  /**************************************************************************/
299  /**
300  * Internal
301  *
302  * Returns a reference to the underlying sgraph.
303  *
304  * Note: This operation will force the lazy operations to be performed.
305  */
306  sgraph& get_graph() const;
307 
308  /**
309  * Internal
310  *
311  * Deep serialization.
312  */
313  void save(oarchive& oarc) const;
314 
315  /**
316  * Internal
317  *
318  * Save to oarchive using sframe reference save.
319  */
320  void save_reference(oarchive& oarc) const;
321 
322  /**
323  * Internal
324  *
325  * Deep deserialization.
326  */
327  void load(iarchive& iarc);
328 
329  /**
330  * Save the sgraph using reference to SFrames in other locations.
331  *
332  * \see unity_sframe::save_frame_reference
333  */
334  void save_reference(std::string target_dir) const;
335 
336  /**
337  * Saves the graph with the specified name to a given file in a
338  * non-portable binary format. File can be on disk, or on HDFS.
339  *
340  * Supported formats are 'bin', 'json', 'csv'.
341  *
342  * Returns true on success, false on failure.
343  */
344  bool save_graph(std::string target_dir, std::string format);
345 
346  /**
347  * Loads the graph from the given file in a
348  * non-portable binary format. File can be on disk, or on HDFS.
349  * Returns true on success, false on failure.
350  */
351  bool load_graph(std::string target_dir);
352 
353 
354  private:
355  mutable std::shared_ptr<sgraph_future> m_graph;
356  mutex dag_mtx;
357 
358  private:
359  void fast_validate_add_vertices(const sframe& vertices,
360  std::string id_field,
361  size_t group) const;
362 
363  void fast_validate_add_edges(const sframe& vertices,
364  std::string src_field,
365  std::string dst_field,
366  size_t groupa, size_t groupb) const;
367  /**
368  * Returns a lazy edge sframe containing all the edges from groupa to groupb.
369  */
370  std::shared_ptr<unity_sframe_base> get_edges_lazy(size_t groupa = 0, size_t groupb = 0);
371 };
372 } // namespace turi
373 #endif
std::vector< flex_type_enum > get_vertex_field_types(size_t group=0)
std::vector< std::string > get_edge_fields(size_t groupa=0, size_t groupb=0)
sgraph & get_graph() const
The serialization input archive object which, provided with a reference to an istream, will read from the istream, providing deserialization capabilities.
Definition: iarchive.hpp:60
std::shared_ptr< unity_sgraph_base > add_edges(std::shared_ptr< unity_sframe_base > edges, const std::string &source_field_name, const std::string &target_field_name, size_t groupa=0, size_t groupb=0)
std::shared_ptr< unity_sgraph_base > copy_vertex_field(std::string field, std::string newfield, size_t group=0)
sframe group(sframe sframe_in, std::string key_column)
std::shared_ptr< unity_sgraph_base > swap_vertex_fields(const std::string &field1, const std::string &field2)
bool load_graph(std::string target_dir)
std::shared_ptr< unity_sgraph_base > copy_edge_field(std::string field, std::string newfield, size_t groupa=0, size_t groupb=0)
std::shared_ptr< unity_sgraph_base > add_vertices(std::shared_ptr< unity_sframe_base > vertices, const std::string &id_field_name, size_t group=0)
static unity_graph_dag_type * get_dag()
Gets the lazy evaluation DAG object.
std::vector< flex_type_enum > get_edge_field_types(size_t groupa=0, size_t groupb=0)
options_map_t summary()
void save_reference(oarchive &oarc) const
unity_sgraph(size_t npartitions=SGRAPH_DEFAULT_NUM_PARTITIONS)
Default constructor.
std::shared_ptr< unity_sgraph_base > select_vertex_fields(const std::vector< std::string > &fields, size_t group=0)
std::shared_ptr< unity_sgraph_base > add_edge_field(std::shared_ptr< unity_sarray_base > column_data, std::string field)
std::shared_ptr< unity_sgraph_base > rename_edge_fields(const std::vector< std::string > &oldnames, const std::vector< std::string > &newnames)
std::shared_ptr< unity_sgraph_base > swap_edge_fields(const std::string &field1, const std::string &field2)
std::shared_ptr< unity_sframe_base > get_vertices(const std::vector< flexible_type > &vid_vec=std::vector< flexible_type >(), const options_map_t &field_constraint=options_map_t(), size_t group=0)
bool save_graph(std::string target_dir, std::string format)
std::shared_ptr< unity_sgraph_base > add_vertex_field(std::shared_ptr< unity_sarray_base > column_data, std::string field)
std::shared_ptr< unity_sgraph_base > delete_vertex_field(std::string field, size_t group=0)
void save(oarchive &oarc) const
size_t SGRAPH_DEFAULT_NUM_PARTITIONS
std::shared_ptr< unity_sgraph_base > clone()
unity_sgraph & operator=(const unity_sgraph &)=default
void load(iarchive &iarc)
std::shared_ptr< unity_sgraph_base > delete_edge_field(std::string field, size_t groupa=0, size_t groupb=0)
std::function< void(edge_triple &)> lambda_triple_apply_fn
static unity_graph_dag_type * dag_singleton
Global lazy evaluation DAG object.
std::shared_ptr< unity_sframe_base > get_edges(const std::vector< flexible_type > &source_vids=std::vector< flexible_type >(), const std::vector< flexible_type > &target_vids=std::vector< flexible_type >(), const options_map_t &field_constraint=options_map_t(), size_t groupa=0, size_t groupb=0)
std::map< std::string, flexible_type > options_map_t
Definition: options_map.hpp:14
virtual ~unity_sgraph()
Destructor.
The serialization output archive object which, provided with a reference to an ostream, will write to the ostream, providing serialization capabilities.
Definition: oarchive.hpp:80
std::vector< std::string > get_vertex_fields(size_t group=0)
std::shared_ptr< unity_sgraph_base > select_edge_fields(const std::vector< std::string > &fields, size_t groupa=0, size_t groupb=0)
std::shared_ptr< unity_sgraph_base > rename_vertex_fields(const std::vector< std::string > &oldnames, const std::vector< std::string > &newnames)