Turi Create  4.0
gl_sgraph.hpp
1 /* Copyright © 2017 Apple Inc. All rights reserved.
2  *
3  * Use of this source code is governed by a BSD-3-clause license that can
4  * be found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause
5  */
6 #ifndef TURI_UNITY_GL_SGRAPH_HPP
7 #define TURI_UNITY_GL_SGRAPH_HPP
8 #include <cmath>
9 #include <memory>
10 #include <cstddef>
11 #include <string>
12 #include <iostream>
13 #include <core/data/flexible_type/flexible_type.hpp>
14 #include <model_server/lib/sgraph_triple_apply_typedefs.hpp>
15 #include "gl_gframe.hpp"
16 
17 namespace turi {
18 class gl_sarray;
19 class gl_sframe;
20 class gl_gframe;
21 class unity_sgraph;
22 class unity_sgraph_base;
23 
24 
25 /**
26  * \ingroup group_glsdk
27  * A scalable graph data structure backed by persistent storage (\ref gl_sframe).
28  *
29  * The SGraph (\ref gl_sgraph) data structure allows arbitrary
30  * dictionary attributes on vertices and edges, provides flexible vertex and
31  * edge query functions, and seamless transformation to and from SFrame.
32  *
33  * ### Construction
34  *
35  * There are several ways to create an SGraph. The simplest way is to make an
36  * empty graph then add vertices and edges with the \ref add_vertices
37  * and \ref add_edges methods.
38  *
39  * \code
40  * gl_sframe vertices { {"vid", {1,2,3} };
41  * gl_sframe edges { {"src", {1, 3}}, {"dst", {2, 2}} };
42  * gl_sgraph g = gl_sgraph().add_vertices(vertices, "vid").add_edges(edges, "src", "dst");
43  * \endcode
44  *
45  * Columns in the \ref gl_sframes that are not used as id fields are assumed
46  * to be vertex or edge attributes.
47  *
48  * \ref gl_sgraph objects can also be created from vertex and edge lists stored
49  * in \ref gl_sframe.
50  *
51  * \code
52  * gl_sframe vertices { {"vid", {1,2,3}, {"vdata" : {"foo", "bar", "foobar"}} };
53  * gl_sframe edges { {"src", {1, 3}}, {"dst", {2, 2}}, {"edata": {0., 0.}} };
54  * gl_sgraph g = gl_sgraph(vertices, edges, "vid", "src", "dst");
55  * \endcode
56  *
57  * ### Usage
58  *
59  * The most convenient way to access vertex and edge data is through
60  * the \ref vertices and \ref edges.
61  * Both functions return a GFrame (\ref gl_gframe) object. GFrame is
62  * like SFrame but is bound to the host SGraph,
63  * so that modification to GFrame is applied to SGraph, and vice versa.
64  *
65  * For instance, the following code shows how to add/remove columns
66  * to/from the vertex data. The change is applied to SGraph.
67  * \code
68  * // add a new edge attribute with const value.
69  * g.edges().add_column("likes", 0);
70  *
71  * // remove a vertex attribute.
72  * g.vertices().remove_column("age");
73  *
74  * // transforms one attribute to the other
75  * g.vertices()["likes_fish"] = g.vertices()["__id"] == "cat";
76  * \endcode
77  *
78  * You can also query for specific vertices and edges using the
79  * \ref get_vertices and \ref get_edges functionality.
80  *
81  * For instance,
82  * \code
83  * gl_sframe selected_edges = g.get_edges(
84  * { {0, UNDEFINED}, {UNDEFINED, 1}, {2, 3} },
85  * { {"likes_fish", 1} });
86  * \endcode
87  * selects out going edges of 0, incoming edges of 1, edge 2->3, such that
88  * the edge attribute "like_fish" evaluates to 1.
89  *
90  * In addition, you can perform other non-mutating \ref gl_sframe operations like
91  * groupby, join, logical_filter in the same way, and the returned object will
92  * be \ref gl_sframe.
93  *
94  * In the case where you want to perform vertex-specified operations,
95  * such as "gather"/"scatter" over the neighborhood of each vertex,
96  * we provide \ref triple_apply which is essentially a
97  * "parallel for" over (Vertex, Edge, Vertex) triplets.
98  *
99  * For instance, the following code shows how to implement the update function
100  * for synchronous pagerank.
101  *
102  * \code
103  *
104  * const double RESET_PROB = 0.15;
105  *
106  * void pr_update(edge_triple& triple) {
107  * triple.target["pagerank"] += triple.source["pagerank_prev"] / triple.source["out_degree"];
108  * }
109  *
110  * gl_sgraph pagerank(const gl_sgraph& g, size_t iters) {
111  *
112  * // Count the out degree of each vertex into an gl_sframe.
113  * gl_sframe out_degree = g.get_edges().groupby("__src_id", {{"out_degree", aggregate::COUNT()}});
114  *
115  * // Add the computed "out_degree" to the graph as vertex attribute.
116  * // We exploit that adding the same vertex will overwrite the vertex data.
117  * gl_sgraph g2 = g.add_vertices(out_degree, "__src_id");
118  *
119  * // Initialize pagerank value
120  * g2.vertices()["pagerank"] = 0.0;
121  * g2.vertices()["pagerank_prev"] = 1.0;
122  *
123  * for (size_t i = 0; i < iters; ++i) {
124  * g2.vertices()["pagerank"] = 0.0;
125  * g2 = g2.triple_apply(pr_update, {"pagerank"});
126  * g2.vertices()["pagerank"] = RESET_PROB + (1 - RESET_PROB) * g2.vertices()["pagerank"];
127  * g2.vertices()["pagerank_prev"] = g2.vertices()["pagerank"];
128  * }
129  *
130  * g2.vertices().remove_column("pagerank_prev");
131  * g2.vertices().remove_column("out_degree");
132  * return g2;
133  * }
134  *
135  * \endcode
136  *
137  * ### Mutability
138  *
139  * \ref gl_sgraph is structurally immutable but data (or field) Mutable.
140  * You can add new vertices and edges, but the operation returns a new \ref
141  Ï
142  * \endcode
143  *
144  * ### Example
145  *
146  * Please checkout turicreate/sdk_example/sgraph_example.cpp for a concrete example.
147  */
148 class gl_sgraph {
149 
150  public:
151  gl_sgraph();
152  gl_sgraph(const gl_sgraph&);
153  gl_sgraph(gl_sgraph&&);
154  gl_sgraph& operator=(const gl_sgraph&);
155  gl_sgraph& operator=(gl_sgraph&&);
156 
157  /**
158  * Construct gl_sgraph with given vertex data and edge data.
159  *
160  * \param vertices Vertex data. Must include an ID column with the name specified by
161  * "vid_field." Additional columns are treated as vertex attributes.
162  *
163  * \param edges Edge data. Must include source and destination ID columns as specified
164  * by "src_field" and "dst_field". Additional columns are treated as edge
165  * attributes.
166  *
167  * \param vid_field Optional. The name of vertex ID column in the "vertices" \ref gl_sframe.
168  *
169  * \param src_field Optional. The name of source ID column in the "edges" \ref gl_sframe.
170  *
171  * \param dst_field Optional. The name of destination ID column in the "edges" \ref gl_sframe.
172  *
173  * Example
174  *
175  * \code
176  *
177  * gl_sframe vertices { {"vid", {"cat", "dog", "fossa"}} };
178  * gl_sframe edges { {"source", {"cat", "dog", "dog"}},
179  * {"dest", {"dog", "cat", "foosa"}},
180  * {"relationship", {"dislikes", "likes", "likes"}} };
181  * gl_sgraph g = gl_sgraph(vertices, edges, "vid", "source", "dest");
182  *
183  * std::cout << g.vertices() << std::endl;
184  * std::cout << g.edges() << std::endl;
185  *
186  * // This following code is equivalent.
187  * // gl_sgraph g = gl_sgraph().add_vertices(vertices, "vid") \
188  * // .add_edges(edges, "source", "dest");
189  *
190  * \endcode
191  *
192  * Produces output:
193  *
194  * \code{.txt}
195  *
196  * vertices of the gl_sgraph
197  * +-------+
198  * | __id |
199  * +-------+
200  * | cat |
201  * | dog |
202  * | foosa |
203  * +-------+
204  *
205  * edges of the gl_sgraph
206  * +----------+----------+--------------+
207  * | __src_id | __dst_id | relationship |
208  * +----------+----------+--------------+
209  * | cat | dog | dislikes |
210  * | dog | cat | likes |
211  * | dog | foosa | likes |
212  * +----------+----------+--------------+
213  *
214  * \endcode
215  *
216  * \see \ref gl_sframe
217  */
218  gl_sgraph(const gl_sframe& vertices, const gl_sframe& edges,
219  const std::string& vid_field="__id",
220  const std::string& src_field="__src_id",
221  const std::string& dst_field="__dst_id");
222 
223  /**
224  * Constructs from a saved gl_sgraph.
225  *
226  * \see save
227  */
228  explicit gl_sgraph(const std::string& directory);
229 
230  /*
231  * Implicit Type converters
232  */
233  gl_sgraph(std::shared_ptr<unity_sgraph>);
234  gl_sgraph(std::shared_ptr<unity_sgraph_base>);
235  operator std::shared_ptr<unity_sgraph>() const;
236  operator std::shared_ptr<unity_sgraph_base>() const;
237 
238  /**************************************************************************/
239  /* */
240  /* Graph query */
241  /* */
242  /**************************************************************************/
243 
244  typedef std::pair<flexible_type, flexible_type> vid_pair;
245 
246  /**
247  * Return a collection of edges and their attributes.
248  *
249  * This function is used to find edges by vertex IDs,
250  * filter on edge attributes, or list in-out * neighbors of vertex sets.
251  *
252  * \param ids Optional. Array of pairs of source and target vertices, each
253  * corresponding to an edge to fetch. Only edges in this list are returned.
254  * FLEX_UNDEFINED can be used to designate a wild card. For instance, {{1,3},
255  * {2,FLEX_UNDEFINED}, {FLEX_UNDEFINED, 5}} will fetch the edge 1->3, all
256  * outgoing edges of 2 and all incoming edges of 5. ids may be left empty,
257  * which implies an array of all wild cards.
258  *
259  * \param fields Optional. Dictionary specifying equality constraints on
260  * field values. For example, { {"relationship", "following"} }, returns only
261  * edges whose 'relationship' field equals 'following'. FLEX_UNDEFINED can be
262  * used as a value to designate a wild card. e.g. { {"relationship",
263  * FLEX_UNDEFINED} } will find all edges with the field 'relationship'
264  * regardless of the value.
265  *
266  * Example:
267  * \code
268  * gl_sframe edges{ {"__src_id", {0, 0, 1}},
269  * {"__dst_id", {1, 2, 2}},
270  * {"rating", {5, 2, FLEX_UNDEFINED}} };
271  * gl_sgraph g = gl_sgraph().add_edges(edges);
272  *
273  * std::cout << g.get_edges() << std::endl;
274  * std::cout << g.get_edges({}, { {"rating": 5} }) << std::endl;
275  * std::cout << g.get_edges({ {0, 1}, {1, 2} }) << std::endl;
276  * \endcode
277  *
278  * Produces output:
279  * \code{.txt}
280  * Return all edges in the graph.
281  *
282  * +----------+----------+--------+
283  * | __src_id | __dst_id | rating |
284  * +----------+----------+--------+
285  * | 0 | 2 | 2 |
286  * | 0 | 1 | 5 |
287  * | 1 | 2 | None |
288  * +----------+----------+--------+
289  *
290  * Return edges with the attribute "rating" of 5.
291  *
292  * +----------+----------+--------+
293  * | __src_id | __dst_id | rating |
294  * +----------+----------+--------+
295  * | 0 | 1 | 5 |
296  * +----------+----------+--------+
297  *
298  * Return edges 0 --> 1 and 1 --> 2 (if present in the graph).
299  *
300  * +----------+----------+--------+
301  * | __src_id | __dst_id | rating |
302  * +----------+----------+--------+
303  * | 0 | 1 | 5 |
304  * | 1 | 2 | None |
305  * +----------+----------+--------+
306  * \endcode
307  *
308  * \see edges
309  * \see get_vertices
310  */
311  gl_sframe get_edges(const std::vector<vid_pair>& ids=std::vector<vid_pair>(),
312  const std::map<std::string, flexible_type>& fields=std::map<std::string, flexible_type>()) const;
313 
314  /**
315  * Return a collection of vertices and their attributes.
316  *
317  * \param ids List of vertex IDs to retrieve. Only vertices in this list will be
318  * returned.
319  *
320  * \param fields Dictionary specifying equality constraint on field values. For
321  * example { {"gender", "M"} }, returns only vertices whose 'gender'
322  * field is 'M'. FLEX_UNDEFINED can be used to designate a wild card. For
323  * example, { {"relationship", FLEX_UNDEFINED } } will find all vertices with the
324  * field 'relationship' regardless of the value.
325  *
326  * Example:
327  *
328  * \code
329  *
330  * gl_sframe vertices { {"__id", {0, 1, 2}},
331  * {"gender", {"M", "F", "F"}} };
332  * g = gl_sgraph().add_vertices(vertices);
333  *
334  * std::cout << g.get_vertices() << std::endl;
335  * std::cout << g.get_vertices({0, 2}) << std::endl;
336  * std::cout << g.get_vertices({}, { {"gender", "M"} }) << std::endl;
337  *
338  * \endcode
339  *
340  * Produces output:
341  * \code{.txt}
342  * Return all vertices in the graph.
343  *
344  * +------+--------+
345  * | __id | gender |
346  * +------+--------+
347  * | 0 | M |
348  * | 2 | F |
349  * | 1 | F |
350  * +------+--------+
351  *
352  * Return vertices 0 and 2.
353  *
354  * +------+--------+
355  * | __id | gender |
356  * +------+--------+
357  * | 0 | M |
358  * | 2 | F |
359  * +------+--------+
360  *
361  * Return vertices with the vertex attribute "gender" equal to "M".
362  *
363  * +------+--------+
364  * | __id | gender |
365  * +------+--------+
366  * | 0 | M |
367  * +------+--------+
368  * \endcode
369  *
370  * \see vertices
371  * \see get_edges
372  */
373  gl_sframe get_vertices(const std::vector<flexible_type>& ids=std::vector<flexible_type>(),
374  const std::map<std::string, flexible_type>& fields=std::map<std::string, flexible_type>()) const;
375 
376 
377  /**
378  * Return the number of vertices and edges as a dictionary.
379  *
380  * Example:
381  *
382  * \code
383  * g = gl_sgraph();
384  * std::cout << g.summary()['num_vertices'] << "\n"
385  * << g.summary()['num_edges'] << std::endl;;
386  * \endcode
387  *
388  * Produces output:
389  * \code{.txt}
390  * 0
391  * 0
392  * \endcode
393  *
394  * \see num_vertices
395  * \see num_edges
396  */
397  std::map<std::string, flexible_type> summary() const;
398 
399  /**
400  * Return the number of vertices in the graph.
401  */
402  size_t num_vertices() const;
403 
404  /**
405  * Return the number of edges in the graph.
406  */
407  size_t num_edges() const;
408 
409  /**
410  * Return the names of both vertex fields and edge fields in the graph.
411  */
412  std::vector<std::string> get_fields() const;
413 
414  /**
415  * Return the names of vertex fields in the graph.
416  */
417  std::vector<std::string> get_vertex_fields() const;
418 
419  /**
420  * Return the names of edge fields in the graph.
421  */
422  std::vector<std::string> get_edge_fields() const;
423 
424  /**
425  * Return the types of vertex fields in the graph.
426  */
427  std::vector<flex_type_enum> get_vertex_field_types() const;
428 
429  /**
430  * Return the types of edge fields in the graph.
431  */
432  std::vector<flex_type_enum> get_edge_field_types() const;
433 
434 
435  /**************************************************************************/
436  /* */
437  /* Graph modification */
438  /* */
439  /**************************************************************************/
440  /**
441  * Add vertices to the \ref gl_sgraph and return the new graph.
442  *
443  * Input vertices should be in the form of \ref gl_sframe and
444  * "vid_field" specifies which column contains the vertex ID. Remaining
445  * columns are assumed to hold additional vertex attributes. If these
446  * attributes are not already present in the graph's vertex data, they are
447  * added, with existing vertices acquiring the missing value FLEX_UNDEFINED.
448  *
449  * \param vertices Vertex data. An \ref gl_sframe whose
450  * "vid_field" column contains the vertex IDs.
451  * Additional columns are treated as vertex attributes.
452  *
453  * \param vid_field Optional. Specifies the vertex id column in the vertices
454  * gl_sframe.
455  *
456  * Example:
457  *
458  * \code
459  *
460  * // Add three vertices to an empty graph
461  * gl_sframe vertices { {"vid": {0, 1, 2}},
462  * {"breed": {"labrador", "labrador", "vizsla"}} };
463  * gl_sgraph g = gl_sgraph().add_vertices( vertices, "vid" );
464  *
465  * // Overwrite existing vertex
466  * gl_sgraph g2 = g.add_vertices ( gl_sframe { {"vid", {0}}, {"breed", "poodle"} },
467  * "vid" );
468  *
469  * // Add vertices with new attributes
470  * gl_sgraph g3 = g2.add_vertices (gl_sframe { {"vid", {3}},
471  * {"weight", "20 pounds"} },
472  * "vid" );
473  *
474  * std::cout << g.get_vertices() << std::endl;
475  * std::cout << g2.get_vertices() << std::endl;
476  * std::cout << g3.get_vertices() << std::endl;
477  *
478  * \endcode
479  *
480  * Produces output:
481  * \code{.txt}
482  *
483  * vertices of g1
484  * +------+----------+
485  * | __id | breed |
486  * +------+----------+
487  * | 0 | labrador |
488  * | 2 | vizsla |
489  * | 1 | labrador |
490  * +------+----------+
491  *
492  * vertices of g2
493  * +------+----------+
494  * | __id | breed |
495  * +------+----------+
496  * | 0 | poodle |
497  * | 2 | vizsla |
498  * | 1 | labrador |
499  * +------+----------+
500  *
501  * vertices of g3
502  * +------+----------+-----------+
503  * | __id | breed | weight |
504  * +------+----------+-----------+
505  * | 0 | poodle | None |
506  * | 2 | vizsla | None |
507  * | 1 | labrador | None |
508  * | 4 | None | 20 pounds |
509  * +------+----------+-----------+
510  *
511  * \endcode
512  *
513  * \note The column specified by vid_field will be renamed to "__id"
514  * as the special vertex attribute.
515  *
516  * \note If a vertex id already exists in the graph, adding a new vertex
517  * with the same id will overwrite the entire vertex attributes.
518  *
519  * \note If adding vertices to a non-empty graph, the types of the
520  * existing columns must be the same as those of the existing vertices.
521  *
522  * \note This function returns a new graph, and does not modify the
523  * current graph.
524  *
525  * \see gl_sframe
526  * \see vertices
527  * \see get_vertices
528  * \see add_edges
529  */
530  gl_sgraph add_vertices(const gl_sframe& vertices, const std::string& vid_field) const;
531 
532  /**
533  * Add edges to the \ref gl_sgraph and return the new graph.
534  *
535  * Input edges should be in the form of \ref gl_sframe and
536  * "src_field" and "dst_field" specifies which two columns contain the
537  * id of source vertex IDs and target vertices. Remaining
538  * columns are assumed to hold additional vertex attributes. If these
539  * attributes are not already present in the graph's edge data, they are
540  * added, with existing edges acquiring the missing value FLEX_UNDEFINED.
541  *
542  * \param edges Edge data. An \ref gl_sframe whose
543  * "src_field" and "dst_field" columns contain the source and target
544  * vertex IDs. Additional columns are treated as edge attributes.
545  *
546  * \param src_field Optional. Specifies the source id column in the edges
547  * gl_sframe.
548  *
549  * \param dst_field Optional. Specifies the target id column in the edges
550  * gl_sframe.
551  *
552  * Example:
553  *
554  * \code
555  *
556  * gl_sframe edges { {"source", {"cat", "fish"}},
557  * {"dest", {"fish", "cat"}},
558  * {"relation", {"eat", "eaten"}} };
559  *
560  * gl_sgraph g = gl_sgraph().add_edges(edges, "source", "dest");
561  *
562  * gl_sgraph g2 = g.add_edges(gl_sframe { {"source", {"cat"}},
563  * {"dest", {"fish"}},
564  * {"relation", {"like"}} },
565  * "source", "dest");
566  *
567  * std::cout << g.get_edges() << std::endl;
568  * std::cout << g2.get_edges() << std::endl;
569  *
570  * \endcode
571  *
572  * Produces output:
573  * \code{.txt}
574  *
575  * edges of g
576  * +----------+----------+----------+
577  * | __src_id | __dst_id | relation |
578  * +----------+----------+----------+
579  * | cat | fish | eat |
580  * | fish | cat | eaten |
581  * +----------+----------+----------+
582  *
583  * edges of g2
584  * +----------+----------+----------+
585  * | __src_id | __dst_id | relation |
586  * +----------+----------+----------+
587  * | cat | fish | eat |
588  * | cat | fish | like |
589  * | fish | cat | eaten |
590  * +----------+----------+----------+
591  *
592  * \endcode
593  *
594  * \note The columns specified by "src_id" and "dst_id" will be renamed to
595  * "__src_id", and "__dst_id" respectively as the special edge attributes.
596  *
597  * \note If an edge (identified by src and dst id) already exists in the graph,
598  * adding a new edge with the same src and dst will NOT overwrite the existing
599  * edge. The same edge is DUPLICATED.
600  *
601  * \note If an edge contains new vertices, the new vertices will be automatically
602  * added to the graph with all attributes default to FLEX_UNDEFINED.
603  *
604  * \note If adding edges to a non-empty graph, the types of the
605  * existing columns must be the same as those of the existing edges.
606  *
607  * \note This function returns a new graph, and does not modify the
608  * current graph.
609  *
610  * \see gl_sframe
611  * \see edges
612  * \see get_edges
613  * \see add_vertices
614  */
615  gl_sgraph add_edges(const gl_sframe& edges,
616  const std::string& src_field,
617  const std::string& dst_field) const;
618 
619  /**
620  * Return a new \ref gl_sgraph with only the selected vertex fields.
621  * Other vertex fields are discarded, while fields that do not exist in the
622  * \ref gl_sgraph are ignored. Edge fields remain the same in the new graph.
623  *
624  * \param fields A list of field names to select.
625  *
626  * Example:
627  *
628  * \code
629  *
630  * gl_sframe vertices { {"vid", {0, 1, 2}},
631  * {"breed", {"labrador", "labrador", "vizsla"}},
632  * {"age", {5, 3, 8}} };
633  * g = SGraph().add_vertices(vertices, "vid");
634  * g2 = g.select_vertex_fields({"breed"});
635  *
636  * std::cout << g.vertices() << std::endl;
637  * std::cout << g2.vertices() << std::endl;
638  *
639  * \endcode
640  *
641  * Produces output:
642  * \code{.txt}
643  *
644  * vertices of g
645  * +------+----------+-----+
646  * | __id | breed | age |
647  * +------+----------+-----+
648  * | 0 | labrador | 5 |
649  * | 2 | vizsla | 3 |
650  * | 1 | labrador | 8 |
651  * +------+----------+-----+
652  *
653  * vertices of g2
654  * +------+----------+
655  * | __id | breed |
656  * +------+----------+
657  * | 0 | labrador |
658  * | 2 | vizsla |
659  * | 1 | labrador |
660  * +------+----------+
661  * \endcode
662  *
663  * \note "__id" will always be selected.
664  *
665  * \see get_fields
666  * \see get_vertex_fields
667  * \see get_edge_fields
668  * \see select_fields
669  * \see select_edge_fields
670  */
671  gl_sgraph select_vertex_fields(const std::vector<std::string>& fields) const;
672 
673  /**
674  * Return a new \ref gl_sgraph with only the selected edge fields.
675  * Other edge fields are discarded, while fields that do not exist in the
676  * \ref gl_sgraph are ignored. Vertex fields remain the same in the new graph.
677  *
678  * \param fields A list of field names to select.
679  *
680  * Example:
681  *
682  * \code
683  * gl_sframe edges { {"source", {"Alice", "Bob"}},
684  * {"dest", {"Bob", "Alice"}},
685  * {"follows", {0, 1}},
686  * {"likes", {5, 3}} };
687  *
688  * g = SGraph().add_edges(edges, "source", "dest");
689  * g2 = g.select_edge_fields({"follows"});
690  *
691  * std::cout << g.edges() << std::endl;
692  * std::cout << g2.edges() << std::endl;
693  *
694  * \endcode
695  *
696  * Produces output:
697  * \code{.txt}
698  *
699  * vertices of g
700  *
701  * vertices of g2
702  * \endcode
703  *
704  * \note "__src_id" and "__dst_id" will always be selected.
705  *
706  * \see get_fields
707  * \see get_vertex_fields
708  * \see get_edge_fields
709  * \see select_vertex_fields
710  * \see select_fields
711  */
712  gl_sgraph select_edge_fields(const std::vector<std::string>& fields) const;
713 
714  /**
715  * Return a new \ref gl_sgraph with only the selected fields (both vertex and
716  * edge fields. Other fields are discarded, while fields that do not exist in the
717  * \ref gl_sgraph are ignored.
718  *
719  * \param fields A list of field names to select.
720  *
721  * \note "__id", "__src_id" and "__dst_id" will always be selected.
722  *
723  * \see select_vertex_fields
724  * \see select_edge_fields
725  */
726  gl_sgraph select_fields(const std::vector<std::string>& fields) const;
727 
728 
729  /**************************************************************************/
730  /* */
731  /* SFrame handler for mutating vertices and edges */
732  /* */
733  /**************************************************************************/
734 
735  /**
736  * Returns a convenient "SFrame like" handler for the vertices in this
737  * \ref gl_sgraph.
738  *
739  * While a regular \ref gl_sframe is independent of any \ref gl_sgraph, a
740  * \ref gl_gframe is bound (or points) to an \ref gl_sgraph. Modifying fields
741  * of the returned \ref gl_gframe changes the vertex data of the \ref
742  * gl_sgraph. Also, modifications to the fields in the \ref gl_sgraph, will
743  * be reflected in the \ref gl_gframe.
744  *
745  * Example:
746  *
747  * \code
748  *
749  * gl_sframe vertices { {"vid", {"cat", "dog", "hippo"}},
750  * {"fluffy", {1, 1, FLEX_UNDEFINED}},
751  * {"woof", {FLEX_UNDEFINED, 1, FLEX_UNDEFINED}} };
752  * gl_sgraph g = gl_sgraph().add_vertices(vertices, "vid");
753  *
754  * // Let's modify the vertex data by operating on g.vertices():
755  * // Copy the 'woof' vertex field into a new 'bark' vertex field.
756  * g.vertices()["bark"] = g.vertices["woof"];
757  * std::cout << g.vertices() << std::endl;
758  *
759  * // Remove the 'woof' field.
760  * g.vertices().remove_column("woof");
761  * std::cout << g.vertices() << std::endl;
762  *
763  * // Create a new field 'like_fish'.
764  * g.vertices()['likes_fish'] = g.vertices()['__id'] == "cat";
765  * std::cout << g.vertices() << std::endl;
766  *
767  * // Replace missing values with zeros.
768  * for (const auto& col : g.vertices().column_names()) {
769  * if (col != "__id") {
770  * g.vertices().fillna(col, 0);
771  * }
772  * }
773  * std::cout << g.vertices() << std::endl;
774  * \endcode
775  *
776  * Produces output:
777  * \code{.txt}
778  *
779  * Copy the 'woof' vertex attribute into a new 'bark' vertex attribute:
780  * +-------+--------+------+------+
781  * | __id | fluffy | woof | bark |
782  * +-------+--------+------+ -----+
783  * | dog | 1.0 | 1.0 | 1.0 |
784  * | cat | 1.0 | NA | NA |
785  * | hippo | NA | NA | NA |
786  * +-------+--------+------+ -----+
787  *
788  * Remove the 'woof' attribute:
789  * +-------+--------+------+
790  * | __id | fluffy | bark |
791  * +-------+--------+------+
792  * | dog | 1.0 | 1.0 |
793  * | cat | 1.0 | NA |
794  * | hippo | NA | NA |
795  * +-------+--------+------+
796  *
797  * Create a new field 'likes_fish':
798  * +-------+--------+------+------------+
799  * | __id | fluffy | bark | likes_fish |
800  * +-------+--------+------+------------+
801  * | dog | 1.0 | 1.0 | 0 |
802  * | cat | 1.0 | NA | 1 |
803  * | hippo | NA | NA | 0 |
804  * +-------+--------+------+------------+
805  *
806  * Replace missing values with zeros:
807  * +-------+--------+------+------------+
808  * | __id | fluffy | bark | likes_fish |
809  * +-------+--------+------+------------+
810  * | dog | 1.0 | 1.0 | 0 |
811  * | cat | 1.0 | 0.0 | 1 |
812  * | hippo | 0.0 | 0.0 | 0 |
813  * +-------+--------+------+------------+
814  * \endcode
815  *
816  * \note To preserve the graph structure the "__id" column of this \ref
817  * gl_sframe is read-only.
818  *
819  * \see edges
820  */
822 
823  /**
824  * Returns a convenient "SFrame like" handler for the edges in this
825  * \ref gl_sgraph.
826  *
827  * While a regular \ref gl_sframe is independent of any \ref gl_sgraph, a
828  * \ref gl_gframe is bound (or points) to an \ref gl_sgraph. Modifying fields
829  * of the returned \ref gl_gframe changes the edge data of the \ref
830  * gl_sgraph. Also, modifications to the fields in the \ref gl_sgraph, will
831  * be reflected in the \ref gl_gframe.
832  *
833  * Example:
834  *
835  * \code
836  *
837  * gl_sframe vertices { {"vid", {"cat", "dog", "fossa"}} };
838  * gl_sframe edges { {"source", {"cat", "dog", "dog"}},
839  * {"dest", {"dog", "cat", "foosa"}},
840  * {"relationship", {"dislikes", "likes", "likes"}} };
841  *
842  * gl_sgraph g = gl_sgraph(vertices, edges, "vid", "source", "dest");
843  *
844  * // Add a new edge field "size":
845  * g.edges()["size"] = { {"smaller than", "larger than", "equal to"} };
846  * std::cout << g.edges() << std::endl;
847  *
848  * \endcode
849  *
850  * Produces output:
851  * \code{.txt}
852  *
853  * Add a new edge field "size"
854  * +----------+----------+--------------+--------------+
855  * | __src_id | __dst_id | relationship | size |
856  * +----------+----------+--------------+--------------+
857  * | cat | dog | dislikes | smaller than |
858  * | dog | cat | likes | larger than |
859  * | dog | foosa | likes | equal to |
860  * +----------+----------+--------------+--------------+
861  *
862  * \endcode
863  *
864  * \note To preserve the graph structure the "__src_id" and "__dst_id"
865  * column of this \ref gl_sframe is read-only.
866  *
867  * \see vertices
868  */
869  gl_gframe edges();
870 
871  /**************************************************************************/
872  /* */
873  /* Triple Apply */
874  /* */
875  /**************************************************************************/
876  /**
877  * Apply a user defined lambda function on each of the edge triples, and returns
878  * the new graph.
879  *
880  * An \ref edge_triple is a simple struct containing source, edge and target
881  * of type std::map<std::string, flexible_type>. The lambda function
882  * is applied once on each of the edge_triple in parallel, with locking on
883  * both source and target vertices to prevent race conditions. The following
884  * pseudo code describes the effect of the function:
885  *
886  * \code
887  * INPUT: G
888  * OUTPUT: G'
889  * G' = copy(G)
890  * PARALLEL FOR (source, edge, target) in G':
891  * LOCK (source, target)
892  * edge_triple triple(source, edge, target)
893  * lambda(triple)
894  * FOR f in mutated_fields:
895  * source[f] = triple.source[f] // if f in source
896  * target[f] = triple.target[f] // if f in target
897  * edge[f] = triple.edge[f] // if f in edge
898  * END FOR
899  * UNLOCK (source, target)
900  * END PARALLEL FOR
901  * RETURN G'
902  * \endcode
903  *
904  * This function enables super easy implementations of common graph
905  * computations like degree_count, weighted_pagerank, connected_component, etc.
906  *
907  * Example
908  *
909  * \code
910  * gl_sframe edges { {"source": {0,1,2,3,4}},
911  * {"dest": {1,2,3,4,0}} };
912  * g = turicreate.SGraph().add_edges(edges, "source", "dest");
913  * g.vertices()['degree'] = 0
914  * std::cout << g.vertices() << std::endl;
915  *
916  * auto degree_count_fn = [](edge_triple& triple)->void {
917  * triple.source["degree"] += 1;
918  * triple.target["degree"] += 1;
919  * };
920  *
921  * g2 = g.triple_apply(degree_count_fn, {"degree"});
922  * std::cout << g2.vertices() << std::endl;
923  * \endcode
924  *
925  * Produces output:
926  *
927  * \code{.txt}
928  *
929  * Vertices of g
930  * +------+--------+
931  * | __id | degree |
932  * +------+--------+
933  * | 0 | 0 |
934  * | 2 | 0 |
935  * | 3 | 0 |
936  * | 1 | 0 |
937  * | 4 | 0 |
938  * +------+--------+
939  *
940  * Vertices of g2
941  * +------+--------+
942  * | __id | degree |
943  * +------+--------+
944  * | 0 | 2 |
945  * | 2 | 2 |
946  * | 3 | 2 |
947  * | 1 | 2 |
948  * | 4 | 2 |
949  * +------+--------+
950  *
951  * \endcode
952  *
953  * \note mutated fields must be pre-allocated before triple_apply.
954  *
955  * \see edge_triple
956  * \see lambda_triple_apply_fn
957  */
959  const std::vector<std::string>& mutated_fields) const;
960 
961  /**
962  * Save the sgraph into a directory.
963  */
964  void save(const std::string& directory) const;
965 
966  /**
967  * Save the sgraph using reference to other SFrames.
968  *
969  * \see gl_sframe::save_reference
970  */
971  void save_reference(const std::string& directory) const;
972 
973  friend class gl_gframe;
974 
975  public:
976  /**************************************************************************/
977  /* */
978  /* Lower level fields modifier primitives */
979  /* */
980  /**************************************************************************/
981 
982  /**
983  * Add a new vertex field with given field name and column_data.
984  * Using \ref vertices() is preferred.
985  *
986  * \param column_data gl_sarray of size equals to num_vertices. The order of
987  * column_data is aligned with the order which vertices are stored.
988  *
989  * \param field name of the new vertex field.
990  */
991  void add_vertex_field(gl_sarray column_data, const std::string& field);
992 
993  /**
994  * Add a new vertex field filled with constant data.
995  * Using \ref vertices() is preferred.
996  *
997  * \param column_data the constant data to fill the new field column.
998  *
999  * \param field name of the new vertex field.
1000  */
1001  void add_vertex_field(const flexible_type& column_data, const std::string& field);
1002 
1003  /**
1004  * Removes the vertex field
1005  *
1006  * \param name of the field to be removed
1007  */
1008  void remove_vertex_field(const std::string& field);
1009 
1010  /**
1011  * Renames the vertex fields
1012  *
1013  * \param oldnames list of names of the fields to be renamed
1014  *
1015  * \param newnames list of new names of the fields, aligned with oldnames.
1016  */
1017  void rename_vertex_fields(const std::vector<std::string>& oldnames,
1018  const std::vector<std::string>& newnames);
1019 
1020  /**
1021  * Add a new edge field with given field name and column_data.
1022  * Using \ref edges() is preferred.
1023  *
1024  * \param column_data gl_sarray of size equals to num_vertices. The order of
1025  * column_data is aligned with the order which vertices are stored.
1026  *
1027  * \param field name of the new edge field.
1028  */
1029  void add_edge_field(gl_sarray column_data, const std::string& field);
1030 
1031  /**
1032  * Add a new edge field filled with constant data.
1033  * Using \ref edges() is preferred.
1034  *
1035  * \param column_data the constant data to fill the new field column.
1036  *
1037  * \param field name of the new edge field.
1038  */
1039  void add_edge_field(const flexible_type& column_data, const std::string& field);
1040 
1041  /**
1042  * Removes the edge field
1043  *
1044  * \param name of the field to be removed
1045  */
1046  void remove_edge_field(const std::string& field);
1047 
1048  /**
1049  * Renames the edge fields
1050  *
1051  * \param oldnames list of names of the fields to be renamed
1052  *
1053  * \param newnames list of new names of the fields, aligned with oldnames.
1054  */
1055  void rename_edge_fields(const std::vector<std::string>& oldnames,
1056  const std::vector<std::string>& newnames);
1057 
1058  /**
1059  * Retrieves a pointer to the underlying unity_sgraph
1060  */
1061  virtual std::shared_ptr<unity_sgraph> get_proxy() const;
1062 
1063  private:
1064  void instantiate_new();
1065  void instantiate_from_proxy(std::shared_ptr<unity_sgraph>);
1066  void _swap_vertex_fields(const std::string& field1, const std::string& field2);
1067  void _swap_edge_fields(const std::string& field1, const std::string& field2);
1068 
1069  private:
1070  std::shared_ptr<unity_sgraph> m_sgraph;
1071 };
1072 
1073 } // namespace turi
1074 #endif
void rename_vertex_fields(const std::vector< std::string > &oldnames, const std::vector< std::string > &newnames)
void remove_vertex_field(const std::string &field)
std::vector< std::string > get_vertex_fields() const
gl_sgraph select_edge_fields(const std::vector< std::string > &fields) const
std::map< std::string, flexible_type > summary() const
void add_edge_field(gl_sarray column_data, const std::string &field)
std::vector< std::string > get_fields() const
gl_sgraph add_vertices(const gl_sframe &vertices, const std::string &vid_field) const
gl_sgraph select_vertex_fields(const std::vector< std::string > &fields) const
gl_sframe get_vertices(const std::vector< flexible_type > &ids=std::vector< flexible_type >(), const std::map< std::string, flexible_type > &fields=std::map< std::string, flexible_type >()) const
gl_gframe edges()
gl_sgraph select_fields(const std::vector< std::string > &fields) const
gl_sframe get_edges(const std::vector< vid_pair > &ids=std::vector< vid_pair >(), const std::map< std::string, flexible_type > &fields=std::map< std::string, flexible_type >()) const
void save_reference(const std::string &directory) const
size_t num_edges() const
gl_sgraph add_edges(const gl_sframe &edges, const std::string &src_field, const std::string &dst_field) const
std::vector< std::string > get_edge_fields() const
std::vector< flex_type_enum > get_vertex_field_types() const
size_t num_vertices() const
std::function< void(edge_triple &)> lambda_triple_apply_fn
gl_sgraph triple_apply(const lambda_triple_apply_fn &lambda, const std::vector< std::string > &mutated_fields) const
void add_vertex_field(gl_sarray column_data, const std::string &field)
virtual std::shared_ptr< unity_sgraph > get_proxy() const
std::vector< flex_type_enum > get_edge_field_types() const
void rename_edge_fields(const std::vector< std::string > &oldnames, const std::vector< std::string > &newnames)
void save(const std::string &directory) const
gl_gframe vertices()
void remove_edge_field(const std::string &field)