Turi Create  4.0
sgraph.hpp
1 /* Copyright © 2017 Apple Inc. All rights reserved.
2  *
3  * Use of this source code is governed by a BSD-3-clause license that can
4  * be found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause
5  */
6 #ifndef TURI_SGRAPH_SGRAPH_HPP
7 #define TURI_SGRAPH_SGRAPH_HPP
8 #include <memory>
9 #include <core/data/flexible_type/flexible_type.hpp>
10 #include <core/storage/sgraph_data/sgraph_constants.hpp>
11 #include <core/storage/sframe_data/sframe.hpp>
12 #include <sparsehash/sparse_hash_map>
13 
14 namespace turi {
15 
16 /**
17  * \ingroup sgraph_physical
18  * \addtogroup sgraph_main Main SGraph Objects
19  * \{
20  */
21 
22 
23 /**
24  * An On disk representation of a graph.
25  *
26  * The actual on disk representation looks like the following:
27  *
28  * Where the partition size (\ref partition_size) is 4, we shuffle all the
29  * vertices into 4 SFrames, each of 1 segment. The shuffling is performed by
30  * simply hashing the vertex ID into one of the buckets.
31  *
32  * The edges however, are placed into 4*4=16 SFrames, each of 1 segment.
33  * Each edge (src,dst) is placed into the (hash(src) % 4) * 4 + hash(dst) % 4.
34  * Essentially the Edge SFrame can be thought of as cutting the adjacency matrix
35  * into a 4x4 grid.
36  *
37  * The result is that the edges in the block (0,0) is adjacent only to the
38  * vertices in the first block (0), the edges in block (0,1) is adjacent only
39  * to the vertices in block 0 and 1 and so on.
40  *
41  * \verbatim
42  *
43  * Vertices Edges
44  * +---+ +-------+-------+-------+-------+
45  * | | | | | | |
46  * | 0 | | (0,0) | (0,1) | (0,2) | (0,3) |
47  * +---+ +-------+-------+-------+-------+
48  * | | | | | | |
49  * | 1 | | (1,0) | (1,1) | (1,2) | (1,3) |
50  * +---+ +-------+-------+-------+-------+
51  * | | | | | | |
52  * | 2 | | (2,0) | (2,1) | (2,2) | (2,3) |
53  * +---+ +-------+-------+-------+-------+
54  * | | | | | | |
55  * | 3 | | (3,0) | (3,1) | (3,2) | (3,3) |
56  * +---+ +-------+-------+-------+-------+
57  *
58  * \endverbatim
59  *
60  * Vertices are partitioned into user-defined semantic groups. Each vertex
61  * can only show up in one group, and each vertex is uniquely identified by
62  * the combination of the group ID and the Vertex ID. The vertex ID type MUST
63  * be consistent and identical across all groups.
64  *
65  * Vertex grouping is implemented by having multiple of the vertex blocks, one
66  * for each group. Thus m_vertex_groups[0] contains a vector of SFrames for
67  * vertex group 0 and so on.
68  *
69  * Edges are not grouped and they may span any collection of vertices.
70  * However, to be able to efficiently slice vertices and edges across groups,
71  * there are g*g edges groups, where m_edge_groups[{a,b}]
72  * contain all the edges between group a and group b.
73  */
74 class sgraph {
75  public:
76 
77  static const char* DEFAULT_GROUP_NAME;
78  static const char* VID_COLUMN_NAME;
79  static const char* SRC_COLUMN_NAME;
80  static const char* DST_COLUMN_NAME;
81  static const flex_type_enum INTERNAL_ID_TYPE;
82 
83  explicit sgraph(size_t num_partitions = SGRAPH_DEFAULT_NUM_PARTITIONS);
84 
85  typedef std::map<std::string, flexible_type> options_map_t;
86 
87 
88  enum class edge_direction {
89  IN_EDGE = 1, OUT_EDGE = 2, ANY_EDGE = 3
90  };
91 
92  struct vertex_partition_address {
93  size_t group = 0, partition = 0;
94  vertex_partition_address() = default;
95  vertex_partition_address(size_t group, size_t partition):
96  group(group),partition(partition) { }
97 
98  bool operator==(const vertex_partition_address& other) const {
99  return group == other.group && partition == other.partition;
100  }
101  bool operator<(const vertex_partition_address& other) const {
102  return group < other.group ||
103  (group == other.group && partition < other.partition);
104  }
105  };
106 
107  class edge_partition_address {
108  public:
109  size_t src_group = 0, dst_group = 0, partition1 = 0, partition2 = 0;
110 
111  edge_partition_address() = default;
112  edge_partition_address(size_t src_group, size_t dst_group,
113  size_t partition1, size_t partition2) :
114  src_group(src_group), dst_group(dst_group),
115  partition1(partition1), partition2(partition2) { }
116 
117  vertex_partition_address get_src_vertex_partition() {
118  vertex_partition_address ret;
119  ret.group = src_group; ret.partition = partition1;
120  return ret;
121  }
122 
123  vertex_partition_address get_dst_vertex_partition() {
124  vertex_partition_address ret;
125  ret.group = dst_group; ret.partition = partition2;
126  return ret;
127  }
128  };
129 
130 
131  public:
132  sgraph(const sgraph& other) = default;
133 
134  sgraph& operator=(const sgraph& other) = default;
135 
136  /**
137  * Returns a sframe of vertices satisfying the id and field constraints.
138  */
139  sframe get_vertices(const std::vector<flexible_type>& vid_vec = {},
140  const options_map_t& field_constraint = options_map_t(),
141  size_t groupid = 0) const;
142 
143  /**
144  * Returns a dataframe of vertices satisfying the id
145  * and field constraints. source_vids may contain UNDEFINED as wildcards
146  * and target_vids may contain UNDEFINED as wildcards.
147  * Each edge will only be represented once in the output.
148  *
149  * If source_vids and target_vids are empty, a universal
150  * "UNDEFINED-->UNDEFINED" query is assumed
151  */
152  sframe get_edges(const std::vector<flexible_type>& source_vids = {},
153  const std::vector<flexible_type>& target_vids = {},
154  const options_map_t& field_constraint = options_map_t(),
155  size_t groupa = 0, size_t groupb = 0) const;
156 
157  /**
158  * Returns a list of fields for given vertex group in the graph.
159  */
160  std::vector<std::string> get_vertex_fields(size_t groupid = 0) const;
161 
162  /**
163  * Returns a list of field types for given vertex group in the graph.
164  */
165  std::vector<flex_type_enum> get_vertex_field_types(size_t groupid = 0) const;
166 
167  /**
168  * Returns a list of fields for given edge group in the graph.
169  */
170  std::vector<std::string> get_edge_fields(size_t groupa = 0, size_t groupb = 0) const;
171 
172  /**
173  * Returns a list of field types for given edge group in the graph.
174  */
175  std::vector<flex_type_enum> get_edge_field_types(size_t groupa = 0, size_t groupb = 0) const;
176 
177  /**
178  * Adds vertices to the graph.
179  *
180  * Note: The dataframe must contain the id_field_name
181  */
182  bool add_vertices(const dataframe_t& vertices,
183  const std::string& id_field_name,
184  size_t group = 0);
185  /**
186  * \overload
187  */
188  bool add_vertices(sframe vertices,
189  const std::string& id_field_name,
190  size_t group = 0);
191 
192  /**
193  * Adds edges to the graph.
194  *
195  * Note: The dataframe must contain the {source, target}_field_name
196  */
197  bool add_edges(const dataframe_t& edges,
198  const std::string& source_field_name,
199  const std::string& target_field_name,
200  size_t groupa = 0, size_t groupb = 0);
201  /**
202  * \overload
203  */
204  bool add_edges(sframe edges,
205  const std::string& source_field_name,
206  const std::string& target_field_name,
207  size_t groupa = 0, size_t groupb = 0);
208 
209  /**
210  * Copies data from "field" to a new field with name "new_field" for a vertex group.
211  * If the new_field already exists, it will be replaced.
212  */
213  bool copy_vertex_field(const std::string& field, const std::string& new_field,
214  size_t group = 0);
215 
216  /**
217  * Similar to copy_vertex_field but work on edge data.
218  * If the new_field already exists, it will be replaced.
219  */
220  bool copy_edge_field(const std::string& field, const std::string& new_field,
221  size_t groupa = 0, size_t groupb = 0);
222 
223  /**
224  * Initialize a vertex field of group with const value.
225  * Creates a new column if the field does not exist.
226  */
227  bool init_vertex_field(const std::string& field, const flexible_type& init_value, size_t group = 0);
228 
229  /**
230  * Deletes a field from vertex data. Returns true on success.
231  * False on failure.
232  */
233  bool remove_vertex_field(const std::string& field, size_t group = 0);
234 
235  /**
236  * Subselect fields in the vertex sframe.
237  */
238  bool select_vertex_fields(const std::vector<std::string>& fields, size_t group = 0);
239 
240  /**
241  * Deletes a field from edge data. Returns true on success.
242  * False on failure.
243  */
244  bool remove_edge_field(const std::string& field, size_t groupa = 0, size_t groupb = 0);
245 
246  /**
247  * Initialize an edge field of group with const value.
248  * Creates a new column if the field does not exist.
249  */
250  bool init_edge_field(const std::string& field, const flexible_type& init_value, size_t groupa = 0, size_t groupb = 0);
251 
252  /**
253  * Subselect fields in the edge sframe.
254  */
255  bool select_edge_fields(const std::vector<std::string>& fields, size_t groupa = 0, size_t groupb = 0);
256 
257  /**
258  * Resets the graph
259  */
260  bool clear();
261 
262  /**
263  * Returns the collection of SFrames containing all the vertices
264  * in group groupid.
265  *
266  * This function can be used as the left hand side of an assignment.
267  * i.e.
268  * \code
269  * vertex_group(group) = blah
270  * \endcode
271  * The caller must guarantee that blah is of the right size. (i.e.
272  * blah.size() == get_num_partitions() )
273  */
274  inline std::vector<sframe>& vertex_group(size_t groupid = 0) {
275  ASSERT_LT(groupid, m_num_groups);
276  return m_vertex_groups[groupid];
277  }
278 
279  /**
280  * Returns the collection of SFrames containing all the vertices
281  * in group groupid
282  */
283  inline const std::vector<sframe>& vertex_group(size_t groupid = 0) const {
284  ASSERT_LT(groupid, m_num_groups);
285  return m_vertex_groups[groupid];
286  }
287 
288 
289  /**
290  * Returns the collection of SFrames containing all the edges
291  * between vertex group groupa and vertex group groupb.
292  *
293  * This function can be used as the left hand side of an assignment.
294  * i.e.
295  * \code
296  * edge_group(group) = blah
297  * \endcode
298  * The caller must guarantee that blah is of the right size. (i.e.
299  * blah.size() == get_num_partitions() * get_num_partitions() )
300  */
301  inline std::vector<sframe>& edge_group(size_t groupa = 0,
302  size_t groupb = 0) {
303  ASSERT_LT(groupa, m_num_groups);
304  ASSERT_LT(groupb, m_num_groups);
305  return m_edge_groups.at({groupa, groupb});
306  }
307 
308  /**
309  * Returns the collection of SFrames containing all the edges
310  * between vertex group groupa and vertex group groupb.
311  */
312  inline const std::vector<sframe>& edge_group(size_t groupa,
313  size_t groupb) const {
314  ASSERT_LT(groupa, m_num_groups);
315  ASSERT_LT(groupb, m_num_groups);
316  return m_edge_groups.at({groupa, groupb});
317  }
318 
319 
320  /**
321  * Returns the SFrame containing all the vertices in a given partition
322  * of a group groupid
323  *
324  * This function can be used as the left hand side of an assignment.
325  * i.e.
326  * \code
327  * vertex_partition(part, group) = sframe
328  * \endcode
329  */
330  inline sframe& vertex_partition(size_t partition,
331  size_t groupid = 0) {
332  ASSERT_LT(partition, m_num_partitions);
333  return vertex_group(groupid)[partition];
334  }
335 
336 
337  /**
338  * Returns the SFrame containing all the vertices in a given partition
339  * of a group groupid
340  */
341  inline const sframe& vertex_partition(size_t partition,
342  size_t groupid = 0) const {
343  ASSERT_LT(partition, m_num_partitions);
344  return vertex_group(groupid)[partition];
345  }
346 
347 
348  /**
349  * Returns the SFrame containing all the vertices in a given partition
350  * of a group groupid
351  *
352  * This function can be used as the left hand side of an assignment.
353  * i.e.
354  * \code
355  * vertex_partition(part, group) = sframe
356  * \endcode
357  */
358  inline sframe& vertex_partition(vertex_partition_address part) {
359  return vertex_partition(part.partition, part.group);
360  }
361 
362 
363  /**
364  * Returns the SFrame containing all the vertices in a given partition
365  * of a group groupid
366  */
367  inline const sframe& vertex_partition(vertex_partition_address part) const {
368  return vertex_partition(part.partition, part.group);
369  }
370 
371  /**
372  * Returns the SFrame containing all edges in the partition (partition1,
373  * partition2) between vertex group groupa and vertex group groupb.
374  *
375  * This function can be used as the left hand side of an assignment.
376  * i.e.
377  * \code
378  * edge_partition(part1, part2) = sframe
379  * \endcode
380  */
381  inline sframe& edge_partition(size_t partition1,
382  size_t partition2,
383  size_t groupa = 0,
384  size_t groupb = 0) {
385  ASSERT_LT(partition1, m_num_partitions);
386  ASSERT_LT(partition2, m_num_partitions);
387  return edge_group(groupa, groupb)[partition1 * m_num_partitions + partition2];
388  }
389 
390 
391  /**
392  * Returns the SFrame containing all edges in the partition (partition1,
393  * partition2) between vertex group groupa and vertex group groupb.
394  */
395  inline const sframe& edge_partition(size_t partition1,
396  size_t partition2,
397  size_t groupa = 0,
398  size_t groupb = 0) const {
399  ASSERT_LT(partition1, m_num_partitions);
400  ASSERT_LT(partition2, m_num_partitions);
401  return edge_group(groupa, groupb)[partition1 * m_num_partitions + partition2];
402  }
403 
404 
405  /**
406  * Returns the SFrame containing all edges in the partition (partition1,
407  * partition2) between vertex group groupa and vertex group groupb.
408  *
409  * This function can be used as the left hand side of an assignment.
410  * i.e.
411  * \code
412  * edge_partition(part1, part2) = sframe
413  * \endcode
414  */
415  inline sframe& edge_partition(edge_partition_address address) {
416  return edge_partition(address.partition1, address.partition2,
417  address.src_group, address.dst_group);
418  }
419 
420 
421  /**
422  * Returns the SFrame containing all edges in the partition (partition1,
423  * partition2) between vertex group groupa and vertex group groupb.
424  */
425  inline const sframe& edge_partition(edge_partition_address address) const {
426  return edge_partition(address.partition1, address.partition2,
427  address.src_group, address.dst_group);
428  }
429 
430 
431  /**
432  * Returns the name of the vertex group given the group id. Assertion failure
433  * if the group does not exist.
434  */
435  inline std::string get_vertex_group_name(size_t idx) const {
436  ASSERT_LT(idx, m_vertex_group_names.size());
437  return m_vertex_group_names[idx];
438  }
439 
440  /**
441  * Returns the id of the vertex group given the group name. Returns
442  * (size_t)(-1) on failure.
443  */
444  inline size_t get_vertex_group_id(std::string name) const {
445  auto iter = std::find(m_vertex_group_names.begin(), m_vertex_group_names.end(), name);
446  if (iter != m_vertex_group_names.end()) return std::distance(m_vertex_group_names.begin(), iter);
447  else return (size_t)(-1);
448  }
449 
450  /**
451  * Returns number of edges from groupa to groupb.
452  */
453  inline size_t num_edges(size_t groupa, size_t groupb) const {
454  size_t ret = 0;
455  auto& egroup = edge_group(groupa, groupb);
456  for (auto& sf : egroup) {
457  ret += sf.size();
458  }
459  return ret;
460  };
461 
462  /**
463  * Returns the total number of edges.
464  */
465  inline size_t num_edges() const { return m_num_edges; };
466 
467  /**
468  * Returns the number of vertices in the group.
469  */
470  inline size_t num_vertices(size_t group) const {
471  size_t ret = 0;
472  auto& vgroup = vertex_group(group);
473  for (auto& sf : vgroup) {
474  ret += sf.size();
475  }
476  return ret;
477  };
478 
479  /**
480  * Returns the total number of vertices.
481  */
482  inline size_t num_vertices() const { return m_num_vertices; }
483 
484  /**
485  * Returns true if the graph is empty.
486  */
487  inline bool empty() const { return (m_num_vertices == 0) && (m_num_edges == 0);}
488 
489  /**
490  * Returns the number of vertex partitions
491  */
492  inline size_t get_num_partitions() const {
493  return m_num_partitions;
494  }
495 
496  /**
497  * Returns the number of vertex groups
498  */
499  inline size_t get_num_groups() const {
500  return m_num_groups;
501  }
502 
503  inline flex_type_enum vertex_id_type() const { return m_vid_type; }
504 
505 /**************************************************************************/
506 /* */
507 /* Serialization */
508 /* */
509 /**************************************************************************/
510  /**
511  * \Internal
512  *
513  * Save to a directory oarchive.
514  */
515  void save(oarchive& oarc) const;
516 
517  /**
518  * \Internal
519  *
520  * Save to directory oarchive using sframe save reference.
521  *
522  * \see sframe_save_weak_reference
523  */
524  void save_reference(oarchive& oarc) const;
525 
526  /**
527  * \Internal
528  *
529  * Load from a directory oarchive.
530  */
531  void load(iarchive& iarc);
532 
533 /**************************************************************************/
534 /* */
535 /* Unity Related Operations */
536 /* */
537 /**************************************************************************/
538  void add_vertex_field(std::shared_ptr<sarray<flexible_type>> data, std::string field);
539 
540  void add_edge_field(std::shared_ptr<sarray<flexible_type>> data, std::string field);
541 
542  void swap_vertex_fields(const std::string& field1, const std::string& field2);
543 
544  void swap_edge_fields(const std::string& field1, const std::string& field2);
545 
546  void rename_vertex_fields(const std::vector<std::string>& oldnames,
547  const std::vector<std::string>& newnames);
548 
549  void rename_edge_fields(const std::vector<std::string>& oldnames,
550  const std::vector<std::string>& newnames);
551 
552 /**************************************************************************/
553 /* */
554 /* Compute Related Operations */
555 /* */
556 /**************************************************************************/
557  /**
558  * Replaces a particular column in all partitions of a particular group of
559  * vertices. The column must exist. Returns true on success.
560  */
561  bool replace_vertex_field(const std::vector<std::shared_ptr<sarray<flexible_type>>>& column,
562  std::string column_name,
563  size_t group = 0);
564 
565  /**
566  * Same as \ref replace_vertex_field, but all values are in memory
567  * The column must exist. Assertion failure otherwise.
568  */
569  template<typename T, typename FLEX_TYPE=T>
570  bool replace_vertex_field(std::vector<std::vector<T>>& column,
571  std::string column_name,
572  size_t group = 0);
573 
574  /**
575  * Replaces a particular edge in all partitions of a particular group of
576  * edges. The column must exist. Returns true on success.
577  */
578  bool replace_edge_field(const std::vector<std::shared_ptr<sarray<flexible_type>>>& column,
579  std::string column_name,
580  size_t groupa = 0, size_t groupb = 0);
581 
582  /**
583  * Add a particular column in all partitions of a particular group of
584  * vertices. The column must not exist. Returns true on success.
585  */
586  bool add_vertex_field(const std::vector<std::shared_ptr<sarray<flexible_type>>>& column,
587  std::string column_name,
588  size_t group = 0);
589 
590  /**
591  * Same as \ref add_vertex_field, but all values are in memory
592  * The column must exist. Assertion failure otherwise.
593  */
594  template<typename T, typename FLEX_TYPE=T>
595  bool add_vertex_field(std::vector<std::vector<T>>& column,
596  std::string column_name,
597  flex_type_enum column_type,
598  size_t group = 0);
599 
600  /**
601  * Add a particular edge in all partitions of a particular group of
602  * edges. The column must not exist. Returns true on success.
603  */
604  bool add_edge_field(const std::vector<std::shared_ptr<sarray<flexible_type>>>& column,
605  std::string column_name,
606  size_t groupa = 0, size_t groupb = 0);
607 
608 
609  /**
610  * Extracts the data for a particular field of a group of vertices.
611  * The column must exist. Assertion failure otherwise.
612  */
613  std::vector<std::shared_ptr<sarray<flexible_type>>>
614  fetch_vertex_data_field(std::string column_name, size_t group = 0) const;
615 
616 
617  /**
618  * Same as \ref fetch_vertex_data_field, but store all values in memory
619  * and return std::vector<std::vector<flexible_type>>
620  * The column must exist. Assertion failure otherwise.
621  */
622  std::vector<std::vector<flexible_type>>
623  fetch_vertex_data_field_in_memory(std::string column_name, size_t groupid = 0) const;
624 
625  /**
626  * Extracts the data for a particular field of a group of edges.
627  * The column must exist. Assertion failure otherwise.
628  */
629  std::vector<std::shared_ptr<sarray<flexible_type>>>
630  fetch_edge_data_field(std::string column_name, size_t groupa = 0, size_t groupb = 0) const;
631 
632 
633  /**
634  * Gets the offset of the vertex field. Throws an exception on failure.
635  */
636  size_t get_vertex_field_id(std::string column_name, size_t group = 0) const;
637 
638 
639  /**
640  * Gets the offset of the edge field. Throws an exception on failure.
641  */
642  size_t get_edge_field_id(std::string column_name, size_t groupa = 0, size_t groupb = 0);
643 
644  private:
645 
646  /**
647  * Clears the graph, and reinitializes it with a given number of partitions
648  * and only 1 group (the default group)
649  */
650  void init(size_t num_partitions);
651 
652  /**
653  * Initialize the vertex id type.
654  * Reset vertex/edge sframes to have the proper id column types.
655  */
656  void bootstrap_vertex_id_type(flex_type_enum id_type);
657 
658  /**
659  * Add new groups up to num_groups - 1. Initialize the vertex partitions
660  * and edge partitions.
661  * num_groups must be strictly greater than the current number of groups.
662  */
663  void increase_number_of_groups(size_t num_groups);
664 
665  /**
666  * Merges the insertion buffer for a vertex partition of a particular group
667  * into the main vertex store.
668  *
669  * \param edge_buffer vector of length m_num_partition^2.
670  */
671  void commit_edge_buffer(size_t groupa,
672  size_t groupb,
673  sframe edge_buffer);
674 
675  /**
676  * Merges the vertex data of the group with the vertex data buffer.
677  *
678  * \param vertex_buffer vector of length m_num_partition.
679  */
680  void commit_vertex_buffer(size_t group,
681  std::vector<sframe>& vertex_buffer);
682 
683  /**
684  * Helper function to merge a single vertex partition.
685  */
686  sframe merge_vertex_partition(sframe& current_vdata, sframe& new_vdata);
687 
688  /**
689  * Return the vertex partition number for given vertex id.
690  */
691  inline size_t get_vertex_partition(const flexible_type& vid) { return vid.hash() % m_num_partitions; }
692 
693  /**
694  * Return the edge partition number for an edge.
695  */
696  inline size_t get_edge_partition(const flexible_type& src, const flexible_type& dst) {
697  return get_vertex_partition(src) * m_num_partitions + get_vertex_partition(dst);
698  }
699 
700  /**
701  * Returns a vector of vertex ids in the given partition and vertex group.
702  */
703  inline std::vector<flexible_type> get_vertex_ids(size_t partition, size_t group) const {
704  std::vector<flexible_type> ret;
705  const sframe& sf = vertex_partition(partition, group);
706  auto id_column = sf.select_column(VID_COLUMN_NAME);
707  ret.reserve(id_column->size());
708  copy(*id_column, std::inserter(ret, ret.begin()));
709  return ret;
710  }
711 
712  /**
713  * Returns true if this field name begins with __
714  */
715  static inline bool is_private_field(std::string s) {
716  return s.length() > 2 && s[0] == '_' && s[1] == '_';
717  }
718 
719  /**
720  * A list of all the group names. The 0th group (the default group)
721  * is always the group name "default".
722  */
723  std::vector<std::string> m_vertex_group_names;
724 
725  /**
726  * Number of SFrames each vertex group is cut up into.
727  */
728  size_t m_num_partitions = 0;
729 
730  /**
731  * The number of groups
732  */
733  size_t m_num_groups = 1;
734 
735  /**
736  * Cached graph statistics: number of vertices.
737  */
738  size_t m_num_vertices = 0;
739 
740  /**
741  * Cached graph statistics: number of edges.
742  */
743  size_t m_num_edges = 0;
744 
745  /**
746  * The vertex id type.
747  */
749 
750  /**
751  * An array of the same length as vertex_group_names.
752  * Each vertex group is represented as an array of sframes.
753  */
754  std::vector<std::vector<sframe> > m_vertex_groups;
755 
756  /**
757  * A map from (group, group) to an edge group.
758  * Each edge group is represented as an array of sframes.
759  * Only the "upper triangle" of the group pair is defined. i.e.
760  * to find all edges between group a and group b, it is in
761  * m_edge_groups({min(a,b),max(a,b)})
762  */
763  std::map<std::pair<size_t, size_t>, std::vector<sframe> > m_edge_groups;
764 
765  /**************************************************************************/
766  /* */
767  /* Helper Functions */
768  /* */
769  /**************************************************************************/
770  private:
771  /**
772  * Adjust the columns in sf to be the order of column_names.
773  * All columns in sf must exist in column_names, and the types must match column_types.
774  * For columns in column_names that are not in sf, add a dummy column filled
775  * with flexible_undefined values.
776  */
777  static bool reorder_and_add_new_columns(sframe& sf,
778  const std::vector<std::string>& column_names,
779  const std::vector<flex_type_enum>& column_types);
780 
781  /**
782  * Union the column name and column types of two sframes.
783  * By the end of the function, two sframes should have
784  * the same column names and types, in the same order.
785  *
786  * The intersection of the column set must have the same type.
787  *
788  * \code
789  * (column_names, column_types)= (sframe_a.column_names(), sframe_a.column_types())
790  * for (col in sframe_b.columns() but not in sframe_a.columns()) {
791  * column_names.push_back(col.name);
792  * column_types.push_back(col.type);
793  * }
794  * reorder_and_add_new_columns(sframe_a, column_names, column_types);
795  * reorder_and_add_new_columns(sframe_b, column_names, column_types);
796  * \endcode
797  *
798  * Return true if union succeeds.
799  */
800  static bool union_columns(sframe& sframe_a, sframe& sframe_b);
801 
802  typedef google::sparse_hash_map<flexible_type, size_t, std::hash<flexible_type> > vid_hash_map_type;
803 
804  /**
805  * Fetch the vertex id to row id lookup table.
806  */
807  std::shared_ptr<vid_hash_map_type> fetch_vid_hash_map(size_t partition, size_t group);
808 
809  /**
810  * Initialize an empty sframe with column names and types.
811  */
812  static inline void init_empty_sframe(sframe& sf,
813  std::vector<std::string> column_names = {},
814  std::vector<flex_type_enum> column_types = {}) {
815  sframe new_sf;
816  new_sf.open_for_write(column_names, column_types, "", 1 /*one segment*/);
817  new_sf.close();
818  sf = new_sf;
819  }
820 
821  /**
822  * Do a quick validation of the add vertices operation
823  * Throws a string on failure.
824  */
825  void fast_validate_add_vertices(const sframe& vertices,
826  size_t group);
827 
828  /**
829  * Do a quick validation of the add edges operation
830  * Throws a string on failure.
831  */
832  void fast_validate_add_edges(const sframe& edges,
833  size_t groupa, size_t groupb);
834 };
835 
836 
837 template<typename T, typename FLEX_TYPE>
838 bool sgraph::add_vertex_field(std::vector<std::vector<T>>& column,
839  std::string column_name,
840  flex_type_enum column_type,
841  size_t groupid) {
842  auto vfields = get_vertex_fields();
843  if (std::count(vfields.begin(), vfields.end(), column_name) != 0) {
844  logstream(LOG_ERROR) << "Vertex field already exists." << std::endl;
845  return false;
846  }
847  auto& vgroups = vertex_group(groupid);
848  if (vgroups.size() != column.size()) {
849  logstream(LOG_ERROR) << "Partition Size Mismatch." << std::endl;
850  return false;
851  }
852  parallel_for (0, vgroups.size(), [&](size_t i) {
853  auto sa = std::make_shared<sarray<flexible_type>>();
854  sa->open_for_write(1);
855  sa->set_type(column_type);
856  auto writer = sa->get_output_iterator(0);
857  for (auto& v: column[i])
858  *writer++ = std::move((FLEX_TYPE)(v));
859  sa->close();
860  vgroups[i] = vgroups[i].add_column(sa, column_name);
861  });
862  return true;
863 }
864 
865 template<typename T, typename FLEX_TYPE>
866 bool sgraph::replace_vertex_field(std::vector<std::vector<T>>& column,
867  std::string column_name,
868  size_t groupid) {
869 
870  auto vfields = get_vertex_fields();
871  if (std::count(vfields.begin(), vfields.end(), column_name) == 0) {
872  logstream(LOG_ERROR) << "Vertex field not found." << std::endl;
873  return false;
874  }
875  auto& vgroups = vertex_group(groupid);
876  if (vgroups.size() != column.size()) {
877  logstream(LOG_ERROR) << "Partition Size Mismatch." << std::endl;
878  return false;
879  }
880 
881  auto column_type = get_vertex_field_types()[get_vertex_field_id(column_name)];
882 
883  parallel_for (0, vgroups.size(), [&](size_t i) {
884  auto sa = std::make_shared<sarray<flexible_type>>();
885  sa->open_for_write(1);
886  sa->set_type(column_type);
887  auto writer = sa->get_output_iterator(0);
888  for (auto& v: column[i])
889  *writer++ = std::move((FLEX_TYPE)(v));
890  sa->close();
891  vgroups[i] = vgroups[i].replace_column(sa, column_name);
892  });
893  return true;
894 }
895 
896 /// \}
897 } // namespace turi
898 #endif
size_t num_edges(size_t groupa, size_t groupb) const
Definition: sgraph.hpp:453
#define logstream(lvl)
Definition: logger.hpp:276
void parallel_for(size_t begin, size_t end, const FunctionType &fn)
Definition: lambda_omp.hpp:93
const sframe & vertex_partition(vertex_partition_address part) const
Definition: sgraph.hpp:367
The serialization input archive object which, provided with a reference to an istream, will read from the istream, providing deserialization capabilities.
Definition: iarchive.hpp:60
sframe group(sframe sframe_in, std::string key_column)
size_t get_num_groups() const
Definition: sgraph.hpp:499
bool init_edge_field(const std::string &field, const flexible_type &init_value, size_t groupa=0, size_t groupb=0)
size_t num_vertices() const
Definition: sgraph.hpp:482
std::string get_vertex_group_name(size_t idx) const
Definition: sgraph.hpp:435
bool empty() const
Definition: sgraph.hpp:487
size_t num_edges() const
Definition: sgraph.hpp:465
std::vector< sframe > & vertex_group(size_t groupid=0)
Definition: sgraph.hpp:274
bool remove_edge_field(const std::string &field, size_t groupa=0, size_t groupb=0)
bool copy_edge_field(const std::string &field, const std::string &new_field, size_t groupa=0, size_t groupb=0)
const sframe & edge_partition(size_t partition1, size_t partition2, size_t groupa=0, size_t groupb=0) const
Definition: sgraph.hpp:395
size_t get_num_partitions() const
Definition: sgraph.hpp:492
bool copy_vertex_field(const std::string &field, const std::string &new_field, size_t group=0)
#define LOG_ERROR
Definition: logger.hpp:97
bool select_edge_fields(const std::vector< std::string > &fields, size_t groupa=0, size_t groupb=0)
std::shared_ptr< sarray< flexible_type > > select_column(size_t column_id) const
sframe get_edges(const std::vector< flexible_type > &source_vids={}, const std::vector< flexible_type > &target_vids={}, const options_map_t &field_constraint=options_map_t(), size_t groupa=0, size_t groupb=0) const
size_t SGRAPH_DEFAULT_NUM_PARTITIONS
size_t num_vertices(size_t group) const
Definition: sgraph.hpp:470
void open_for_write(const std::vector< std::string > &column_names, const std::vector< flex_type_enum > &column_types, const std::string &frame_sidx_file="", size_t nsegments=SFRAME_DEFAULT_NUM_SEGMENTS, bool fail_on_column_names=true)
Definition: sframe.hpp:265
size_t get_vertex_field_id(std::string column_name, size_t group=0) const
sframe & edge_partition(size_t partition1, size_t partition2, size_t groupa=0, size_t groupb=0)
Definition: sgraph.hpp:381
bool init_vertex_field(const std::string &field, const flexible_type &init_value, size_t group=0)
const sframe & vertex_partition(size_t partition, size_t groupid=0) const
Definition: sgraph.hpp:341
void load(iarchive &iarc)
const std::vector< sframe > & edge_group(size_t groupa, size_t groupb) const
Definition: sgraph.hpp:312
void save(oarchive &oarc) const
void save_reference(oarchive &oarc) const
sframe & vertex_partition(vertex_partition_address part)
Definition: sgraph.hpp:358
std::vector< std::shared_ptr< sarray< flexible_type > > > fetch_edge_data_field(std::string column_name, size_t groupa=0, size_t groupb=0) const
void copy(Iterator begin, Iterator end, SWriter &&writer)
Definition: algorithm.hpp:416
bool remove_vertex_field(const std::string &field, size_t group=0)
bool select_vertex_fields(const std::vector< std::string > &fields, size_t group=0)
size_t get_vertex_group_id(std::string name) const
Definition: sgraph.hpp:444
bool add_vertices(const dataframe_t &vertices, const std::string &id_field_name, size_t group=0)
const std::vector< sframe > & vertex_group(size_t groupid=0) const
Definition: sgraph.hpp:283
The serialization output archive object which, provided with a reference to an ostream, will write to the ostream, providing serialization capabilities.
Definition: oarchive.hpp:80
bool replace_vertex_field(const std::vector< std::shared_ptr< sarray< flexible_type >>> &column, std::string column_name, size_t group=0)
std::vector< std::vector< flexible_type > > fetch_vertex_data_field_in_memory(std::string column_name, size_t groupid=0) const
std::vector< sframe > & edge_group(size_t groupa=0, size_t groupb=0)
Definition: sgraph.hpp:301
size_t get_edge_field_id(std::string column_name, size_t groupa=0, size_t groupb=0)
sframe get_vertices(const std::vector< flexible_type > &vid_vec={}, const options_map_t &field_constraint=options_map_t(), size_t groupid=0) const
std::vector< std::shared_ptr< sarray< flexible_type > > > fetch_vertex_data_field(std::string column_name, size_t group=0) const
bool replace_edge_field(const std::vector< std::shared_ptr< sarray< flexible_type >>> &column, std::string column_name, size_t groupa=0, size_t groupb=0)
const sframe & edge_partition(edge_partition_address address) const
Definition: sgraph.hpp:425
bool add_edges(const dataframe_t &edges, const std::string &source_field_name, const std::string &target_field_name, size_t groupa=0, size_t groupb=0)
sframe & vertex_partition(size_t partition, size_t groupid=0)
Definition: sgraph.hpp:330
std::vector< std::string > get_vertex_fields(size_t groupid=0) const
sframe & edge_partition(edge_partition_address address)
Definition: sgraph.hpp:415
std::vector< flex_type_enum > get_edge_field_types(size_t groupa=0, size_t groupb=0) const
std::vector< flex_type_enum > get_vertex_field_types(size_t groupid=0) const
std::vector< std::string > get_edge_fields(size_t groupa=0, size_t groupb=0) const