Turi Create  4.0
turi::recsys::recsys_itemcf Class Reference (abstract)

#include <toolkits/recsys/models/itemcf.hpp>

Public Member Functions

void init_options (const std::map< std::string, flexible_type > &options) override
 
std::map< std::string, flexible_type > train (const v2::ml_data &data) override
 
sframe predict (const v2::ml_data &test_data) const override
 
void score_all_items (std::vector< std::pair< size_t, double > > &scores, const std::vector< v2::ml_data_entry > &query_row, size_t top_k, const std::vector< std::pair< size_t, double > > &user_item_list, const std::vector< std::pair< size_t, double > > &new_user_item_data, const std::vector< v2::ml_data_row_reference > &new_observation_data, const std::shared_ptr< v2::ml_data_side_features > &known_side_features) const override
 
std::string response_column_name () const
 
void internal_save (turi::oarchive &oarc) const override
 
void internal_load (turi::iarchive &iarc, size_t version) override
 
sframe get_similar_items (std::shared_ptr< sarray< flexible_type > > items, size_t topk=0) const override
 
sframe get_similar_users (std::shared_ptr< sarray< flexible_type > > items, size_t topk=0) const override
 
virtual void get_item_similarity_scores (size_t item, std::vector< std::pair< size_t, double > > &sim_scores) const
 
virtual sframe get_item_intersection_info (const sframe &unindexed_item_pairs) const
 
v2::ml_data create_ml_data (const sframe &data, const sframe &new_user_side_data=sframe(), const sframe &new_item_side_data=sframe()) const
 
flex_type_enum user_type () const
 
flex_type_enum item_type () const
 
void setup_and_train (const sframe &observation_data, const sframe &user_side_data=sframe(), const sframe &item_side_data=sframe(), const std::map< std::string, variant_type > &other_data=(std::map< std::string, variant_type >()))
 
void import_all_from_other_model (const recsys_model_base *other)
 
std::shared_ptr< recsys_model_base > get_popularity_baseline () const
 
sframe recommend (const sframe &reference_data, size_t top_k, const sframe &restriction_data=sframe(), const sframe &exclusion_data=sframe(), const sframe &new_observation_data=sframe(), const sframe &new_user_data=sframe(), const sframe &new_item_data=sframe(), bool exclude_training_interactions=true, double diversity_factor=0, size_t random_seed=0) const
 
sframe precision_recall_stats (const sframe &indexed_validation_data, const sframe &recommend_output, const std::vector< size_t > &cutoffs) const
 
sframe get_num_items_per_user () const
 
sframe get_num_users_per_item () const
 
size_t get_version () const override
 
virtual void save_impl (turi::oarchive &oarc) const override
 Serialization – save.
 
void load_version (turi::iarchive &iarc, size_t version) override
 Serialization – load.
 
std::map< std::string, flexible_type > get_train_stats ()
 Get stats about algorithm runtime.
 
std::vector< std::string > list_fields ()
 
const variant_type & get_value_from_state (std::string key)
 
const std::map< std::string, flexible_type > & get_current_options () const
 
std::map< std::string, flexible_type > get_default_options () const
 
const flexible_type & get_option_value (const std::string &name) const
 
const std::map< std::string, variant_type > & get_state () const
 
bool is_trained () const
 
void set_options (const std::map< std::string, flexible_type > &_options)
 
void add_or_update_state (const std::map< std::string, variant_type > &dict)
 
const std::vector< option_handling::option_info > & get_option_info () const
 
virtual const char * name ()=0
 
virtual const std::string & uid ()=0
 
void save_to_url (const std::string &url, const variant_map_type &side_data={})
 
void save_model_to_data (std::ostream &out)
 
const std::map< std::string, std::vector< std::string > > & list_functions ()
 
const std::vector< std::string > & list_get_properties ()
 
const std::vector< std::string > & list_set_properties ()
 
variant_type call_function (const std::string &function, variant_map_type argument)
 
variant_type get_property (const std::string &property)
 
variant_type set_property (const std::string &property, variant_map_type argument)
 
const std::string & get_docstring (const std::string &symbol)
 
virtual void perform_registration ()
 

Static Public Attributes

static constexpr size_t USER_COLUMN_INDEX = 0
 The metadata needed for translating the data back and forth.
 

Protected Member Functions

virtual std::map< std::string, flexible_type > train (const v2::ml_data &training_data_by_user, const v2::ml_data &training_data_by_item)
 
template<typename GetSimilarFunction >
sframe _create_similar_sframe (size_t column_index, std::shared_ptr< sarray< flexible_type > > items, size_t k, GetSimilarFunction &&similar) const
 
void register_defaults (const std::string &fnname, const variant_map_type &arguments)
 
void register_function (std::string fnname, const std::vector< std::string > &arguments, impl_fn fn)
 
void register_setter (const std::string &propname, impl_fn setfn)
 
void register_getter (const std::string &propname, impl_fn getfn)
 
void register_docstring (const std::pair< std::string, std::string > &fnname_docstring)
 

Protected Attributes

std::shared_ptr< sparse_similarity_lookup > item_sim
 
std::map< std::string, variant_type > state
 

Detailed Description

This provides an implementation of a collaborative filtering algorithm. The premise is to compute similarities (or distances) between all pairs of items. Several choices of similarity will be available, and these are functions of the list of users that were observed with the pair of items. Some choices of similarity can also leverage a score that the item was given by the user, e.g. a rating.

In the following, let u(a) be the set of users who rated item a, let E be the set of all (user, item) pairs, and let $ r_{u,i} $ be the rating that user $ u $ gave to item $ i $.

The three functions currently implemented are: Jaccard similarity: Let $ u(a) = \{k : (k,a) \in E\} $. Then Jaccard similarity is defined by:

\[J(a,b) = \frac{| u(a) \cap u(b) |}{ | u(a) \cup u(b) |} \]

Cosine similarity: This compares the ratings given by all users who rated both items (where all unobserved ratings $ r_{ua} $ are considered to be 0).

\[ d(a,b) = \frac{\sum_{k} r_{ka} * r_{kb}} {\sqrt{\sum_{k} r_{ka}^2} \sqrt{\sum_{k} r_{kb}^2}} \]

Pearson Correlation similarity: A problem with the Cosine similarity measure is that it does not consider the differences in the mean and variance of the ratings of items a and b. Pearson Correlation is a popular measure where the effects of mean and variance have been removed. Let $ u(a,b) = \{k : (k,a) \in E \text{ and } (k,b) \in E\} $ denote the set of users who rated both items a and b.

\[ d(a,b) = \frac{\sum_{k \in u(a,b)} (r_{ka} - \bar{r}_a) * (r_{kb} - \bar{r}_b)} {\sqrt{\sum_{k \in u(a,b)} (r_{ka} - \bar{r}_a)^2} \sqrt{\sum_{k \in u(a,b)} (r_{kb} - \bar{r}_b)^2}} \]


Implementation details:

  • Jaccard is implemented using two sufficient statistics:
    1. C(i): The number of times item i was rated.
    2. C(i,j): The number of times item i and j were rated by the same user.
    The final similarity is computed by: $ C(i,j)/(C(i) + C(j) - C(i,j)) $
  • Cosine similarity is implemented using two statistics.
    1. C(i): The sum of squared ratings given to item i.
    2. C(i,j): The sum of products of the ratings for all users who rated both i and j.
    The distance is computed as: $ d(i,j) = \frac{C(i,j)}{\sqrt{C(i)C(j)}} $
  • Pearson Correlation similarity is implemented using three statistics.
    1. C(i) : the variance of ratings given to item i.
    2. C(i,j) : the sum of the correlation scores over all users who rated both items i and j.
    The final similarity is computed by: $ d(i,j) = \frac{C(i,j)}{\sqrt{C(i)C(j)}} $

Details of computing item similarities:

  1. Get the individual statistics of items C(i)
  2. For each pair of items i and j that were both rated by user u, update the overlapping statistics C(i,j).
  3. Get the final score matrix by normalizing C(i,j) with individual statistics
  4. Sort each row of the score matrix to get top-k similar items

Definition at line 96 of file itemcf.hpp.

Member Function Documentation

◆ _create_similar_sframe()

template<typename GetSimilarFunction >
sframe turi::recsys::recsys_model_base::_create_similar_sframe ( size_t  column_index,
std::shared_ptr< sarray< flexible_type > >  items,
size_t  k,
GetSimilarFunction &&  similar 
) const
protectedinherited

Utility function to aid in the retrieval of similar items.

GetSimilarFunction is a function called as f(size_t idx, std::vector<std::pair<size_t, double> >& idx_dist_dest);

Definition at line 476 of file recsys_model_base.hpp.

◆ add_or_update_state()

void turi::ml_model_base::add_or_update_state ( const std::map< std::string, variant_type > &  dict)
inherited

Append the key value store of the model.

Parameters
[in] dict  Options (Key-Value pairs) to set

◆ call_function()

variant_type turi::model_base::call_function ( const std::string &  function,
variant_map_type  argument 
)
inherited

Calls a user defined function.

◆ create_ml_data()

v2::ml_data turi::recsys::recsys_model_base::create_ml_data ( const sframe &  data,
const sframe &  new_user_side_data = sframe(),
const sframe &  new_item_side_data = sframe() 
) const
inherited

Creates an ml_data object according to the given schema. No target column.

◆ get_current_options()

const std::map<std::string, flexible_type>& turi::ml_model_base::get_current_options ( ) const
inherited

Get current options.

Returns
Dictionary containing current options.

Python side interface

Interfaces with the get_current_options function in the Python side.

◆ get_default_options()

std::map<std::string, flexible_type> turi::ml_model_base::get_default_options ( ) const
inherited

Get default options.

Returns
Dictionary with default options.

Python side interface

Interfaces with the get_default_options function in the Python side.

◆ get_docstring()

const std::string& turi::model_base::get_docstring ( const std::string &  symbol)
inherited

Returns the toolkit documentation for a function or property.

◆ get_item_intersection_info()

virtual sframe turi::recsys::recsys_model_base::get_item_intersection_info ( const sframe &  unindexed_item_pairs) const
virtualinherited

Returns information about all the users in the overlap of the item pairs listed in two columns in unindexed_item_pairs. All these items must be present in the training data.

Returns an sframe with information about this intersection. Columns are item_1, item_2, num_users_1, num_users_2, item_intersection (dict, user ->

◆ get_item_similarity_scores()

virtual void turi::recsys::recsys_model_base::get_item_similarity_scores ( size_t  item,
std::vector< std::pair< size_t, double > > &  sim_scores 
) const
inlinevirtualinherited

For each of the items in sim_scores (first part of tuple), sets a similarity score (second part of tuple) that is higher for items similar to item.

Reimplemented in turi::recsys::recsys_factorization_model_base.

Definition at line 107 of file recsys_model_base.hpp.

◆ get_num_items_per_user()

sframe turi::recsys::recsys_model_base::get_num_items_per_user ( ) const
inherited

Return an SFrame containing each user id and the number of observations with that user in the training set.

◆ get_num_users_per_item()

sframe turi::recsys::recsys_model_base::get_num_users_per_item ( ) const
inherited

Return an SFrame containing each item and the number of observations with that item in the training set.

◆ get_option_info()

const std::vector<option_handling::option_info>& turi::ml_model_base::get_option_info ( ) const
inherited

Returns the option information struct for each of the set parameters.

◆ get_option_value()

const flexible_type& turi::ml_model_base::get_option_value ( const std::string &  name) const
inherited

Returns the value of an option. Throws an error if the option does not exist.

Parameters
[in] name  Name of the option to get.

◆ get_popularity_baseline()

std::shared_ptr<recsys_model_base> turi::recsys::recsys_model_base::get_popularity_baseline ( ) const
inherited

Creates and returns a popularity baseline

◆ get_property()

variant_type turi::model_base::get_property ( const std::string &  property)
inherited

Reads a property.

◆ get_similar_items()

sframe turi::recsys::recsys_itemcf::get_similar_items ( std::shared_ptr< sarray< flexible_type > >  items,
size_t  topk = 0 
) const
overridevirtual

Get the nearest neighbors of a set of items.

Parameters
[in] items  An SArray of items in flexible_type
[in] topk  Number of neighbors returned for each item
Returns
An SFrame with columns {"item", "similar", "score", "rank"}

Implements turi::recsys::recsys_model_base.

◆ get_similar_users()

sframe turi::recsys::recsys_itemcf::get_similar_users ( std::shared_ptr< sarray< flexible_type > >  items,
size_t  topk = 0 
) const
inlineoverridevirtual

Get the nearest neighbors of a set of users.

Parameters
[in] items  An SArray of users in flexible_type
[in] topk  Number of neighbors returned for each user
Returns
An SFrame with columns {"user", "similar", "score", "rank"}

Implements turi::recsys::recsys_model_base.

Definition at line 212 of file itemcf.hpp.

◆ get_state()

const std::map<std::string, variant_type>& turi::ml_model_base::get_state ( ) const
inherited

Get model.

Returns
Model map.

◆ get_value_from_state()

const variant_type& turi::ml_model_base::get_value_from_state ( std::string  key)
inherited

Returns the value of a particular key from the state.

Returns
Value of a key. See model_base for details.

Python side interface

From the python side, this is interfaced with the get() function or the [] operator in python.

◆ get_version()

size_t turi::recsys::recsys_model_base::get_version ( ) const
inlineoverridevirtualinherited

Returns the current version of the toolkit class for this instance, for serialization purposes.

Reimplemented from turi::model_base.

Definition at line 380 of file recsys_model_base.hpp.

◆ import_all_from_other_model()

void turi::recsys::recsys_model_base::import_all_from_other_model ( const recsys_model_base *  other)
inherited

Some of the models, such as popularity, can be built entirely from data already contained in the model. This method allows us to create a new model while bypassing the typical setup_and_train method. This simply imports all the relevant variables over; the final training is left up to the model.

◆ init_options()

void turi::recsys::recsys_itemcf::init_options ( const std::map< std::string, flexible_type > &  _options)
overridevirtual

Set one of the options in the algorithm. Use the option manager to set these options. If the option does not satisfy the conditions that the option manager has imposed on it, errors will be thrown.

Parameters
[in] options  Options to set

Reimplemented from turi::ml_model_base.

◆ internal_load()

void turi::recsys::recsys_itemcf::internal_load ( turi::iarchive &  iarc,
size_t  version 
)
overridevirtual

Implement serialization (load). The model subclass should reimplement this particular function. The syntax follows the standard turicreate load() method.

When this method is called, all the model options have been set up in the base class and are readily accessible. Furthermore, once this function is called, the model is treated as trained and ready to be used for prediction and ranking. Thus loading a model can effectively replace the training stage.

Implements turi::recsys::recsys_model_base.

◆ internal_save()

void turi::recsys::recsys_itemcf::internal_save ( turi::oarchive &  oarc) const
overridevirtual

Implement serialization (save). The model subclass should reimplement this particular function. The syntax follows the standard turicreate save() method.

Implements turi::recsys::recsys_model_base.

◆ is_trained()

bool turi::ml_model_base::is_trained ( ) const
inherited

Is this model trained.

Returns
True if already trained.

◆ item_type()

flex_type_enum turi::recsys::recsys_model_base::item_type ( ) const
inlineinherited

Returns the flexible data type of the item column; The model must be trained at this point.

Definition at line 216 of file recsys_model_base.hpp.

◆ list_fields()

std::vector<std::string> turi::ml_model_base::list_fields ( )
inherited

Methods with already meaningful default implementations.

Lists all the keys accessible in the "model" map.

Returns
List of keys in the model map. See model_base for details.

Python side interface

This is the function that the list_fields should call in python.

◆ list_functions()

const std::map<std::string, std::vector<std::string> >& turi::model_base::list_functions ( )
inherited

Lists all the registered functions. Returns a map of function name to array of argument names for the function.

◆ list_get_properties()

const std::vector<std::string>& turi::model_base::list_get_properties ( )
inherited

Lists all the get-table properties of the class.

◆ list_set_properties()

const std::vector<std::string>& turi::model_base::list_set_properties ( )
inherited

Lists all the set-table properties of the class.

◆ name()

virtual const char* turi::model_base::name ( )
pure virtualinherited

Returns the name of the toolkit class, as exposed to client code. For example, the Python proxy for this instance will have a type with this name.

Note: this function is typically overridden using the BEGIN_CLASS_MEMBER_REGISTRATION macro.

◆ perform_registration()

virtual void turi::model_base::perform_registration ( )
virtualinherited

Declare the base registration function. This class has to be handled specially; the macros don't work here due to the override declarations.

Reimplemented in turi::model_proxy.

◆ precision_recall_stats()

sframe turi::recsys::recsys_model_base::precision_recall_stats ( const sframe &  indexed_validation_data,
const sframe &  recommend_output,
const std::vector< size_t > &  cutoffs 
) const
inherited

Compute the precision and recall for a (potentially held out) set of observations.

Parameters
indexed_validation_data  An ml_data giving the validation set the precision and recall should be calculated on.
recommend_output  The output of the recommend method. Note that recommend should be called with top_k larger than the max value in cutoffs.
cutoffs  A vector of cutoffs for computing e.g. the top [5,10,50] rankings.
Returns
An sframe with 5 columns – user, cutoff, precision, recall, and item counts.

◆ predict()

sframe turi::recsys::recsys_itemcf::predict ( const v2::ml_data &  test_data) const
overridevirtual

During the predict phase, we perform the "vector matrix product" where we compute a score for a particular (user, item) pair. This score is a sum of similarities between an item and all the items observed for the given user. For similarity functions that incorporate some target value for each (user, item) pair, this prediction also multiplies each similarity by that value, e.g. a rating they gave the item in question.

Implements turi::recsys::recsys_model_base.

◆ recommend()

sframe turi::recsys::recsys_model_base::recommend ( const sframe &  reference_data,
size_t  top_k,
const sframe &  restriction_data = sframe(),
const sframe &  exclusion_data = sframe(),
const sframe &  new_observation_data = sframe(),
const sframe &  new_user_data = sframe(),
const sframe &  new_item_data = sframe(),
bool  exclude_training_interactions = true,
double  diversity_factor = 0,
size_t  random_seed = 0 
) const
inherited

Return the top_k ranks for this model based on sorted predictions.

Here, for each user in users, the top_k ranks are returned in the same format as the previous function.

If exclude_observations is given, these observations are excluded from the returned values.

This is an overloaded member function, provided for convenience. It differs from the above function only in what argument(s) it accepts.

◆ register_defaults()

void turi::model_base::register_defaults ( const std::string &  fnname,
const variant_map_type &  arguments 
)
protectedinherited

Registers default argument values

◆ register_docstring()

void turi::model_base::register_docstring ( const std::pair< std::string, std::string > &  fnname_docstring)
protectedinherited

Adds a docstring for the specified function or property name.

◆ register_function()

void turi::model_base::register_function ( std::string  fnname,
const std::vector< std::string > &  arguments,
impl_fn  fn 
)
protectedinherited

Adds a function with the specified name, and argument list.

◆ register_getter()

void turi::model_base::register_getter ( const std::string &  propname,
impl_fn  getfn 
)
protectedinherited

Adds a property getter with the specified name.

◆ register_setter()

void turi::model_base::register_setter ( const std::string &  propname,
impl_fn  setfn 
)
protectedinherited

Adds a property setter with the specified name.

◆ response_column_name()

std::string turi::recsys::recsys_itemcf::response_column_name ( ) const

Utilities

◆ save_model_to_data()

void turi::model_base::save_model_to_data ( std::ostream &  out)
inherited

Save a toolkit class to a data stream.

◆ save_to_url()

void turi::model_base::save_to_url ( const std::string &  url,
const variant_map_type &  side_data = {} 
)
inherited

Save a toolkit class to disk.

Parameters
url  The destination url to store the class.
side_data  Any additional side information

◆ score_all_items()

void turi::recsys::recsys_itemcf::score_all_items ( std::vector< std::pair< size_t, double > > &  scores,
const std::vector< v2::ml_data_entry > &  query_row,
size_t  top_k,
const std::vector< std::pair< size_t, double > > &  user_item_list,
const std::vector< std::pair< size_t, double > > &  new_user_item_data,
const std::vector< v2::ml_data_row_reference > &  new_observation_data,
const std::shared_ptr< v2::ml_data_side_features > &  known_side_features 
) const
overridevirtual

For a given base observation, predict the score for all the items with all non-item columns replaced by the values in the base observation.

The base_observation vector is used to generate all the observations predicted. New observations are generated by repeatedly copying template_observation, then replacing the values in item_column_index by each possible item value.

Implements turi::recsys::recsys_model_base.

◆ set_options()

void turi::ml_model_base::set_options ( const std::map< std::string, flexible_type > &  _options)
inherited

Set one of the options in the algorithm.

The value are checked with the requirements given by the option instance.

Parameters
[in] name  Name of the option.
[in] value  Value for the option.

◆ set_property()

variant_type turi::model_base::set_property ( const std::string &  property,
variant_map_type  argument 
)
inherited

Sets a property. The new value of the property should appear in the argument map under the key "value".

◆ setup_and_train()

void turi::recsys::recsys_model_base::setup_and_train ( const sframe &  observation_data,
const sframe &  user_side_data = sframe(),
const sframe &  item_side_data = sframe(),
const std::map< std::string, variant_type > &  other_data = (std::map< std::string, variant_type >()) 
)
inherited

Train the model using an sframe as the primary observations. This method constructs the internal ml_data objects from the current options.

Parameters
observation_data  An SFrame containing at least a column containing user ids and a column containing item ids.
user_side_data  An SFrame containing side information about users, where one column matches with the user column of observation data.
item_side_data  An SFrame containing side information about items, where one column matches with the item column of observation data.
other_data  When provided, each model can implement a method set_extra_data in order to use this argument during training.
Returns
Statistics about the training.

◆ train() [1/2]

virtual std::map<std::string, flexible_type> turi::recsys::recsys_model_base::train ( const v2::ml_data &  training_data_by_user,
const v2::ml_data &  training_data_by_item 
)
inlineprotectedvirtualinherited

Takes two datasets for training.

Parameters
[in] training_data_by_user  ML-Data sorted by user
[in] training_data_by_item  ML-Data sorted by item

Reimplemented in turi::recsys::recsys_factorization_model_base.

Definition at line 68 of file recsys_model_base.hpp.

◆ train() [2/2]

std::map<std::string, flexible_type> turi::recsys::recsys_itemcf::train ( const v2::ml_data &  data)
overridevirtual

When the number of items is less than 20k, it uses in memory computations train_in_memory(). Otherwise, it uses the implementation based on SGraph train_using_sgraph().

Implements turi::recsys::recsys_model_base.

◆ uid()

virtual const std::string& turi::model_base::uid ( )
pure virtualinherited

Returns a unique identifier for the toolkit class. It can be any unique ID. The UID is only used at runtime (to determine the concrete type of an arbitrary model_base instance) and is never stored.

Note: this function is typically overridden using the BEGIN_CLASS_MEMBER_REGISTRATION macro.

Implemented in turi::model_proxy.

◆ user_type()

flex_type_enum turi::recsys::recsys_model_base::user_type ( ) const
inlineinherited

Returns the flexible data type of the user column; The model must be trained at this point.

Definition at line 209 of file recsys_model_base.hpp.

Member Data Documentation

◆ item_sim

std::shared_ptr<sparse_similarity_lookup> turi::recsys::recsys_itemcf::item_sim
protected

The primary tool for the item similarity modeling part.

Definition at line 184 of file itemcf.hpp.

◆ state

std::map<std::string, variant_type> turi::ml_model_base::state
protectedinherited

All things python

Definition at line 206 of file ml_model.hpp.


The documentation for this class was generated from the following file: