Source code for coremltools.models.tree_ensemble

# Copyright (c) 2017, Apple Inc. All rights reserved.
#
# Use of this source code is governed by a BSD-3-clause license that can be
# found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause

"""
Tree ensemble builder class to construct CoreML models.
"""
import collections as _collections

from .. import SPECIFICATION_VERSION as _SPECIFICATION_VERSION
from ..proto import Model_pb2 as _Model_pb2
from ..proto import TreeEnsemble_pb2 as _TreeEnsemble_pb2
from ._interface_management import (set_classifier_interface_params,
                                    set_regressor_interface_params)


class TreeEnsembleBase:
    """
    Base class for the tree ensemble builder class.  This should be instantiated
    either through the :py:class:`TreeEnsembleRegressor` or
    :py:class:`TreeEnsembleClassifier` classes.

    The methods below read ``self.tree_spec`` and ``self.tree_parameters``,
    which this base class does not create; the subclasses in this module
    assign them in their constructors.
    """

    def __init__(self):
        """
        High level Python API to build a tree ensemble model for Core ML.
        """
        # Set inputs and outputs
        spec = _Model_pb2.Model()
        spec.specificationVersion = _SPECIFICATION_VERSION

        # Save the spec in the protobuf
        self.spec = spec

    def set_default_prediction_value(self, values):
        """
        Set the default prediction value(s).

        The values given here form the base prediction value that the values
        at activated leaves are added to.  If values is a scalar, then
        the output of the tree must also be 1 dimensional; otherwise, values
        must be a list with length matching the dimension of values in the tree.

        Each call replaces any default values set by an earlier call, so the
        stored values always agree with the recorded number of prediction
        dimensions.

        Parameters
        ----------
        values: [int | double | list[double] | tuple[double]]
            Default values for predictions.  A scalar is converted with
            ``float()``; a list or tuple is stored element by element.
        """
        if not isinstance(values, (list, tuple)):
            values = [float(values)]

        params = self.tree_parameters
        params.numPredictionDimensions = len(values)

        # Drop values left over from a previous call; appending to them would
        # leave more base values than ``numPredictionDimensions`` reports.
        del params.basePredictionValue[:]
        params.basePredictionValue.extend(values)

    def set_post_evaluation_transform(self, value):
        r"""
        Set the post processing transform applied after the prediction value
        from the tree ensemble.

        Parameters
        ----------

        value: str

            A value denoting the transform applied.  Possible values are:

            - ``"NoTransform"`` (default).  Do not apply a transform.

            - ``"Classification_SoftMax"``.

              Apply a softmax function to the outcome to produce normalized,
              non-negative scores that sum to 1.  The transformation applied to
              dimension `i` is equivalent to:

                .. math::

                    \frac{e^{x_i}}{\sum_j e^{x_j}}

              Note: This is the output transformation applied by the XGBoost
              package with multiclass classification.

            - ``"Regression_Logistic"``.

              Applies a logistic transform to the predicted value, specifically:

                .. math::

                    (1 + e^{-v})^{-1}

              This is the transformation used in binary classification.
        """
        # The string is resolved to its enum number by name.
        self.tree_spec.postEvaluationTransform = (
            _TreeEnsemble_pb2.TreeEnsemblePostEvaluationTransform.Value(value)
        )

    def add_branch_node(
        self,
        tree_id,
        node_id,
        feature_index,
        feature_value,
        branch_mode,
        true_child_id,
        false_child_id,
        relative_hit_rate=None,
        missing_value_tracks_true_child=False,
    ):
        """
        Add a branch node to the tree ensemble.

        Parameters
        ----------
        tree_id: int
            ID of the tree to add the node to.

        node_id: int
            ID of the node within the tree.

        feature_index: int
            Index of the feature in the input being split on.

        feature_value: double or int
            The value used in the feature comparison determining the traversal
            direction from this node.

        branch_mode: str
            Branch mode of the node, specifying the condition under which the
            node referenced by ``true_child_id`` is called next.

            Must be one of the following:

            - ``"BranchOnValueLessThanEqual"``. Traverse to node
              ``true_child_id`` if ``input[feature_index] <= feature_value``,
              and ``false_child_id`` otherwise.

            - ``"BranchOnValueLessThan"``. Traverse to node ``true_child_id``
              if ``input[feature_index] < feature_value``, and
              ``false_child_id`` otherwise.

            - ``"BranchOnValueGreaterThanEqual"``. Traverse to node
              ``true_child_id`` if ``input[feature_index] >= feature_value``,
              and ``false_child_id`` otherwise.

            - ``"BranchOnValueGreaterThan"``. Traverse to node
              ``true_child_id`` if ``input[feature_index] > feature_value``,
              and ``false_child_id`` otherwise.

            - ``"BranchOnValueEqual"``. Traverse to node ``true_child_id`` if
              ``input[feature_index] == feature_value``, and
              ``false_child_id`` otherwise.

            - ``"BranchOnValueNotEqual"``. Traverse to node ``true_child_id``
              if ``input[feature_index] != feature_value``, and
              ``false_child_id`` otherwise.

        true_child_id: int
            ID of the child under the true condition of the split.  An error
            will be raised at model validation if this does not match the
            ``node_id`` of a node instantiated by ``add_branch_node`` or
            ``add_leaf_node`` within this ``tree_id``.

        false_child_id: int
            ID of the child under the false condition of the split.  An error
            will be raised at model validation if this does not match the
            ``node_id`` of a node instantiated by ``add_branch_node`` or
            ``add_leaf_node`` within this ``tree_id``.

        relative_hit_rate: float [optional]
            When the model is compiled by Core ML, this gives hints to Core ML
            about which node is more likely to be hit on evaluation, allowing
            for additional optimizations. The values can be on any scale, with
            the values between child nodes being compared relative to each
            other.

        missing_value_tracks_true_child: bool [optional]
            If the training data contains NaN values or missing values, then
            this flag determines which direction a NaN value traverses.
        """
        spec_node = self.tree_parameters.nodes.add()
        spec_node.treeId = tree_id
        spec_node.nodeId = node_id
        spec_node.branchFeatureIndex = int(feature_index)
        spec_node.branchFeatureValue = feature_value
        spec_node.trueChildNodeId = true_child_id
        spec_node.falseChildNodeId = false_child_id
        spec_node.nodeBehavior = (
            _TreeEnsemble_pb2.TreeEnsembleParameters.TreeNode.TreeNodeBehavior.Value(
                branch_mode
            )
        )

        # Only written when supplied, so the field otherwise stays unset.
        if relative_hit_rate is not None:
            spec_node.relativeHitRate = relative_hit_rate
        spec_node.missingValueTracksTrueChild = missing_value_tracks_true_child

    def add_leaf_node(self, tree_id, node_id, values, relative_hit_rate=None):
        """
        Add a leaf node to the tree ensemble.

        Parameters
        ----------
        tree_id: int
            ID of the tree to add the node to.

        node_id: int
            ID of the node within the tree.

        values: [float | int | list | dict]
            Value(s) at the leaf node to add to the prediction when this node
            is activated.  If the prediction dimension of the tree is 1, then
            the value is specified as a float or integer value.

            For multidimensional predictions, the values can be a list of
            numbers with length matching the dimension of the predictions or a
            dictionary mapping index to value added to that dimension.

            Note that the dimension of any tree must match the dimension given
            when :py:meth:`set_default_prediction_value` is called.

        relative_hit_rate: float [optional]
            Relative hit rate recorded on the node.  It is only written to the
            node when a value other than ``None`` is given.
        """
        spec_node = self.tree_parameters.nodes.add()
        spec_node.treeId = tree_id
        spec_node.nodeId = node_id
        spec_node.nodeBehavior = (
            _TreeEnsemble_pb2.TreeEnsembleParameters.TreeNode.TreeNodeBehavior.Value(
                "LeafNode"
            )
        )

        # A bare scalar is treated as a one-element list of values.
        if not isinstance(values, _collections.abc.Iterable):
            values = [values]

        if relative_hit_rate is not None:
            spec_node.relativeHitRate = relative_hit_rate

        # Any mapping type derived from dict (OrderedDict, defaultdict, ...)
        # supplies explicit index -> value pairs; other iterables are indexed
        # by position.
        if isinstance(values, dict):
            indexed_values = values.items()
        else:
            indexed_values = enumerate(values)

        for index, value in indexed_values:
            ev_info = spec_node.evaluationInfo.add()
            ev_info.evaluationIndex = index
            ev_info.evaluationValue = float(value)
class TreeEnsembleRegressor(TreeEnsembleBase):
    """
    Tree Ensemble builder class to construct a Tree Ensemble regression model.

    A ``TreeEnsembleRegressor`` is built up step by step: branch and leaf nodes
    are added one at a time through the builder methods, and together they
    specify the behavior of the model.

    Examples
    --------
    In the following example, the code saves the model to disk, which is a
    recommended practice but not required.

    .. sourcecode:: python

        >>> # Required inputs
        >>> import coremltools
        >>> from coremltools.models import datatypes
        >>> from coremltools.models.tree_ensemble import TreeEnsembleRegressor
        >>> import numpy as np

        >>> # Define input features
        >>> input_features = [("a", datatypes.Array(3)), ("b", (datatypes.Double()))]

        >>> # Define output_features
        >>> output_features = [("predicted_values", datatypes.Double())]

        >>> tm = TreeEnsembleRegressor(features = input_features, target = output_features)

        >>> # Split on a[2] <= 3
        >>> tm.add_branch_node(0, 0, 2, 3, "BranchOnValueLessThanEqual", 1, 2)

        >>> # Add leaf to the true branch of node 0 that subtracts 1.
        >>> tm.add_leaf_node(0, 1, -1)

        >>> # Add split on b == 0 to the false branch of node 0, which is index 3
        >>> tm.add_branch_node(0, 2, 3, 0, "BranchOnValueEqual", 3, 4)

        >>> # Add leaf to the true branch of node 2 that adds 1 to the result.
        >>> tm.add_leaf_node(0, 3, 1)

        >>> # Add leaf to the false branch of node 2 that subtracts 1 from the result.
        >>> tm.add_leaf_node(0, 4, -1)

        >>> tm.set_default_prediction_value([0, 0])

        >>> # save the model to a .mlmodel file
        >>> model_path = './tree.mlmodel'
        >>> coremltools.models.utils.save_spec(tm.spec, model_path)

        >>> # load the .mlmodel
        >>> mlmodel = coremltools.models.MLModel(model_path)

        >>> # make predictions
        >>> test_input = {
        >>>     'a': np.array([0, 1, 2]).astype(np.float32),
        >>>     "b": 3.0,
        >>> }
        >>> predictions = mlmodel.predict(test_input)
    """

    def __init__(self, features, target):
        """
        Create a Tree Ensemble regression model that takes one or more input
        features and maps them to an output feature.

        Parameters
        ----------
        features: [list of features]
            Name(s) of the input features, given as a list of
            ``('name', datatype)`` tuples.  The features are one of
            ``models.datatypes.Int64``, ``datatypes.Double``, or
            ``models.datatypes.Array``.  Feature indices in the nodes are
            counted sequentially from 0 through the features.

        target:
            Name of the target feature predicted.
        """
        super().__init__()

        # Fill in the regressor input/output interface, then keep handles to
        # the regressor message and its nested tree-ensemble parameters for
        # the builder methods to write into.
        configured_spec = set_regressor_interface_params(self.spec, features, target)
        self.tree_spec = configured_spec.treeEnsembleRegressor
        self.tree_parameters = self.tree_spec.treeEnsemble
class TreeEnsembleClassifier(TreeEnsembleBase):
    """
    Tree Ensemble builder class to construct a Tree Ensemble classification model.

    A ``TreeEnsembleClassifier`` is built up step by step: branch and leaf
    nodes are added one at a time through the builder methods, and together
    they specify the behavior of the model.

    Examples
    --------
    In the following example, the code saves the model to disk, which is a
    recommended practice but not required.

    .. sourcecode:: python

        >>> input_features = [("a", datatypes.Array(3)), ("b", datatypes.Double())]

        >>> tm = TreeEnsembleClassifier(features = input_features, class_labels = [0, 1],
                                        output_features = "predicted_class")

        >>> # Split on a[2] <= 3
        >>> tm.add_branch_node(0, 0, 2, 3, "BranchOnValueLessThanEqual", 1, 2)

        >>> # Add leaf to the true branch of node 0 that subtracts 1.
        >>> tm.add_leaf_node(0, 1, -1)

        >>> # Add split on b == 0 to the false branch of node 0.
        >>> tm.add_branch_node(0, 2, 3, 0, "BranchOnValueEqual", 3, 4)

        >>> # Add leaf to the true branch of node 2 that adds 1 to the result.
        >>> tm.add_leaf_node(0, 3, 1)

        >>> # Add leaf to the false branch of node 2 that subtracts 1 from the result.
        >>> tm.add_leaf_node(0, 4, -1)

        >>> # Put in a softmax transform to translate these into probabilities.
        >>> tm.set_post_evaluation_transform("Classification_SoftMax")

        >>> tm.set_default_prediction_value([0, 0])

        >>> # save the model to a .mlmodel file
        >>> model_path = './tree.mlmodel'
        >>> coremltools.models.utils.save_spec(tm.spec, model_path)

        >>> # load the .mlmodel
        >>> mlmodel = coremltools.models.MLModel(model_path)

        >>> # make predictions
        >>> test_input = {
        >>>     'a': np.array([0, 1, 2]).astype(np.float32),
        >>>     "b": 3.0,
        >>> }
        >>> predictions = mlmodel.predict(test_input)
    """

    def __init__(self, features, class_labels, output_features):
        """
        Create a tree ensemble classifier model.

        Parameters
        ----------
        features: [list of features]
            Name(s) of the input features, given as a list of
            ``('name', datatype)`` tuples.  The features are one of
            ``models.datatypes.Int64``, ``datatypes.Double``, or
            ``models.datatypes.Array``.  Feature indices in the nodes are
            counted sequentially from 0 through the features.

        class_labels: [list]
            A list of string or integer class labels to use in making
            predictions.  The length of this must match the dimension of the
            tree model.

        output_features: [list]
            A string or a list of two strings specifying the names of the two
            output features, the first being a class label corresponding to
            the class with the highest predicted score, and the second being a
            dictionary mapping each class to its score.  If
            ``output_features`` is a string, it specifies the predicted class
            label and the class scores is set to the default value of
            ``"classProbability"``.
        """
        super().__init__()

        # Fill in the classifier input/output interface, then keep handles to
        # the classifier message and its nested tree-ensemble parameters for
        # the builder methods to write into.
        configured_spec = set_classifier_interface_params(
            self.spec,
            features,
            class_labels,
            "treeEnsembleClassifier",
            output_features,
        )
        self.tree_spec = configured_spec.treeEnsembleClassifier
        self.tree_parameters = self.tree_spec.treeEnsemble